In [1]:
import pandas as pd
import re
import string
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [3]:
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [4]:
from nltk import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words

In [5]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.models import KeyedVectors

In [6]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import spacy

In [7]:
import joblib

In [8]:
from collections import Counter

In [9]:
pio.renderers.default = "notebook"

## Read Data

In [11]:
data = pd.read_csv("data/training.16m.tweet.csv", encoding="ISO-8859-1", names=["label", "id", "date", "query", "user", "tweet"])


In [12]:
data.sample(5)

Unnamed: 0,label,id,date,query,user,tweet
327176,0,2009030982,Tue Jun 02 15:02:26 PDT 2009,NO_QUERY,shep689,I'm always sad when i leave the gym.
396001,0,2056250825,Sat Jun 06 10:48:09 PDT 2009,NO_QUERY,JessieKaitlin,About to eat a late lunch. I don't feel rested
762315,0,2297787624,Tue Jun 23 10:52:24 PDT 2009,NO_QUERY,Sally_Ena,Boo I hate going to the doctor
140255,0,1880995606,Fri May 22 03:14:12 PDT 2009,NO_QUERY,SorenLorensen,@belle_lulu Would do as am not at work but can...
709804,0,2257728138,Sat Jun 20 14:47:53 PDT 2009,NO_QUERY,1Sjes,Blood: the Last Vampire 2009 live-action is re...


In [13]:
data.describe()

Unnamed: 0,label,id
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   label   1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   query   1600000 non-null  object
 4   user    1600000 non-null  object
 5   tweet   1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


### Clean tweets: remove @username or https website from text.

In [10]:
def clean_tweets_user(text):
    REGx_tweet = r"@\S+|https?:\S+|http?:\S"
    text = re.sub(REGx_tweet, "", str(text).lower()).strip()
    return "".join(text)


#### remove hashtag

In [None]:
def remove_hashtag(text) -> str:
    REGx_hastag = r"#[A-Za-z0-9_]+"
    text = re.sub(REGx_hastag, " ", str(text).lower()).strip()
    return text

### Clean punctuation: remove punctuation from text.

In [11]:
def clean_punctuation(text):
    clean_text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    return "".join(clean_text.strip())


### Clean number from text.

In [12]:
def clean_numbers(text):
    clean_text = re.sub('\w*\d\w*', "", text)
    return "".join(clean_text)
    

### Clean stop words.

In [13]:
stop_words = stopwords.words('english')
stop_words.remove("not")
def clean_stopwords(text):
    wordList = word_tokenize(text)
    clean_text = []
    for word in wordList:
            if word in stop_words:
                continue
            else:
                clean_text.append(word+" ")
    return "".join(clean_text)

### Stemmers & Lemmatized .

In [14]:
snowBallStemmer = SnowballStemmer("english")
def tweet_stemmers(text):
    wordList = word_tokenize(text)
    stemWords = [snowBallStemmer.stem(word+" ") for word in wordList]
    return "".join(stemWords)

In [15]:
%%time
nlp = spacy.load("en_core_web_lg",disable = ['tagger','perser','ner'])
def tweet_lemmatized(text) -> str:
    doc = nlp(text)
    clean_text =[str(word.lemma_) if word.lemma_ != "-PRON-" else str(word) for word in doc]
    return " ".join(clean_text)

CPU times: user 3.95 s, sys: 1.09 s, total: 5.04 s
Wall time: 7.49 s


### Remove emojies

In [16]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r" ", text)

#### remove arabic word 

In [None]:
def remove_arabic(text) -> str:
    arabic_compile = re.compile(r"[\u0600-\u06FF]+",flags=re.UNICODE )
    clean_text = arabic_compile.sub(r" ", text)
    clean_text = clean_text.strip()
    clean_text = clean_text.encode("ascii", "ignore")
    return str(clean_text.decode())

### Encode Text

In [17]:
def encode_text(text):
    clean_text = text.strip()
    clean_text = clean_text.encode("ascii", "ignore")
    return str(clean_text.decode())


## Pre Process Text

In [22]:
%%time
data["tweet"] = data["tweet"].apply(lambda text: clean_tweets_user(text))

CPU times: user 13 s, sys: 181 ms, total: 13.2 s
Wall time: 13.2 s


In [23]:
%%time
data["tweet"] = data["tweet"].apply(lambda text: tweet_lemmatized(text))

CPU times: user 1h 12min 43s, sys: 7.63 s, total: 1h 12min 51s
Wall time: 1h 12min 52s


In [24]:
%%time
data["tweet"] = data["tweet"].apply(lambda text: tweet_stemmers(text))

CPU times: user 8min 12s, sys: 525 ms, total: 8min 13s
Wall time: 8min 13s


In [25]:
%%time
data["tweet"] = data["tweet"].apply(lambda text: clean_punctuation(text))

CPU times: user 9.51 s, sys: 998 µs, total: 9.51 s
Wall time: 9.51 s


In [26]:
%%time
data["tweet"] = data["tweet"].apply(lambda text: clean_stopwords(text))

CPU times: user 3min 24s, sys: 18 ms, total: 3min 24s
Wall time: 3min 24s


In [27]:
%%time
data["tweet"] = data["tweet"].apply(lambda text: clean_numbers(text))

CPU times: user 11.9 s, sys: 1.98 ms, total: 11.9 s
Wall time: 11.9 s


In [28]:
%%time
data["tweet"] = data["tweet"].apply(lambda text: encode_text(text))

CPU times: user 1.09 s, sys: 1.01 ms, total: 1.09 s
Wall time: 1.09 s


## Gensim Model
#####   

#### Cpu Cores count 

In [30]:
import multiprocessing
cores = multiprocessing.cpu_count()
cores

8

#### TaggedDocument

In [31]:
%%time
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data["tweet"])]

CPU times: user 2min 31s, sys: 640 ms, total: 2min 32s
Wall time: 2min 32s


In [32]:
tagged_data_DBOW, tagged_data_DM = train_test_split(tagged_data, test_size=0.5,random_state=42)

#### Train Model

#### PV-DBOW model

In [33]:
%%time
model_dbow = gensim.models.doc2vec.Doc2Vec(tagged_data_DBOW ,vector_size=300, negative=10, workers=(cores-1),
                                           window=5,alpha=0.025, min_alpha=0.0001, seed=1, min_count=5,
                                           sample=0.001, epochs=65,hs=0,dm=0, ns_exponent=0.75)

CPU times: user 1h 2min 40s, sys: 8min 56s, total: 1h 11min 37s
Wall time: 40min 58s


In [34]:
%%time
model_dbow.build_vocab(tagged_data_DBOW, update=True)

CPU times: user 3.34 s, sys: 6.04 ms, total: 3.35 s
Wall time: 3.35 s


In [35]:
%%time
model_dbow.train(tagged_data_DBOW, total_examples=len(tagged_data_DBOW), epochs=65)

CPU times: user 1h 44min 51s, sys: 18min 57s, total: 2h 3min 48s
Wall time: 1h 12min 4s


In [36]:
model_dbow_file = "models/DBOW_model/"

In [37]:
%%time
model_dbow.save(model_dbow_file+"model.model")

CPU times: user 5.91 s, sys: 2.01 s, total: 7.92 s
Wall time: 9.19 s


In [38]:
model_dbow.wv.save_word2vec_format(model_dbow_file+"model_format/model.bin")

In [39]:
model_dbow.wv.save_word2vec_format(model_dbow_file+"model_format/model.csv", binary=False)

In [40]:
model_dbow.wv.save_word2vec_format(model_dbow_file+"model_format/model.txt", binary=False)

#### DM Model

In [41]:
%%time
model_dm = gensim.models.doc2vec.Doc2Vec(tagged_data_DM ,vector_size=300, negative=10, workers=(cores-1),
                                         window=5,alpha=0.025, min_alpha=0.0001, seed=1, min_count=5,
                                         sample=0.001, epochs=65,hs=0, dm=1, ns_exponent=0.75)

CPU times: user 3h 42min 39s, sys: 54min 14s, total: 4h 36min 53s
Wall time: 2h 59min 15s


In [42]:
%%time
model_dm.build_vocab(tagged_data_DM, update=True)

CPU times: user 3.83 s, sys: 0 ns, total: 3.83 s
Wall time: 3.84 s


In [43]:
%%time
model_dm.train(tagged_data_DM, total_examples=len(tagged_data_DM), epochs=65)

CPU times: user 1h 47min 36s, sys: 26min 38s, total: 2h 14min 15s
Wall time: 1h 28min 16s


In [44]:
model_dm_file = "models/DM_model/"

In [45]:
%%time
model_dm.save(model_dm_file+"model.model")

CPU times: user 6.08 s, sys: 2.41 s, total: 8.49 s
Wall time: 9.07 s


In [47]:
model_dm.wv.save_word2vec_format(model_dm_file+"model_format/model.bin")

In [48]:
model_dm.wv.save_word2vec_format(model_dm_file+"model_format/model.csv", binary=False)

In [49]:
model_dm.wv.save_word2vec_format(model_dm_file+"model_format/model.txt", binary=False)

### Load Models

In [18]:
doc2vec_dm_model = gensim.models.doc2vec.Doc2Vec.load("models/DM_model/model.model")

In [19]:
doc2vec_dm_model.wv.most_similar("text")

[('phone', 0.7605751156806946),
 ('tweet', 0.7427206039428711),
 ('twitter', 0.7217567563056946),
 ('talk', 0.6946784257888794),
 ('sorry', 0.6919742822647095),
 ('okay', 0.6901240348815918),
 ('txt', 0.683121919631958),
 ('lol', 0.6818835735321045),
 ('not', 0.6811061501502991),
 ('know', 0.6714918613433838)]

In [20]:
len(doc2vec_dm_model.wv.vocab)

28729

In [21]:
doc2vec_dbow_model = gensim.models.doc2vec.Doc2Vec.load("models/DBOW_model/model.model")

In [22]:
doc2vec_dbow_model.wv.most_similar("great")

[('roll', 0.2352784126996994),
 ('signage', 0.22303415834903717),
 ('undies', 0.21808171272277832),
 ('sunbed', 0.21286363899707794),
 ('shoutout', 0.19729633629322052),
 ('upp', 0.19452157616615295),
 ('precal', 0.19269666075706482),
 ('rediscover', 0.19240522384643555),
 ('linus', 0.19108791649341583),
 ('bonnet', 0.1899510771036148)]

In [23]:
len(doc2vec_dbow_model.wv.vocab)

28681

### Tokenizer spacy

In [24]:
nlp = English()
tokenizer_ng = Tokenizer(nlp.vocab)

## Train data for ML 

In [25]:
train = pd.read_csv("data/data_tweet.csv")

In [26]:
train.sample(5)

Unnamed: 0.1,Unnamed: 0,tweet,polarity,subjectivity,sentiment
35407,30039,love tonight have interview two great guy hav...,0.266667,0.61,positive
12817,7449,allahsoil stone and human be different but to...,0.0,0.35,negative
12109,6741,scatter joy may we blessampkeep shine upon be...,0.633333,0.566667,positive
36571,31203,off to wembley stadium capitalstb readyt...,0.0,0.0,neutral
28004,22636,edc las vega we literally can not even omg ...,0.0,0.0,neutral


In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37330 entries, 0 to 37329
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    37330 non-null  int64  
 1   tweet         37328 non-null  object 
 2   polarity      37330 non-null  float64
 3   subjectivity  37330 non-null  float64
 4   sentiment     37330 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 1.4+ MB


In [28]:
train = train.drop("Unnamed: 0", axis=1)

In [29]:
train["tweet"] = train["tweet"].apply(lambda text: clean_tweets_user(text))

train["tweet"] = train["tweet"].apply(lambda text: remove_emoji(text))
train["tweet"] = train["tweet"].apply(lambda text: encode_text(text))

train["tweet"] = train["tweet"].apply(lambda text: tweet_lemmatized(text))
train["tweet"] = train["tweet"].apply(lambda text: tweet_stemmers(text))

train["tweet"] = train["tweet"].apply(lambda text: clean_numbers(text))
train["tweet"] = train["tweet"].apply(lambda text: clean_punctuation(text))
train["tweet"] = train["tweet"].apply(lambda text: clean_stopwords(text))

### Doc2Vec & ML Models

In [30]:
tagged_train_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(train["tweet"])]

In [31]:
tagged_data_train_DBOW, tagged_data_train_DM = train_test_split(tagged_train_data, test_size=0.5,random_state=42)

In [32]:
doc2vec_dbow_model.build_vocab(tagged_data_train_DBOW, update=True)

In [33]:
doc2vec_dm_model.build_vocab(tagged_data_train_DM, update=True)

In [34]:
doc2vec_dm_model.estimate_memory()

{'doctag_lookup': 161844200,
 'doctag_syn0': 971065200,
 'vocab': 14662500,
 'vectors': 35190000,
 'syn1neg': 35190000,
 'total': 1217951900}

In [35]:
doc2vec_dbow_model.estimate_memory()

{'doctag_lookup': 161855200,
 'doctag_syn0': 971131200,
 'vocab': 14648000,
 'vectors': 35155200,
 'syn1neg': 35155200,
 'total': 1217944800}

In [36]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([doc2vec_dbow_model, doc2vec_dm_model])

In [37]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
train[["label"]] = enc.fit_transform(train[["sentiment"]])

In [38]:
train_x, test_x = train_test_split(train, test_size=0.2, random_state=42)
print("Train size:", len(train_x))
print("Test size:", len(test_x))

Train size: 29864
Test size: 7466


In [39]:
len(doc2vec_dbow_model.wv.vocab)

29296

In [40]:
len(doc2vec_dm_model.wv.vocab)

29325

## Vector of word

In [41]:
%%time
train_a = []
for text in train_x['tweet']:
    train_a.append(new_model.infer_vector([str(word) for word in tokenizer_ng(text)]))   

CPU times: user 2min 19s, sys: 283 ms, total: 2min 19s
Wall time: 2min 19s


In [42]:
train_vec = pd.DataFrame(train_a)

In [43]:
%%time
test_a=[]
for text in test_x['tweet']:
    test_a.append(new_model.infer_vector([str(word) for word in tokenizer_ng(text)]))

CPU times: user 35.1 s, sys: 53.2 ms, total: 35.1 s
Wall time: 35.2 s


In [44]:
test_vec = pd.DataFrame(test_a)

In [45]:
y_train = train_x["label"]
y_test = test_x["label"]

### Scale data

In [46]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

In [47]:
train_scale = std.fit_transform(train_vec)
test_scale = std.fit_transform(test_vec)

In [48]:
train_norm = pd.DataFrame(train_scale) 
test_norm = pd.DataFrame(test_scale) 

### ML Model

In [49]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

In [50]:
%%time
ovr_sv = OneVsRestClassifier(SVC(kernel="rbf", C=20))
ovr_sv.fit(train_norm, y_train)
ovr_sv_accuracy = round(ovr_sv.score(test_norm, y_test) * 100, 3)
ovr_sv_accuracy

CPU times: user 1h 30min 29s, sys: 2.37 s, total: 1h 30min 32s
Wall time: 1h 30min 35s


82.681

### Fine tuniing

In [53]:
# Grid Search
param_grid = {"C": [10,20,30]}

In [54]:
%%time
svc_g = OneVsRestClassifier(SVC(kernel="rbf"))
grid_svc = GridSearchCV(estimator=svc_g, param_grid=param_grid, cv=5)

CPU times: user 0 ns, sys: 709 µs, total: 709 µs
Wall time: 1.32 ms


In [55]:
%%time
grid_svc.fit(train_norm, y_train)

CPU times: user 4h 9min 35s, sys: 2.89 s, total: 4h 9min 38s
Wall time: 4h 9min 45s


GridSearchCV(cv=5, estimator=SVC(class_weight='balanced', gamma='auto'),
             param_grid={'C': [10, 50, 100]})

In [56]:
%%time
print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_svc.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_svc.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_svc.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 SVC(C=10, class_weight='balanced', gamma='auto')

 The best score across ALL searched params:
 0.8112109034078223

 The best parameters across ALL searched params:
 {'C': 10}
CPU times: user 110 µs, sys: 1e+03 µs, total: 1.11 ms
Wall time: 697 µs


### Metric

In [50]:
loaded_model = joblib.load("models/ml_model/ml_model.sav")

In [51]:
svc_pre = ovr_sv.score(test_norm, y_test)

In [57]:
%%time
predictions = cross_val_predict(ovr_sv, test_norm, y_test, cv=15)
print(predictions)

[2. 1. 2. ... 1. 2. 1.]
CPU times: user 17min 14s, sys: 3.21 s, total: 17min 18s
Wall time: 17min 18s


In [57]:
print("Classification Report:\n", classification_report(y_test,predictions))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.71      0.60      0.65      1740
         1.0       0.70      0.71      0.71      1792
         2.0       0.80      0.85      0.83      3934

    accuracy                           0.76      7466
   macro avg       0.74      0.72      0.73      7466
weighted avg       0.76      0.76      0.76      7466



In [58]:
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Confusion Matrix:
 [[1043  214  483]
 [ 165 1276  351]
 [ 253  333 3348]]


### save ml model

In [59]:
filename = "models/ml_model/ml_model.sav"
joblib.dump(ovo_sv, filename)

['models/ml_model/ml_model.sav']

In [61]:
loaded_model = joblib.load(filename)
result = loaded_model.score(test_norm, y_test)
print(result)

0.8197160460755425
