# Sentiment Classifier - SVC

In [1]:
import datetime
import pandas as pd
import warnings
import sklearn.externals as extjoblib
import joblib
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides any warnings raised while fitting the models
# further down — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [2]:
# Single seed shared by the train/test split, the LinearSVC estimators and Word2Vec.
RANDOM_SEED = 42

In [3]:
# Load the pre-cleaned dataset and discard rows whose 'Text' is missing;
# the remaining 'Text' column is the model input.
df_proc = pd.read_csv('cleaned_non-vectorized_data.csv').dropna(subset=['Text'])
X = df_proc['Text']

In [4]:
# Display the loaded frame to inspect its text, token and emotion-label columns.
df_proc

Unnamed: 0,Text,tokens,disgust,joy,anger,surprised,sad,fear,neutral
0,come mert ’ today let u take care lunch enjoy ...,"['come', 'mert', '’', 'today', 'let', 'u', 'ta...",0,0,0,0,0,0,1
1,nxt gt lay 20 staff tech 's latest cutback rb_...,"['nxt', 'gt', 'lay', '20', 'staff', 'tech', ""'...",0,0,0,0,0,0,1
2,layoff 20 workforce 100 employee sf bay area,"['layoff', '20', 'workforce', '100', 'employee...",0,0,0,0,0,0,1
3,today ’ lunch special smoked pork sausage onio...,"['today', '’', 'lunch', 'special', 'smoked', '...",0,0,0,0,0,0,1
4,come mert ’ today grab salmon cake two home co...,"['come', 'mert', '’', 'today', 'grab', 'salmon...",0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
9277,traik01 cdc people warmed u 2 year ago .... sa...,"['traik01', 'cdc', 'people', 'warmed', 'u', '2...",0,0,0,0,1,0,0
9278,sorry ’ promo code share lately 😭 promos autom...,"['sorry', '’', 'promo', 'code', 'share', 'late...",0,0,0,0,1,0,0
9279,poor lad,"['poor', 'lad']",0,0,0,0,1,0,0
9280,one day able bill order tmobile bill sadly tod...,"['one', 'day', 'able', 'bill', 'order', 'tmobi...",0,0,0,0,1,0,0


In [5]:
# Multi-label target: one 0/1 indicator column per emotion.
y = df_proc[['disgust', 'joy', 'anger', 'surprised', 'sad', 'fear', 'neutral']]

In [6]:
# Display the label matrix to check the selected emotion columns.
y

Unnamed: 0,disgust,joy,anger,surprised,sad,fear,neutral
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
9277,0,0,0,0,1,0,0
9278,0,0,0,0,1,0,0
9279,0,0,0,0,1,0,0
9280,0,0,0,0,1,0,0


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

### Bag-Of-Words Approach

In [8]:
# TF-IDF features over unigrams and bigrams. These settings are only a starting
# point: the grid search further down re-tunes max_df, min_df, ngram_range and norm.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=50,    # drop terms seen in fewer than 50 documents
    max_df=0.75,  # drop terms seen in more than 75% of documents
    norm='l1',
    stop_words='english',
    lowercase=True,
)

In [9]:
# Base linear SVM, wrapped so that one independent copy is fitted per label column.
clf = LinearSVC(random_state=RANDOM_SEED)
multi_out_clf = MultiOutputClassifier(estimator=clf)

In [11]:
# Chain vectorisation and classification so both are refit together inside
# cross-validation. The step names become the prefixes of the grid-search keys.
pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('multi-classifier', multi_out_clf),
])

In [12]:
# List every tunable parameter name of the pipeline; these are the keys a
# GridSearchCV parameter grid may refer to.
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vectorizer', 'multi-classifier', 'vectorizer__analyzer', 'vectorizer__binary', 'vectorizer__decode_error', 'vectorizer__dtype', 'vectorizer__encoding', 'vectorizer__input', 'vectorizer__lowercase', 'vectorizer__max_df', 'vectorizer__max_features', 'vectorizer__min_df', 'vectorizer__ngram_range', 'vectorizer__norm', 'vectorizer__preprocessor', 'vectorizer__smooth_idf', 'vectorizer__stop_words', 'vectorizer__strip_accents', 'vectorizer__sublinear_tf', 'vectorizer__token_pattern', 'vectorizer__tokenizer', 'vectorizer__use_idf', 'vectorizer__vocabulary', 'multi-classifier__estimator__C', 'multi-classifier__estimator__class_weight', 'multi-classifier__estimator__dual', 'multi-classifier__estimator__fit_intercept', 'multi-classifier__estimator__intercept_scaling', 'multi-classifier__estimator__loss', 'multi-classifier__estimator__max_iter', 'multi-classifier__estimator__multi_class', 'multi-classifier__estimator__penalty', 'multi-classifier__estimato

In [13]:
# Search space for GridSearchCV. Keys follow the `<step>__<param>` naming of the
# pipeline; `multi-classifier__estimator__*` reaches the LinearSVC wrapped by
# MultiOutputClassifier. 3 * 3 * 2 * 2 * 2 * 3 = 216 candidate settings.
parameters = {
    "vectorizer__max_df": (0.5, 0.75, 1.0),
    "vectorizer__min_df": (50, 100, 200),
    "vectorizer__ngram_range": ((1, 1), (1, 2)), 
    "vectorizer__norm": ('l1', 'l2'),
    # NOTE(review): each wrapped LinearSVC is fitted on a single label column,
    # so confirm that searching over multi_class is useful — it doubles the grid.
    "multi-classifier__estimator__multi_class": ('ovr', 'crammer_singer'),
    'multi-classifier__estimator__C': (0.1, 0.5, 1.0), 
}


In [14]:
# Exhaustive search over `parameters` using all CPU cores (n_jobs=-1).
# No `cv` or `scoring` is given, so the GridSearchCV defaults apply: 5 folds
# (as the fit log reports) and ranking by the pipeline's own `score` method.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


GridSearchCV(estimator=Pipeline(steps=[('vectorizer',
                                        TfidfVectorizer(max_df=0.75, min_df=50,
                                                        ngram_range=(1, 2),
                                                        norm='l1',
                                                        stop_words='english')),
                                       ('multi-classifier',
                                        MultiOutputClassifier(estimator=LinearSVC(random_state=42)))]),
             n_jobs=-1,
             param_grid={'multi-classifier__estimator__C': (0.1, 0.5, 1.0),
                         'multi-classifier__estimator__multi_class': ('ovr',
                                                                      'crammer_singer'),
                         'vectorizer__max_df': (0.5, 0.75, 1.0),
                         'vectorizer__min_df': (50, 100, 200),
                         'vectorizer__ngram_range': ((1, 1), (1, 2)),
               

In [15]:
# Full parameter dict of the refit best pipeline; it includes untuned defaults,
# not only the keys that were searched.
best_parameters = grid_search.best_estimator_.get_params()

In [16]:
# Display the parameters of the winning pipeline.
best_parameters

{'memory': None,
 'steps': [('vectorizer',
   TfidfVectorizer(max_df=0.5, min_df=50, ngram_range=(1, 2), stop_words='english')),
  ('multi-classifier',
   MultiOutputClassifier(estimator=LinearSVC(random_state=42)))],
 'verbose': False,
 'vectorizer': TfidfVectorizer(max_df=0.5, min_df=50, ngram_range=(1, 2), stop_words='english'),
 'multi-classifier': MultiOutputClassifier(estimator=LinearSVC(random_state=42)),
 'vectorizer__analyzer': 'word',
 'vectorizer__binary': False,
 'vectorizer__decode_error': 'strict',
 'vectorizer__dtype': numpy.float64,
 'vectorizer__encoding': 'utf-8',
 'vectorizer__input': 'content',
 'vectorizer__lowercase': True,
 'vectorizer__max_df': 0.5,
 'vectorizer__max_features': None,
 'vectorizer__min_df': 50,
 'vectorizer__ngram_range': (1, 2),
 'vectorizer__norm': 'l2',
 'vectorizer__preprocessor': None,
 'vectorizer__smooth_idf': True,
 'vectorizer__stop_words': 'english',
 'vectorizer__strip_accents': None,
 'vectorizer__sublinear_tf': False,
 'vectorizer__t

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score

def print_classification_scores(y_test, pred, scores=None):
    """Print accuracy and micro-averaged precision, recall, F1 and ROC AUC.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels, in a form accepted by the sklearn metric functions.
    pred : array-like
        Predicted labels; used for accuracy, precision, recall and F1.
    scores : array-like, optional
        Continuous per-label scores (for example decision-function values) to
        use for the AUC. When omitted, the AUC is computed from ``pred``, which
        keeps the previous behaviour. Note that an AUC computed from hard 0/1
        predictions reflects a single operating point rather than the ranking
        quality of the model, so pass ``scores`` whenever they are available.

    Returns
    -------
    None
        The metrics are written to standard output.
    """
    print('Accuracy Score:', accuracy_score(y_test, pred))
    print('Precision Score:', precision_score(y_test, pred, average='micro'))
    print('Recall Score:', recall_score(y_test, pred, average='micro'))
    print('F1 Score:', f1_score(y_test, pred, average='micro'))
    # Fall back to the hard predictions so existing two-argument calls are unchanged.
    auc_input = pred if scores is None else scores
    print('AUC Score:', roc_auc_score(y_test, auc_input, average='micro'))

In [18]:
# Evaluate on the held-out split; GridSearchCV.predict delegates to the refit
# best pipeline.
pred = grid_search.predict(X_test)
print_classification_scores(y_test, pred)

Accuracy Score: 0.46045503791982667
Precision Score: 0.6736507936507936
Recall Score: 0.496026180458158
F1 Score: 0.5713516424340334
AUC Score: 0.7241792777464675


### Word Embeddings Approach

In [19]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [20]:
import gzip
import json
import matplotlib.pyplot as plt
import numpy as np
import re
import random
import pandas as pd
import seaborn as sns
import gensim
import nltk
import gensim.downloader
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

In [21]:
# Fetch the NLTK stop-word corpus (a no-op when it is already present locally).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/m.nguyen.2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# English stop words held in a set for fast membership tests during tokenisation.
stop_words = set(stopwords.words('english'))

In [23]:
# Tokenise every text into word-character runs and drop English stop words.
# NOTE(review): X still contains the rows held out as X_test, so anything
# trained on this list also sees the test texts — confirm this is intended.
all_tokenized_reviews = [
    [word for word in re.findall(r'\w+', text) if word not in stop_words]
    for text in tqdm(X)
]

100%|██████████| 9226/9226 [00:00<00:00, 117179.48it/s]


In [24]:
# Train Word2Vec on the tokenised corpus: 100-dimensional vectors, a context
# window of 2 tokens, and only words occurring at least 100 times are kept.
# workers=1 because gensim only makes a seeded run reproducible when training is
# single-threaded; with several workers, thread scheduling changes the result.
# (gensim also documents fixing PYTHONHASHSEED for full determinism.)
full_model = Word2Vec(sentences=all_tokenized_reviews, vector_size=100,
                      window=2, min_count=100, workers=1, seed=RANDOM_SEED)

In [25]:
# Persist the trained model to the working directory so it can be reloaded later.
full_model.save("word2vec_tweets.model")

In [26]:
# Keep only the word-vector lookup of the trained model for feature generation.
full_model_kv = full_model.wv

In [27]:
# Load the three pre-trained embedding sets that are compared further down.
gnews, glove_small, glove_big = (
    gensim.downloader.load(name)
    for name in (
        'word2vec-google-news-300',
        'glove-wiki-gigaword-100',
        'glove-wiki-gigaword-300',
    )
)

In [28]:
def generate_dense_features(tokenized_texts, word_vectors):
    """Turn tokenised documents into fixed-length vectors by averaging word vectors.

    Parameters
    ----------
    tokenized_texts : iterable of list of str
        One token list per document.
    word_vectors : mapping-like
        Word-vector lookup. It must support ``token in word_vectors``, indexing
        with a list of tokens (returning one row per token), and expose a
        ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Array with one row per document and ``word_vectors.vector_size`` columns.
        A document with no token in the vocabulary gets an all-zero row. Empty
        input yields an array of shape ``(0, vector_size)`` so that the result
        is always two-dimensional.
    """
    dim = word_vectors.vector_size
    doc_vectors = []
    for tokens in tokenized_texts:
        # Only tokens known to the embedding can contribute to the average.
        known = [tok for tok in tokens if tok in word_vectors]
        if known:
            doc_vectors.append(np.mean(word_vectors[known], axis=0))
        else:
            doc_vectors.append(np.zeros(dim))
    if not doc_vectors:
        # np.array([]) would be one-dimensional; keep the feature axis instead.
        return np.zeros((0, dim))
    return np.array(doc_vectors)


In [29]:
# Tokenise the training texts: word-character runs with English stop words removed.
tokenized_train_items = [
    [word for word in re.findall(r'\w+', text) if word not in stop_words]
    for text in tqdm(X_train)
]

100%|██████████| 7380/7380 [00:04<00:00, 1771.48it/s]


In [30]:
# Tokenise the held-out texts the same way as the training texts.
tokenized_test_items = [
    [word for word in re.findall(r'\w+', text) if word not in stop_words]
    for text in tqdm(X_test)
]

100%|██████████| 1846/1846 [00:00<00:00, 126316.32it/s]


In [31]:
# Training features: per-document average of the corpus-trained Word2Vec vectors.
X_train_wp = generate_dense_features(tokenized_train_items, full_model_kv)

In [32]:
# Test features: per-document average of the corpus-trained Word2Vec vectors.
X_test_wp = generate_dense_features(tokenized_test_items, full_model_kv)

In [34]:
# Fresh per-label LinearSVC ensemble, fitted on the corpus-trained Word2Vec features.
clf = LinearSVC(random_state=RANDOM_SEED)
multi_out_clf = MultiOutputClassifier(estimator=clf)
multi_out_clf.fit(X_train_wp, y_train)

MultiOutputClassifier(estimator=LinearSVC(random_state=42))

In [35]:
# Score the classifier trained on corpus-trained Word2Vec features on the held-out split.
pred = multi_out_clf.predict(X_test_wp)
print_classification_scores(y_test, pred)

Accuracy Score: 0.2746478873239437
Precision Score: 0.6702014846235419
Recall Score: 0.2954651706404862
F1 Score: 0.4101232965606749
AUC Score: 0.6333117376897136


Now we repeat the same process with the pre-trained embeddings, starting with the Google News word2vec vectors (`word2vec-google-news-300`).

In [36]:
# Rebind the training features to averages of the Google News word2vec vectors.
X_train_wp = generate_dense_features(tokenized_train_items, gnews)

In [37]:
# Rebind the test features to averages of the Google News word2vec vectors.
X_test_wp = generate_dense_features(tokenized_test_items, gnews)

In [38]:
# Fresh per-label LinearSVC ensemble, fitted on the Google News word2vec features.
clf = LinearSVC(random_state=RANDOM_SEED)
multi_out_clf = MultiOutputClassifier(estimator=clf)
multi_out_clf.fit(X_train_wp, y_train)

MultiOutputClassifier(estimator=LinearSVC(random_state=42))

In [39]:
# Score the classifier trained on Google News word2vec features on the held-out split.
pred = multi_out_clf.predict(X_test_wp)
print_classification_scores(y_test, pred)

Accuracy Score: 0.44257854821235104
Precision Score: 0.6970284237726099
Recall Score: 0.5044413277232351
F1 Score: 0.5852997016544617
AUC Score: 0.7304734692033592


GloVe (small): the same process with the `glove-wiki-gigaword-100` vectors.

In [40]:
# Rebind the training features to averages of the `glove_small` vectors.
X_train_wp = generate_dense_features(tokenized_train_items, glove_small)

In [41]:
# Rebind the test features to averages of the `glove_small` vectors.
X_test_wp = generate_dense_features(tokenized_test_items, glove_small)

In [42]:
# Fresh per-label LinearSVC ensemble, fitted on the glove_small features.
clf = LinearSVC(random_state=RANDOM_SEED)
multi_out_clf = MultiOutputClassifier(estimator=clf)
multi_out_clf.fit(X_train_wp, y_train)

MultiOutputClassifier(estimator=LinearSVC(random_state=42))

In [43]:
# Score the classifier trained on glove_small features on the held-out split.
pred = multi_out_clf.predict(X_test_wp)
print_classification_scores(y_test, pred)

Accuracy Score: 0.38353196099674974
Precision Score: 0.6579898770788142
Recall Score: 0.4254324450677887
F1 Score: 0.5167518455423056
AUC Score: 0.6907835507356934


GloVe (large): the same process with the `glove-wiki-gigaword-300` vectors.

In [44]:
# Rebind the training features to averages of the `glove_big` vectors.
X_train_wp = generate_dense_features(tokenized_train_items, glove_big)

In [45]:
# Rebind the test features to averages of the `glove_big` vectors.
X_test_wp = generate_dense_features(tokenized_test_items, glove_big)

In [46]:
# Fresh per-label LinearSVC ensemble, fitted on the glove_big features.
clf = LinearSVC(random_state=RANDOM_SEED)
multi_out_clf = MultiOutputClassifier(estimator=clf)
multi_out_clf.fit(X_train_wp, y_train)

MultiOutputClassifier(estimator=LinearSVC(random_state=42))

In [47]:
# Score the classifier trained on glove_big features on the held-out split.
pred = multi_out_clf.predict(X_test_wp)
print_classification_scores(y_test, pred)

Accuracy Score: 0.43932827735644636
Precision Score: 0.680973734785394
Recall Score: 0.49696119682094436
F1 Score: 0.5745945945945946
AUC Score: 0.7253886944876307
