# Loading the dataset

In [11]:
%load_ext autoreload
%autoreload 2 

import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

# Column names of the tweet dataset; not referenced by any other visible cell.
cols = ['sentiment','id','date','query_string','user','text']
# Path of the CSV, relative to the notebook's working directory.
filename = os.path.join("data", "merge_dataset.csv")
# The file is decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
dataset = pd.read_csv(filename, encoding="ISO-8859-1")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Preprocess the data
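- X = raw tweet text (tokenisation is currently disabled in `preprocess`)
- Y = integer sentiment labels as a column vector (one-hot encoding is currently disabled)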


In [17]:
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm 

def preprocess(df, tknzr=None, split=0.1, random_state=None):
    """Shuffle ``df`` and return tweet texts and sentiment labels, optionally split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'text'`` and a ``'sentiment'`` column.
    tknzr : object, optional
        Accepted for backward compatibility and currently unused: texts are
        returned untokenised. The default is ``None`` so that no tokenizer is
        constructed when the function is defined.
    split : None, number or sequence of numbers
        ``None`` returns the whole shuffled data as ``(x, y)``. Otherwise each
        fraction yields one consecutive chunk of ``int(fraction * len(df))``
        rows. A bare number is treated as a one-element sequence (it used to
        raise ``TypeError``). Because every chunk size is truncated, fractions
        summing to 1 may leave a few trailing rows unassigned.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` to make the shuffle reproducible.

    Returns
    -------
    tuple or list
        ``(x, y)`` when ``split`` is ``None``; otherwise the flat list
        ``[x_0, y_0, x_1, y_1, ...]``. Each ``x_i`` is a 1-D array of texts and
        each ``y_i`` is an ``(n_i, 1)`` column of labels.
    """
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    x = shuffled['text'].values
    y = shuffled['sentiment'].values.reshape((-1, 1))
    if split is None:
        return x, y

    # A scalar cannot be passed to list(); wrap it instead.
    fractions = [split] if np.isscalar(split) else list(split)
    sizes = [int(fraction * len(x)) for fraction in fractions]
    bounds = np.cumsum([0] + sizes)

    ret = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        ret += [x[start:stop], y[start:stop]]
    return ret

# 80 % train / 10 % validation / 10 % test.
x_train, y_train, x_valid, y_valid, x_test, y_test = preprocess(dataset, split=[0.8, 0.1, 0.1])
# preprocess returns labels as (n, 1) column vectors; scikit-learn estimators and
# metrics expect 1-D targets and otherwise emit a DataConversionWarning.
y_train, y_valid, y_test = (labels.ravel() for labels in (y_train, y_valid, y_test))

# Training a logistic regression with embedding
See details in ["Learning to Generate Reviews and Discovering Sentiment"](https://arxiv.org/abs/1704.01444).

In [None]:
from app.classifier import Embedding

# Instantiate the project-local Embedding class with its default arguments;
# the following cell calls its transform() method on the raw tweet texts.
embd_model = Embedding()

In [None]:
# Logistic regression on top of the neural embedding of each tweet.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Embedding with neural networks
X_train = embd_model.transform(x_train)
X_dev = embd_model.transform(x_valid)   # the validation split is named x_valid / y_valid
X_test = embd_model.transform(x_test)

# scikit-learn expects 1-D targets; np.ravel is a no-op if they are already flat.
y_train_flat = np.ravel(y_train)
y_dev_flat = np.ravel(y_valid)

# Find best hyper-parameters for the logistic regression
# (np.float was removed from NumPy; the builtin float is equivalent.)
C = 2 ** np.arange(-8, 1).astype(float)
scores = []
best_model = None
for c in C:
    # 'lbfgs' does not support the L1 penalty; 'saga' does, including multiclass.
    candidate = LogisticRegression(C=c, penalty='l1',
                                   solver='saga', multi_class='auto')
    candidate.fit(X_train, y_train_flat)
    candidate_score = candidate.score(X_dev, y_dev_flat)
    # Strict '>' keeps the first best candidate, matching np.argmax below.
    if best_model is None or candidate_score > max(scores):
        best_model = candidate
    scores.append(candidate_score)

# Keep the best classifier found during the search instead of refitting it.
c = C[np.argmax(scores)]
model = best_model
nnotzero = np.sum(model.coef_ != 0)

score = accuracy_score(y_dev_flat, model.predict(X_dev)) * 100.

# Classification results
print(f'Accuracy: {score:05.2f}')
print(f'Regularization L1: {c:05.2f}')
print(f'Used features: {nnotzero:05d}')

# Write results on test set.
# Predictions get their own name so the ground-truth y_test used by later cells
# is not overwritten. model.predict already returns class labels, so they are
# written as-is (np.argmax of a scalar label is always 0).
y_test_pred = model.predict(X_test)
with open('logreg_embdnn_y_test_sst.txt', 'w') as f:
    f.write('\n'.join(str(pred) for pred in y_test_pred))

# Train a Naive Bayes Classifier

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report

# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# Candidate hyper-parameters keyed by '<step name>__<parameter>';
# defined here but not used by this cell.
tuned_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    tfidf__use_idf=(True, False),
    tfidf__norm=('l1', 'l2'),
    clf__alpha=[1, 1e-1, 1e-2],
)

# NOTE(review): this cell fits on the validation split, not the training split.
text_clf.fit(x_valid, y_valid)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, text_clf.predict(x_test), digits=4))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0     0.7325    0.8314    0.7788     80253
           2     0.0000    0.0000    0.0000        22
           4     0.8036    0.6946    0.7451     79784

   micro avg     0.7631    0.7631    0.7631    160059
   macro avg     0.5120    0.5087    0.5080    160059
weighted avg     0.7678    0.7631    0.7619    160059



  'precision', 'predicted', average, warn_for)


Sanity check:

In [29]:
# Spot-check the fitted pipeline on a few hand-written sentences.
text_clf.predict(["I am happy", "I am sad", "I love trump", "I do not love my mum"])

array([4, 0, 4, 0])

# Grid search to find best hyperparameters
We could also use genetic algorithms

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import re

# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
# Search space, keyed by '<step name>__<parameter>' of the pipeline above.
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

# Macro-averaged F1 weights every class equally, including the rare one.
score = 'f1_macro'
text_clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring=score)
# The search has to be fitted before it can predict; otherwise predict() raises
# NotFittedError. np.ravel gives scikit-learn the 1-D targets it expects.
text_clf.fit(x_train, np.ravel(y_train))

print(classification_report(y_test, text_clf.predict(x_test), digits=4))

# Training with imbalance

In [28]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.metrics import classification_report_imbalanced
import re

# TF-IDF features feeding a multinomial Naive Bayes model, built with imblearn's
# pipeline factory and fitted on the training split (fit returns the pipeline).
text_clf = make_pipeline_imb(TfidfVectorizer(), MultinomialNB()).fit(x_train, y_train)
# Imbalance-aware report on the test split.
print(classification_report_imbalanced(y_test, text_clf.predict(x_test)))

  y = column_or_1d(y, warn=True)


                   pre       rec       spe        f1       geo       iba       sup

          0       0.75      0.82      0.73      0.78      0.77      0.60     80253
          2       0.00      0.00      1.00      0.00      0.00      0.00        22
          4       0.80      0.73      0.82      0.76      0.77      0.59     79784

avg / total       0.78      0.77      0.77      0.77      0.77      0.60    160059



  'precision', 'predicted', average, warn_for)


# With Random Undersampler

In [30]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import RandomUnderSampler
import re


# TF-IDF features -> random undersampling of the larger classes -> Naive Bayes.
# The sampler is seeded so the resampled training set is reproducible.
text_clf = make_pipeline_imb(TfidfVectorizer(),
                             RandomUnderSampler(random_state=0),
                             MultinomialNB())


# Search space for a later grid search. make_pipeline names each step after its
# lowercased class name, so the keys must use those names.
tuned_parameters = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidfvectorizer__use_idf': (True, False),
    'tfidfvectorizer__norm': ('l1', 'l2'),
    'multinomialnb__alpha': [1, 1e-1, 1e-2]
}


# Scoring name for a later grid search; neither it nor tuned_parameters is used in this cell.
score = 'f1_macro'
text_clf.fit(x_train, y_train)

print(classification_report_imbalanced(y_test, text_clf.predict(x_test)))

  y = column_or_1d(y, warn=True)


                   pre       rec       spe        f1       geo       iba       sup

          0       0.63      0.68      0.60      0.65      0.64      0.41     80253
          2       0.00      0.82      0.92      0.00      0.87      0.75        22
          4       0.65      0.50      0.74      0.57      0.61      0.36     79784

avg / total       0.64      0.59      0.67      0.61      0.62      0.39    160059



In [32]:
# Spot-check the fitted pipeline on a single hand-written sentence.
text_clf.predict(["Paris is in France"])

array([4])

# Export model with joblib
joblib is more efficient than pickle for objects that hold large NumPy arrays.

In [29]:
from joblib import dump, load
# joblib.dump does not create missing directories, so make sure the target exists.
# NOTE(review): text_clf is reassigned by several cells; confirm it holds the
# intended model when this cell runs.
os.makedirs('model', exist_ok=True)
dump(text_clf, 'model/multinomial-nb.joblib')

['model/multinomial-nb.joblib']

# With Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report


# Shallow random forest (depth 5) without class re-weighting.
rnm_frt = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=0,
    class_weight=None,
)

# Bag-of-words counts -> TF-IDF weighting -> random forest, fitted on the
# training split (fit returns the pipeline itself).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', rnm_frt),
]).fit(x_train, y_train)

print(classification_report(y_test, text_clf.predict(x_test), digits=4))

  self._final_estimator.fit(Xt, y, **fit_params)


              precision    recall  f1-score   support

           0     0.7081    0.6162    0.6590     80253
           2     0.0000    0.0000    0.0000        22
           4     0.6584    0.7445    0.6988     79784

   micro avg     0.6801    0.6801    0.6801    160059
   macro avg     0.4555    0.4536    0.4526    160059
weighted avg     0.6832    0.6801    0.6787    160059



  'precision', 'predicted', average, warn_for)


In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report


# Deeper random forest (depth 30) with per-bootstrap class re-weighting.
rnm_frt = RandomForestClassifier(
    n_estimators=100,
    max_depth=30,
    random_state=0,
    class_weight="balanced_subsample",
)

# Bag-of-words counts -> TF-IDF weighting -> random forest.
rf_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', rnm_frt),
])

# Fitted on the validation split; the alternative is rf_clf.fit(x_train, y_train).
rf_clf.fit(x_valid, y_valid)

print(classification_report(y_test, rf_clf.predict(x_test), digits=4))

  self._final_estimator.fit(Xt, y, **fit_params)


              precision    recall  f1-score   support

           0     0.7281    0.7486    0.7382     80253
           2     0.0400    0.0909    0.0556        22
           4     0.7397    0.7184    0.7289     79784

   micro avg     0.7335    0.7335    0.7335    160059
   macro avg     0.5026    0.5193    0.5075    160059
weighted avg     0.7337    0.7335    0.7335    160059



In [57]:
from joblib import dump, load
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs('model', exist_ok=True)
dump(rf_clf, 'model/random-forest.joblib')

['model/random-forest.joblib']

# With Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report


# Gradient boosting with 200 depth-3 trees and a fixed seed.
gb = GradientBoostingClassifier(
    n_estimators=200,
    random_state=0,
    max_depth=3,
)

# Bag-of-words counts -> TF-IDF weighting -> gradient boosting.
gb_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', gb),
])

# Fitted on the training split; the alternative is gb_clf.fit(x_valid, y_valid).
gb_clf.fit(x_train, y_train)

print(classification_report(y_test, gb_clf.predict(x_test), digits=4))

In [None]:
from joblib import dump, load
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs('model', exist_ok=True)
dump(gb_clf, 'model/gradient-boosting.joblib')

In [52]:
# Spot-check the gradient-boosting pipeline on a single hand-written sentence.
gb_clf.predict(["I love Paris"])

array([4])