# Loading the dataset

In [None]:
%load_ext autoreload
%autoreload 2 

import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

cols = ['sentiment','id','date','query_string','user','text']
data = os.path.join("data", "sentiment140")
datasets = os.listdir(data)
dataset = pd.read_csv(os.path.join(data, datasets[1]), header=None, names=cols, encoding="ISO-8859-1")

In [None]:
dataset.head()

# Preprocess the data
- X = tokens from tweets 
- Y = one hot encoding of sentiments


In [9]:
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm 

def preprocess(df, tknzr=TweetTokenizer(strip_handles=True, reduce_len=True), split=0.1):
    df = df.sample(frac=1).reset_index(drop=True)
    x = df['text'].values#np.array([tknzr.tokenize(t) for t in tqdm(df['text'])])
    y = df['sentiment'].values.reshape((-1,1))
#     y = OneHotEncoder(categories='auto').fit_transform(y).toarray()
    if split is None:
        return x, y
    if not isinstance(split, (list,)):
        split = list(split)
    split = [0] + [int(s*len(x)) for s in split]
    split = np.cumsum(split)
    ret = []
    for i in range(len(split) - 1):
        ret += [x[split[i]:split[i+1]], y[split[i]:split[i+1]]]
    return ret

x_train, y_train, x_valid, y_valid, x_test, y_test = preprocess(dataset, split=[0.8,0.1,0.1])

# Training a logistic regression with embedding
See details on ["Learning to Generate Reviews and Discovering Sentiment"](https://arxiv.org/abs/1704.01444)

In [None]:
from app.classifier import Embedding

embd_model = Embedding()

In [None]:
# Embedding with neural networks
X_train = embd_model.transform(x_train)
X_dev = embd_model.transform(x_dev)
X_test = embd_model.transform(x_test)

# Find best hyper-parameters for the logistic regression 
C = 2**np.arange(-8, 1).astype(np.float)
scores = []
for i, c in enumerate(C):
    model = LogisticRegression(C=c, penalty='l1', \
                solver='lbfgs', multi_class='auto')
    model.fit(X_train, y_train)
    score = model.score(X_dev, y_dev)
    scores.append(score)
    
# Train (again) on the best classifier
c = C[np.argmax(scores)]
model = LogisticRegression(C=c, penalty='l1', \
                solver='lbfgs', multi_class='auto')
model.fit(X_train, y_train)
nnotzero = np.sum(model.coef_ != 0)

score = accuracy_score(y_dev, model.predict(X_dev))*100.

# Classification results
print(f'Accuracy: {score:05.2f}')
print(f'Regularization L1: {c:05.2f}')
print(f'Used features: {nnotzero:05d}')

# Write results on test set
y_test = model.predict(X_test)
lines = '\n'.join([str(np.argmax(pred)) for pred in y_test])
with open('logreg_embdnn_y_test_sst.txt','w') as f:
    f.writelines(lines)

# Train a Naive Bayes Classifier

In [14]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import re

def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = re.sub('@[^\s]+','USER', text)
    text = text.lower().replace("ё", "е")
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    return text.strip()
# data = [preprocess_text(t) for t in raw_data]




text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}


score = 'f1_macro'
# clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring=score)
text_clf.fit(x_valid, y_valid)


print(classification_report(y_test, text_clf.predict(x_test), digits=4))


              precision    recall  f1-score   support

           0     0.7313    0.8264    0.7760     79538
           4     0.8031    0.6999    0.7480     80462

   micro avg     0.7628    0.7628    0.7628    160000
   macro avg     0.7672    0.7631    0.7620    160000
weighted avg     0.7674    0.7628    0.7619    160000



In [29]:
text_clf.predict(["I am happy", "I am sad", "I love trump", "I do not love my mum"])

array([4, 0, 4, 0])

In [32]:
from joblib import dump, load
dump(text_clf, 'model/multinomial-nb.joblib') 

['model/multinomial-nb.joblib']

# Injecting the predictor on the code

In [6]:
from app.load import predictor, multinomialnb
stc = "I love this company #bestcompanyever"
multinomialnb.predict([stc])

# predictor([])

array([4])