In [7]:
import pandas as pd
import re
import spacy
import json
import string
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, classification_report, confusion_matrix
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [8]:
data = pd.read_csv('bigram.csv')

# Token

In [9]:
with open('cont_en.json') as file:
    contEn = json.load(file)
    
c_re = re.compile('(%s)' % '|'.join(contEn.keys()))

def expandContractions(text, c_re=c_re):         # expanding contractiion into full form word. ex: won't -> will not
    def replace(match):
        return contEn[match.group(0)]
    return c_re.sub(replace, text.lower())

def stars(text):
    if '*' in text:
        word = text.replace('*',' stars') # replacing * symbol with "star"
    else:
        word = text
    return word

def num2word(text):                        #convert number to string
    if len(text) == 1 and text in '12345':
        if text == '1':
            word = 'one'
        elif text == '2':
            word = 'two'
        elif text == '3':
            word = 'three'
        elif text == '4':
            word = 'four'
        elif text == '5':
            word = 'five'
        else:
            word = text
    else:
        word = text
    return word

def lemma(word):
    lemma_doc = nlp(" ".join(word)) 
    lemma_text = [token.text if '_' in token.text else token.lemma_ for token in lemma_doc]
    return lemma_text

nlp = spacy.load("en_core_web_sm")

nlp.Defaults.stop_words -= {"one", "two","three","four","five"} #removing these word from stop_word
stopword = list(STOP_WORDS)

stopword2 = stopword
stopword2.extend(['good'])

In [10]:
def clean_text(text):
    text = stars(text)
    text = expandContractions(text)
    text = re.split(r'\W+',text)
    text = [num2word(x) for x in text]
    text = [x for x in text if x not in string.punctuation]
    text = ["have" if x == "ve" else x for x in text]
    text = ["game" if x == "games" else x for x in text]
    text = ["phone" if x == "mobile" else x for x in text]
    text = ' '.join(text).replace('one one','1.1').split()
    text = [x for x in text if x not in stopword2 and len(x) > 1 and len(x) <= 45]
    return text

def tokenizer(bow):
    text = clean_text(bow)
    text = lemma(text) #,allowed_postags=['NOUN','VERB']
    return text

# Oversampling

In [11]:
pos = data[data.sentiment_score == "Positive"]
neu = data[data.sentiment_score == "Neutral"]
neg = data[data.sentiment_score == "Negative"]

In [12]:
neuOS = resample(neu, n_samples = len(pos), random_state=42)
negOS = resample(neg, n_samples = len(pos), random_state=42)

In [13]:
dataOS = pd.concat([pos,neuOS,negOS],axis=0)

# Model Selection

In [14]:
x = dataOS['content']
y = dataOS['sentiment_score']

In [15]:
cv = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopword, ngram_range=(1,3))

In [16]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=.7, random_state = 42, stratify = y)

# Pipeline

In [15]:
from sklearn.pipeline import make_pipeline

In [16]:
lrpipe = make_pipeline(cv,LogisticRegression(verbose=1, n_jobs=-1))

In [17]:
lrpipe.fit(xtrain,ytrain)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   11.7s finished


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(1, 3),
                                 stop_words=['always', 'unless', 'could', 'the',
                                             'doing', 'above', 'that',
                                             'thereupon', 'hence', 'please',
                                             'whole', 'as', 'ours', 'off',
                                             'amongst', '’ll', 'whither', 'our',
                                             '‘m', 'once', 'well', 'has',
                                             'would', 'other', 'every',
                                             'almost', 'must', 'amount', 'here',
                                             'moreover', ...],
                                 tokenizer=<function tokenizer at 0x000001ADD253E0D0>)),
                ('logisticregression',
                 LogisticRegression(n_jobs=-1, verbose=1))])

In [18]:
ypred = lrpipe.predict(xtest)

In [19]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

    Negative       0.84      0.84      0.84      2146
     Neutral       0.81      0.84      0.82      2147
    Positive       0.83      0.79      0.81      2147

    accuracy                           0.82      6440
   macro avg       0.82      0.82      0.82      6440
weighted avg       0.82      0.82      0.82      6440



In [20]:
lab = ['Positive','Negative','Neutral']

In [21]:
cm = confusion_matrix(ytest,ypred,labels=lab)
pd.DataFrame(cm, columns=lab,index=lab)

Unnamed: 0,Positive,Negative,Neutral
Positive,1698,194,255
Negative,170,1793,183
Neutral,186,152,1809


In [22]:
text = ['This is an okay game. Not bad at all']
lrpipe.predict(text)

array(['Neutral'], dtype=object)

In [23]:
lrpipe.predict_proba(text)

array([[0.39820669, 0.49280325, 0.10899006]])

# Hyper Parameter Tuning

#### Note:
penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2'
    Used to specify the norm used in the penalization. The 'newton-cg',
    'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is
    only supported by the 'saga' solver. If 'none' (not supported by the
    liblinear solver), no regularization is applied.

In [33]:
logisticregression__C = [1.0, 0.5, 1.5]
logisticregression__solver = ['lbfgs', 'newton-cg', 'sag']
logisticregression__penalty = ['l2']

In [34]:
score = []
for C in logisticregression__C:
    for solver in logisticregression__solver:
        for penalty in logisticregression__penalty:
            print(f'Now running C {C}, solver {solver}, penalty {penalty}')
            params = {
                        'logisticregression__C': [C],
                        'logisticregression__solver': [solver],
                        'logisticregression__penalty': [penalty]
                    }
            gridLR = GridSearchCV(estimator=lrpipe, param_grid=params, cv=4, verbose=1, n_jobs=-1)
            gridLR.fit(xtrain,ytrain)
            ygrid = gridLR.predict(xtest)
            recall = recall_score(ytest,ygrid,average='macro')
            score.append([C,solver,penalty,recall])
            print('================')

Now running C 1.0, solver lbfgs, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   11.2s finished


Now running C 1.0, solver newton-cg, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.9s finished


Now running C 1.0, solver sag, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 20 epochs took 0 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


Now running C 0.5, solver lbfgs, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   10.3s finished


Now running C 0.5, solver newton-cg, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.1s finished


Now running C 0.5, solver sag, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 19 epochs took 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


Now running C 1.5, solver lbfgs, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   11.2s finished


Now running C 1.5, solver newton-cg, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.3s finished


Now running C 1.5, solver sag, penalty l2
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 24 epochs took 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.7s finished




In [35]:
pd.DataFrame(score, columns=['C','solver','penalty','recall'])

Unnamed: 0,C,solver,penalty,recall
0,1.0,lbfgs,l2,0.822983
1,1.0,newton-cg,l2,0.822983
2,1.0,sag,l2,0.822983
3,0.5,lbfgs,l2,0.793325
4,0.5,newton-cg,l2,0.793325
5,0.5,sag,l2,0.793325
6,1.5,lbfgs,l2,0.836026
7,1.5,newton-cg,l2,0.835871
8,1.5,sag,l2,0.835871


### Best Model LR Parameter: C=1.5, Solver=lbfgs, penalty=l2

In [36]:
params = {
        'logisticregression__C': [1.5],
        'logisticregression__solver': ['lbfgs'],
        'logisticregression__penalty': ['l2']}
gridLR = GridSearchCV(estimator=lrpipe, param_grid=params, cv=4, verbose=1, n_jobs=-1)
gridLR.fit(xtrain,ytrain)
ygrid = gridLR.predict(xtest)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   11.7s finished


In [37]:
print(classification_report(ytest,ygrid))

              precision    recall  f1-score   support

    Negative       0.85      0.84      0.84      2146
     Neutral       0.82      0.86      0.84      2147
    Positive       0.85      0.80      0.82      2147

    accuracy                           0.84      6440
   macro avg       0.84      0.84      0.84      6440
weighted avg       0.84      0.84      0.84      6440



In [38]:
cm = confusion_matrix(ytest,ygrid,labels=lab)
pd.DataFrame(cm, columns=lab,index=lab)

Unnamed: 0,Positive,Negative,Neutral
Positive,1715,193,239
Negative,161,1812,173
Neutral,152,138,1857


In [40]:
model = gridLR.best_estimator_

## Note:

- Hyper Parameter Tuning improve accuracy, precision, recall and f1-score up to 0.2 point.
- Best parameter for Logistic Regression in this projects are c=1.5, solver=lbfgs, and penalty=l2

## Export Model

In [41]:
import joblib

In [42]:
joblib.dump(model, "LogRegModel")

['LogRegModel']

In [43]:
gridLR.best_score_

0.7932778586113873