In [90]:
import pandas as pd
import re
import spacy
import json
import string
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, classification_report, confusion_matrix
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [91]:
data = pd.read_csv('bigram.csv')

# Token

In [92]:
with open('cont_en.json') as file:
    contEn = json.load(file)
    
c_re = re.compile('(%s)' % '|'.join(contEn.keys()))

def expandContractions(text, c_re=c_re):         # expanding contractiion into full form word. ex: won't -> will not
    def replace(match):
        return contEn[match.group(0)]
    return c_re.sub(replace, text.lower())

def stars(text):
    if '*' in text:
        word = text.replace('*',' stars') # replacing * symbol with "star"
    else:
        word = text
    return word

def num2word(text):                        #convert number to string
    if len(text) == 1 and text in '12345':
        if text == '1':
            word = 'one'
        elif text == '2':
            word = 'two'
        elif text == '3':
            word = 'three'
        elif text == '4':
            word = 'four'
        elif text == '5':
            word = 'five'
        else:
            word = text
    else:
        word = text
    return word

def lemma(word):
    lemma_doc = nlp(" ".join(word)) 
    lemma_text = [token.text if '_' in token.text else token.lemma_ for token in lemma_doc]
    return lemma_text

nlp = spacy.load("en_core_web_sm")

nlp.Defaults.stop_words -= {"one", "two","three","four","five"} #removing those word from stop_word
stopword = list(STOP_WORDS)

stopword2 = stopword
stopword2.extend(['good'])

In [93]:
def clean_text(text):
    text = stars(text)
    text = expandContractions(text)
    text = re.split(r'\W+',text)
    text = [num2word(x) for x in text]
    text = [x for x in text if x not in string.punctuation]
    text = ["have" if x == "ve" else x for x in text]
    text = ["game" if x == "games" else x for x in text]
    text = ["phone" if x == "mobile" else x for x in text]
    text = ' '.join(text).replace('one one','1.1').split()
    text = [x for x in text if x not in stopword2 and len(x) > 1 and len(x) <= 45]
    return text

def tokenizer(bow):
    text = clean_text(bow)
    text = lemma(text) #,allowed_postags=['NOUN','VERB']
    return text

# Oversampling

In [94]:
pos = data[data.sentiment_score == "Positive"]
neu = data[data.sentiment_score == "Neutral"]
neg = data[data.sentiment_score == "Negative"]

In [95]:
neuOS = resample(neu, n_samples = len(pos), random_state=42)
negOS = resample(neg, n_samples = len(pos), random_state=42)

In [96]:
dataOS = pd.concat([pos,neuOS,negOS],axis=0)

# Model Selection

In [97]:
x = dataOS['content']
y = dataOS['sentiment_score']

In [98]:
cv = TfidfVectorizer(min_df=5, sublinear_tf=True, tokenizer=tokenizer, stop_words=stopword, ngram_range=(1,3))

In [99]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=.7, random_state = 42, stratify = y)

# Pipeline

In [100]:
from sklearn.pipeline import make_pipeline

In [101]:
SVCpipe = make_pipeline(cv,SVC(verbose=1, random_state=42, probability=True))

In [102]:
SVCpipe.fit(xtrain,ytrain)

[LibSVM]

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(min_df=5, ngram_range=(1, 3),
                                 stop_words=['very', "n't", 'too', 'ever',
                                             'perhaps', '’re', 'us', 'become',
                                             'nor', 'moreover', 'most',
                                             'thereupon', 'itself', 'empty',
                                             'whereafter', 'make', 'except',
                                             'whole', 'themselves', 'so', 'is',
                                             'whereas', 'twelve', 'made',
                                             'elsewhere', 'herself',
                                             'therefore', 'done', 'further',
                                             'upon', ...],
                                 sublinear_tf=True,
                                 tokenizer=<function tokenizer at 0x00000146CE28AEE0>)),
                ('svc

In [103]:
ypred = SVCpipe.predict(xtest)

In [104]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

    Negative       0.83      0.86      0.85      2146
     Neutral       0.85      0.85      0.85      2147
    Positive       0.85      0.81      0.83      2147

    accuracy                           0.84      6440
   macro avg       0.84      0.84      0.84      6440
weighted avg       0.84      0.84      0.84      6440



In [105]:
lab = ['Positive','Negative','Neutral']

In [106]:
cm = confusion_matrix(ytest,ypred,labels=lab)
pd.DataFrame(cm, columns=lab,index=lab)

Unnamed: 0,Positive,Negative,Neutral
Positive,1746,219,182
Negative,150,1849,147
Neutral,167,159,1821


In [107]:
text = ['Awfull gacha rates']
SVCpipe.predict(text)

array(['Negative'], dtype=object)

In [108]:
SVCpipe.predict_proba(text)

array([[0.94676819, 0.02186936, 0.03136245]])

# Hyper Parameter Tuning

In [111]:
# SVCpipe.get_params()

In [112]:
svc__gamma = ['scale', 0.5, 1]
svc__kernel = ['linear', 'rbf', 'sigmoid']
svc__C = [1.0, 0.5, 1.5]

In [113]:
score = []
for gamma in svc__gamma:
    for kernel in svc__kernel:
        for C in svc__C:
            print(f'Now running gamma {gamma}, kernel {kernel}, C {C}')
            params = {
                       'svc__gamma': [gamma],
                       'svc__kernel': [kernel],
                       'svc__C': [C]}
            gridSVC = GridSearchCV(estimator=SVCpipe, param_grid=params, cv=4, verbose=1, n_jobs=-1)
            gridSVC.fit(xtrain,ytrain)
            ygrid = gridSVC.predict(xtest)
            recall = recall_score(ytest,ygrid,average='macro')
            score.append([gamma,kernel,C,recall])
            print('================')

Now running gamma scale, kernel linear, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.3min finished


Now running gamma scale, kernel linear, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.5min finished


Now running gamma scale, kernel linear, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  7.5min finished


Now running gamma scale, kernel rbf, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  8.0min finished


Now running gamma scale, kernel rbf, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  7.4min finished


Now running gamma scale, kernel rbf, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  8.7min finished


Now running gamma scale, kernel sigmoid, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  7.5min finished


Now running gamma scale, kernel sigmoid, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  5.7min finished


Now running gamma scale, kernel sigmoid, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.7min finished


Now running gamma 0.5, kernel linear, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  6.7min finished


Now running gamma 0.5, kernel linear, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  5.6min finished


Now running gamma 0.5, kernel linear, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  6.7min finished


Now running gamma 0.5, kernel rbf, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  8.2min finished


Now running gamma 0.5, kernel rbf, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  8.4min finished


Now running gamma 0.5, kernel rbf, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 10.3min finished


Now running gamma 0.5, kernel sigmoid, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  5.1min finished


Now running gamma 0.5, kernel sigmoid, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  5.0min finished


Now running gamma 0.5, kernel sigmoid, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.5min finished


Now running gamma 1, kernel linear, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.4min finished


Now running gamma 1, kernel linear, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.6min finished


Now running gamma 1, kernel linear, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.4min finished


Now running gamma 1, kernel rbf, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.9min finished


Now running gamma 1, kernel rbf, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  5.2min finished


Now running gamma 1, kernel rbf, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  5.0min finished


Now running gamma 1, kernel sigmoid, C 1.0
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.3min finished


Now running gamma 1, kernel sigmoid, C 0.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.4min finished


Now running gamma 1, kernel sigmoid, C 1.5
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.6min finished




In [114]:
pd.DataFrame(score, columns=['gamma','kernel','C','recall'])

Unnamed: 0,gamma,kernel,C,recall
0,scale,linear,1.0,0.767707
1,scale,linear,0.5,0.738669
2,scale,linear,1.5,0.788979
3,scale,rbf,1.0,0.840997
4,scale,rbf,0.5,0.794571
5,scale,rbf,1.5,0.851866
6,scale,sigmoid,1.0,0.713204
7,scale,sigmoid,0.5,0.697364
8,scale,sigmoid,1.5,0.712739
9,0.5,linear,1.0,0.767707


### Best Parameter for SVC Model : gamma = scale, kernel= rbf, c = 1.5

In [115]:
params = {
           'svc__gamma': ['scale'],
           'svc__kernel': ['rbf'],
           'svc__C': [1.5]}
gridSVC = GridSearchCV(estimator=SVCpipe, param_grid=params, cv=4, verbose=1, n_jobs=-1)
gridSVC.fit(xtrain,ytrain)
ygrid = gridSVC.predict(xtest)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  5.1min finished


[LibSVM]

In [116]:
print(classification_report(ytest,ygrid))

              precision    recall  f1-score   support

    Negative       0.84      0.87      0.85      2146
     Neutral       0.86      0.86      0.86      2147
    Positive       0.85      0.83      0.84      2147

    accuracy                           0.85      6440
   macro avg       0.85      0.85      0.85      6440
weighted avg       0.85      0.85      0.85      6440



In [117]:
cm = confusion_matrix(ytest,ygrid,labels=lab)
pd.DataFrame(cm, columns=lab,index=lab)

Unnamed: 0,Positive,Negative,Neutral
Positive,1774,213,160
Negative,149,1858,139
Neutral,161,132,1854


In [119]:
model = gridSVC.best_estimator_

## Note:

- Hyper Parameter Tuning improve accuracy, precision, recall and f1-score up to 0.1 point.
- Best parameter for SVC in thie project are gamma = scale, kernel= rbf and c = 1.5

## Export Model

In [120]:
import joblib

In [121]:
joblib.dump(model, "SVC")

['SVC']

In [126]:
gridSVC.best_score_

0.7988686471798613