In [2]:
import pandas as pd
import re
import spacy
import json
import string
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, classification_report, confusion_matrix
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [3]:
data = pd.read_csv('bigram.csv')

# Token

In [5]:
with open('cont_en.json') as file:
    contEn = json.load(file)
    
c_re = re.compile('(%s)' % '|'.join(contEn.keys()))

def expandContractions(text, c_re=c_re):         # expanding contractiion into full form word. ex: won't -> will not
    def replace(match):
        return contEn[match.group(0)]
    return c_re.sub(replace, text.lower())

def stars(text):
    if '*' in text:
        word = text.replace('*',' stars') # replacing * symbol with "star"
    else:
        word = text
    return word

def num2word(text):                        #convert number to string
    if len(text) == 1 and text in '12345':
        if text == '1':
            word = 'one'
        elif text == '2':
            word = 'two'
        elif text == '3':
            word = 'three'
        elif text == '4':
            word = 'four'
        elif text == '5':
            word = 'five'
        else:
            word = text
    else:
        word = text
    return word

def lemma(word):
    lemma_doc = nlp(" ".join(word)) 
    lemma_text = [token.text if '_' in token.text else token.lemma_ for token in lemma_doc]
    return lemma_text

nlp = spacy.load("en_core_web_sm")

nlp.Defaults.stop_words -= {"one", "two","three","four","five"} #removing those word from stop_word
stopword = list(STOP_WORDS)

stopword2 = stopword
stopword2.extend(['good'])

In [6]:
def clean_text(text):
    text = stars(text)
    text = expandContractions(text)
    text = re.split(r'\W+',text)
    text = [num2word(x) for x in text]
    text = [x for x in text if x not in string.punctuation]
    text = ["have" if x == "ve" else x for x in text]
    text = ["game" if x == "games" else x for x in text]
    text = ["phone" if x == "mobile" else x for x in text]
    text = ' '.join(text).replace('one one','1.1').split()
    text = [x for x in text if x not in stopword2 and len(x) > 1 and len(x) <= 45]
    return text

def tokenizer(bow):
    text = clean_text(bow)
    text = lemma(text) #,allowed_postags=['NOUN','VERB']
    return text

## Oversampling

In [7]:
pos = data[data.sentiment_score == "Positive"]
neu = data[data.sentiment_score == "Neutral"]
neg = data[data.sentiment_score == "Negative"]

In [8]:
neuOS = resample(neu, n_samples = len(pos), random_state=42)
negOS = resample(neg, n_samples = len(pos), random_state=42)

In [9]:
dataOS = pd.concat([pos,neuOS,negOS],axis=0)

## Model Selection

In [10]:
x = dataOS['content']
y = dataOS['sentiment_score']

In [11]:
cv = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopword, ngram_range=(1,3))

In [12]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=.7, random_state = 42, stratify = y)

## Pipeline

In [13]:
from sklearn.pipeline import make_pipeline

In [15]:
knnpipe = make_pipeline(cv,KNeighborsClassifier())

In [16]:
knnpipe.fit(xtrain,ytrain)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(1, 3),
                                 stop_words=['over', 'is', 'using', 'toward',
                                             'themselves', 'when', 'below',
                                             'too', 'fifty', 'upon', 'under',
                                             'my', 'becomes', '‘ve', 'many',
                                             'keep', 'everywhere', 'full',
                                             'again', 'off', 'all', 'empty',
                                             'did', 'sometimes', 'after',
                                             'most', 'beside', 'except',
                                             'those', 'moreover', ...],
                                 tokenizer=<function tokenizer at 0x000001A2D6860430>)),
                ('kneighborsclassifier', KNeighborsClassifier())])

In [17]:
ypred = knnpipe.predict(xtest)

In [18]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

    Negative       0.60      0.65      0.63      2146
     Neutral       0.57      0.66      0.61      2147
    Positive       0.71      0.54      0.61      2147

    accuracy                           0.62      6440
   macro avg       0.63      0.62      0.62      6440
weighted avg       0.63      0.62      0.62      6440



In [19]:
lab = ['Positive','Negative','Neutral']

In [20]:
cm = confusion_matrix(ytest,ypred,labels=lab)
pd.DataFrame(cm, columns=lab,index=lab)

Unnamed: 0,Positive,Negative,Neutral
Positive,1155,414,578
Negative,245,1401,500
Neutral,237,501,1409


In [38]:
text = ['This is an okay game. Not bad at all']
knnpipe.predict(text)

array(['Negative'], dtype=object)

In [39]:
knnpipe.predict_proba(text)

array([[0.8, 0.2, 0. ]])

## Hyper Parameter Tuning

In [42]:
# knnpipe.get_params()

In [41]:
kneighborsclassifier__leaf_size = [30, 20, 10]
kneighborsclassifier__n_neighbors = [5, 10, 15]
kneighborsclassifier__p = [1,2]
kneighborsclassifier__weights= ['uniform', 'distance']

In [43]:
score = []
for leaf in kneighborsclassifier__leaf_size:
    for n in kneighborsclassifier__n_neighbors:
        for p in kneighborsclassifier__p:
            for weight in kneighborsclassifier__weights:
                print(f'Now running leaf_size {leaf}, n_num {n}, p {p}, weight {weight}')
                params = {
                            'kneighborsclassifier__leaf_size': [leaf],
                            'kneighborsclassifier__n_neighbors': [n],
                            'kneighborsclassifier__p': [p],
                            'kneighborsclassifier__weights' : [weight]
                        }
                gridKNN = GridSearchCV(estimator=knnpipe, param_grid=params, cv=4, verbose=1, n_jobs=-1)
                gridKNN.fit(xtrain,ytrain)
                ygrid = gridKNN.predict(xtest)
                recall = recall_score(ytest,ygrid,average='macro')
                score.append([leaf,n,p,weight,recall])
                print('================')

Now running leaf_size 30, n_num 5, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 30, n_num 5, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.4min finished


Now running leaf_size 30, n_num 5, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 30, n_num 5, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.4min finished


Now running leaf_size 30, n_num 10, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 30, n_num 10, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.4min finished


Now running leaf_size 30, n_num 10, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished


Now running leaf_size 30, n_num 10, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 30, n_num 15, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.4min finished


Now running leaf_size 30, n_num 15, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.4min finished


Now running leaf_size 30, n_num 15, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished


Now running leaf_size 30, n_num 15, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished


Now running leaf_size 20, n_num 5, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 20, n_num 5, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 20, n_num 5, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 20, n_num 5, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 20, n_num 10, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 20, n_num 10, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 20, n_num 10, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 20, n_num 10, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 20, n_num 15, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 20, n_num 15, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 20, n_num 15, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 20, n_num 15, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 10, n_num 5, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 10, n_num 5, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 10, n_num 5, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 10, n_num 5, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.4min finished


Now running leaf_size 10, n_num 10, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 10, n_num 10, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 10, n_num 10, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 10, n_num 10, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 10, n_num 15, p 1, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 10, n_num 15, p 1, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


Now running leaf_size 10, n_num 15, p 2, weight uniform
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished


Now running leaf_size 10, n_num 15, p 2, weight distance
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.3min finished




In [45]:
pd.DataFrame(score, columns=['leaf','n','p','weight','recall'])

Unnamed: 0,leaf,n,p,weight,recall
0,30,5,1,uniform,0.520468
1,30,5,1,distance,0.790048
2,30,5,2,uniform,0.615689
3,30,5,2,distance,0.75513
4,30,10,1,uniform,0.482105
5,30,10,1,distance,0.793617
6,30,10,2,uniform,0.583702
7,30,10,2,distance,0.768951
8,30,15,1,uniform,0.480556
9,30,15,1,distance,0.779488


In [None]:
KNeighborsClassifier()

In [46]:
params = {
            'kneighborsclassifier__leaf_size': [30],
            'kneighborsclassifier__n_neighbors': [10],
            'kneighborsclassifier__p': [1],
            'kneighborsclassifier__weights' : ['distance']
        }
gridKNN = GridSearchCV(estimator=knnpipe, param_grid=params, cv=4, verbose=1, n_jobs=-1)
gridKNN.fit(xtrain,ytrain)
ygrid = gridKNN.predict(xtest)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished


In [47]:
print(classification_report(ytest,ygrid))

              precision    recall  f1-score   support

    Negative       0.92      0.69      0.79      2146
     Neutral       0.82      0.85      0.84      2147
    Positive       0.69      0.84      0.76      2147

    accuracy                           0.79      6440
   macro avg       0.81      0.79      0.79      6440
weighted avg       0.81      0.79      0.79      6440



In [48]:
cm = confusion_matrix(ytest,ygrid,labels=lab)
pd.DataFrame(cm, columns=lab,index=lab)

Unnamed: 0,Positive,Negative,Neutral
Positive,1804,100,243
Negative,511,1478,157
Neutral,286,32,1829


## Note:

- Hyper Parameter Tuning improve precision score up to 0.18 point and improve recall, accuracy and f1-score up to 0.17 point
- Best parameter for KNNClassifier in this project are : leaf_size = 30, n_neighbour = 10, p = 1, and weights = distance

## Export Model

In [49]:
import joblib

In [51]:
model = gridKNN.best_estimator_

In [52]:
joblib.dump(model, "KNNClassifier")

['KNNClassifier']

In [53]:
gridKNN.best_score_

0.6898571371069353