In [30]:
import pandas as pd
import re
import spacy
import json
import string
import joblib

from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, classification_report, confusion_matrix
# Never truncate long text cells when a DataFrame is displayed.
# None means "no limit"; the old -1 sentinel has been deprecated since pandas 1.0
# (it emits a FutureWarning) and newer pandas releases reject negative values.
pd.set_option('display.max_colwidth', None)

  pd.set_option('display.max_colwidth', -1)


In [31]:
# Load the contraction -> full-form lookup table used by expandContractions().
# JSON is UTF-8 by specification, so do not rely on the platform's default encoding.
with open('cont_en.json', encoding='utf-8') as file:
    contEn = json.load(file)

# A single alternation over every contraction key. Keys are escaped so each one is
# matched literally even if it contains a regex metacharacter.
# NOTE(review): alternation takes the first key that matches, in the JSON's key
# order, so a short key listed before a longer one that starts with it wins.
# The order is left untouched so preprocessing stays the same as before.
c_re = re.compile('(%s)' % '|'.join(map(re.escape, contEn.keys())))

def expandContractions(text, c_re=c_re):
    """Lower-case ``text`` and expand its contractions (e.g. won't -> will not).

    Every substring matched by ``c_re`` is replaced with the value stored under
    that exact matched string in the module-level ``contEn`` table.

    Parameters:
        text: the string to process; it is lower-cased before matching.
        c_re: compiled pattern to search with; defaults to the module-level
            pattern built from the keys of ``contEn``.

    Returns:
        The lower-cased string with all matches replaced.

    Raises:
        KeyError: if ``c_re`` matches text that is not a key of ``contEn``.
    """
    lowered = text.lower()
    return c_re.sub(lambda hit: contEn[hit.group(0)], lowered)

def stars(text):
    """Replace every '*' in ``text`` with ' stars' (note the leading space).

    Text that contains no asterisk is returned unchanged.
    """
    return text.replace('*', ' stars') if '*' in text else text

def num2word(text):
    """Spell out a single digit from 1 to 5 as an English word.

    Any other value (longer strings, other digits, non-digits, the empty
    string) is returned unchanged.
    """
    digit_names = {'1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five'}
    # Only one-character tokens are candidates for conversion.
    if len(text) != 1:
        return text
    return digit_names.get(text, text)

def lemma(word):
    """Lemmatize a sequence of tokens with the module-level spaCy pipeline ``nlp``.

    The items of ``word`` are joined with single spaces and the resulting
    string is processed by ``nlp``.

    Returns:
        A list with one string per token of the processed document: the
        token's lemma, or its original text when that text contains an
        underscore.
    """
    parsed = nlp(" ".join(word))
    lemmas = []
    for tok in parsed:
        # Tokens containing '_' are kept verbatim instead of being lemmatized.
        lemmas.append(tok.text if '_' in tok.text else tok.lemma_)
    return lemmas

# spaCy pipeline used by lemma().
nlp = spacy.load("en_core_web_sm")

# num2word() spells the digits 1-5 out as words, so these words must not be
# treated as stop words.
number_words = {"one", "two", "three", "four", "five"}
nlp.Defaults.stop_words -= number_words
# Filter explicitly as well, so the list is right whether or not STOP_WORDS is the
# same set object that the previous line updates.
stopword = [w for w in STOP_WORDS if w not in number_words]

# Build the extended list as a NEW list. A plain `stopword2 = stopword` only
# aliases the same object, so extending it would silently add 'good' to
# `stopword` too.
stopword2 = stopword + ['good']

In [32]:
def clean_text(text):
    """Turn one raw text string into a list of cleaned tokens.

    Steps, in order:
      1. '*' becomes ' stars' (stars), then contractions are expanded and the
         text is lower-cased (expandContractions).
      2. The text is split on runs of non-word characters.
      3. Per token: the digits 1-5 are spelled out (num2word), tokens found in
         ``string.punctuation`` are dropped, and 've'/'games'/'mobile' are
         mapped to 'have'/'game'/'phone'.
      4. The tokens are re-joined, 'one one' is replaced by '1.1', and the
         text is split on whitespace again.
      5. Tokens in ``stopword2`` and tokens shorter than 2 or longer than 45
         characters are removed.

    Returns:
        The list of remaining tokens.
    """
    synonyms = {"ve": "have", "games": "game", "mobile": "phone"}

    expanded = expandContractions(stars(text))

    tokens = []
    for raw in re.split(r'\W+', expanded):
        tok = num2word(raw)
        # Substring test against the punctuation string; after the \W+ split
        # this should only ever hit the empty string or a lone '_'.
        if tok in string.punctuation:
            continue
        tokens.append(synonyms.get(tok, tok))

    # NOTE(review): this is a plain substring replacement, so it also fires across
    # word boundaries (e.g. 'phone one' -> 'ph1.1'). Left as is, because the saved
    # models were presumably fitted with this exact preprocessing — confirm before
    # changing it.
    rejoined = ' '.join(tokens).replace('one one', '1.1')

    return [tok for tok in rejoined.split()
            if tok not in stopword2 and 1 < len(tok) <= 45]

def tokenizer(bow):
    """Run the full text pipeline on one string: clean_text(), then lemma().

    Returns the list of strings produced by lemma().
    """
    return lemma(clean_text(bow))

#### Test Data (Not Part of the Training Dataset)

In [33]:
# Load the evaluation data; later cells read its 'content' and 'sentiment_score' columns.
data = pd.read_csv('test_set.csv')

In [34]:
# x is what the models predict on; y holds the reference labels the predictions
# are scored against.
x = data['content']
y = data['sentiment_score']

In [35]:
# Restore the saved logistic-regression model.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load model files that come from a trusted source.
# The model is later given the raw 'content' column, so presumably the saved object
# includes its own text vectorization and needs the `tokenizer` function defined in
# this notebook to be present when loading — confirm.
modelLR = joblib.load('LogRegModel')

In [36]:
# Restore the saved KNN classifier.
# NOTE(review): joblib.load unpickles the file; only load trusted model files.
modelKNN = joblib.load('KNNClassifier')

In [37]:
# Restore the saved SVC model.
# NOTE(review): joblib.load unpickles the file; only load trusted model files.
modelSVC = joblib.load('SVC')

In [38]:
# Predict a label for every row of x with each of the three loaded models.
ylr = modelLR.predict(x)
yknn = modelKNN.predict(x)
ysvc = modelSVC.predict(x)

In [39]:
# Print a classification report (precision / recall / F1 per class) for each model,
# comparing the reference labels y with that model's predictions.
# The third label previously read 'Super Vector Machine'; the model is a Support
# Vector Machine.
reports = [
    ('Logistic Regression', ylr),
    ('KNN', yknn),
    ('Support Vector Machine', ysvc),
]
for position, (title, predicted) in enumerate(reports):
    if position:
        # Separator between consecutive reports (not before the first one).
        print('==========')
    print(title)
    print(classification_report(y, predicted))

Logistic Regression
              precision    recall  f1-score   support

    Negative       0.85      0.84      0.84      2146
     Neutral       0.82      0.86      0.84      2147
    Positive       0.85      0.80      0.82      2147

    accuracy                           0.84      6440
   macro avg       0.84      0.84      0.84      6440
weighted avg       0.84      0.84      0.84      6440

KNN
              precision    recall  f1-score   support

    Negative       0.92      0.69      0.79      2146
     Neutral       0.82      0.85      0.84      2147
    Positive       0.69      0.84      0.76      2147

    accuracy                           0.79      6440
   macro avg       0.81      0.79      0.79      6440
weighted avg       0.81      0.79      0.79      6440

Super Vector Machine
              precision    recall  f1-score   support

    Negative       0.84      0.87      0.85      2146
     Neutral       0.86      0.86      0.86      2147
    Positive       0.85      0

In [40]:
# Class order used for both the rows and the columns of the confusion matrices below.
lab = ['Positive','Negative','Neutral']

In [41]:
# Confusion matrix for logistic regression. Rows are the reference labels (y),
# columns the predicted labels (ylr), both in the order given by `lab`.
cmlr = confusion_matrix(y,ylr,labels=lab)
print('Logistic Regression')
pd.DataFrame(cmlr, columns=lab,index=lab)

Logistic Regression


Unnamed: 0,Positive,Negative,Neutral
Positive,1715,193,239
Negative,161,1812,173
Neutral,152,138,1857


In [42]:
# Confusion matrix for KNN. Rows are the reference labels (y), columns the
# predicted labels (yknn), both in the order given by `lab`.
cmkn = confusion_matrix(y,yknn,labels=lab)
print('KNN')
pd.DataFrame(cmkn, columns=lab,index=lab)

KNN


Unnamed: 0,Positive,Negative,Neutral
Positive,1804,100,243
Negative,511,1478,157
Neutral,286,32,1829


In [43]:
# Confusion matrix for the SVC model. Rows are the reference labels (y), columns
# the predicted labels (ysvc), both in the order given by `lab`.
cmsvc = confusion_matrix(y,ysvc,labels=lab)
# Label corrected: the model is a Support (not "Super") Vector Machine.
print('Support Vector Machine')
pd.DataFrame(cmsvc, columns=lab,index=lab)

Super Vector Machine


Unnamed: 0,Positive,Negative,Neutral
Positive,1774,213,160
Negative,149,1858,139
Neutral,161,132,1854


#### SVC performs better than the Logistic Regression and KNN classifier models, based on the confusion matrices and classification reports

In [44]:
# Attach the SVC predictions to the loaded data as a new 'predict_score' column.
data['predict_score'] = ysvc

In [45]:
# Export the dataset, including the 'predict_score' column, for further analysis.
# The row index is not written to the file.

data.to_csv('hasilprediksi.csv',index=False)