In [1]:
import nltk # Imports the library
# nltk.download_shell() #Download the necessary datasets
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import re
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Widen pandas' display limits so wide result tables are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [3]:
# Load the pre-cleaned tweet dataset; the first CSV column is used as the index.
df = pd.read_csv(
    'sentiment_analysis_ID_clean.csv',
    index_col=0,
)

In [4]:
# Keep only the label column and the text column produced by each preprocessing stage.
df = df.loc[:, [
    'sentimen',
    'Tweet',
    'clean_link',
    'hapus_punc',
    'substitute_slang',
    'stemming',
    'eliminate_stop',
    'eliminate_noise',
]]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10806 entries, 0 to 10805
Data columns (total 8 columns):
sentimen            10806 non-null int64
Tweet               10806 non-null object
clean_link          10806 non-null object
hapus_punc          10806 non-null object
substitute_slang    10804 non-null object
stemming            10804 non-null object
eliminate_stop      10673 non-null object
eliminate_noise     10270 non-null object
dtypes: int64(1), object(7)
memory usage: 759.8+ KB


## 2. Initial ML Modeling 

In [6]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split on the fully preprocessed text column.
# A fixed seed makes the split reproducible across kernel restarts.
# NOTE(review): df['eliminate_noise'] still contains missing values at this point
# (10270 non-null of 10806 rows per df.info() above); drop them before vectorising.
X_train, X_test, y_train, y_test = train_test_split(
    df['eliminate_noise'],
    df['sentimen'],
    test_size=0.3,
    random_state=42,
)

In [7]:
# Modelling frame: same columns as df, restricted to rows with no missing value in any column.
temp = df[[
    'sentimen', 'Tweet', 'clean_link', 'hapus_punc',
    'substitute_slang', 'stemming', 'eliminate_stop', 'eliminate_noise',
]].dropna()

In [8]:
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, matthews_corrcoef,precision_score,recall_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

Using TensorFlow backend.


In [28]:
def run_cv(splits, X, Y, pipeline, random_state=None):
    """Cross-validate ``pipeline`` with shuffled K-fold and average the per-fold scores.

    Parameters
    ----------
    splits : int
        Number of folds, passed to ``KFold(n_splits=...)``.
    X, Y : pandas.Series
        Inputs and labels; fold rows are selected positionally with ``.iloc``.
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted on every fold,
        so the object passed in is left fitted on the last fold's training rows.
    random_state : int or None, default None
        Seed forwarded to ``KFold``. ``None`` keeps the previous behaviour
        (a different shuffle on every call); pass an int for reproducible folds.

    Returns
    -------
    dict
        Fold-averaged scores in percent under the keys ``acc_mean``,
        ``precision_{negative,neutral,positive}_mean`` and
        ``recall_{negative,neutral,positive}_mean``.

    Notes
    -----
    The negative/neutral/positive keys are filled from positions 0, 1 and 2 of the
    sorted unique labels in ``Y`` -- assumes three sentiment classes whose sorted
    order is negative, neutral, positive (TODO confirm the label encoding).
    """
    # Fix the label order once so every fold reports one score per class in the same
    # position, even if a class happens to be absent from a fold's test rows and predictions.
    class_labels = np.unique(Y)

    kf = KFold(n_splits=splits, shuffle=True, random_state=random_state)
    accuracy = []
    precision_all = []
    recall_all = []

    for train, test in kf.split(X, Y):
        fitted = pipeline.fit(X.iloc[train], Y.iloc[train])
        y_true = Y.iloc[test]
        prediction = fitted.predict(X.iloc[test])

        accuracy.append(accuracy_score(y_true, prediction) * 100)
        # average=None returns one score per entry of `labels`.
        precision_all.append(
            precision_score(y_true, prediction, labels=class_labels, average=None) * 100
        )
        recall_all.append(
            recall_score(y_true, prediction, labels=class_labels, average=None) * 100
        )

    # Shape (n_folds, n_classes): column k holds the score of class_labels[k] per fold.
    precision_all = np.array(precision_all)
    recall_all = np.array(recall_all)

    sentiment_names = ('negative', 'neutral', 'positive')

    res = {'acc_mean': np.mean(accuracy)}
    for position, sentiment in enumerate(sentiment_names):
        res['precision_{}_mean'.format(sentiment)] = np.mean(precision_all[:, position])
    for position, sentiment in enumerate(sentiment_names):
        res['recall_{}_mean'.format(sentiment)] = np.mean(recall_all[:, position])

    return res


In [19]:
def list_model(n=1):
    """Build the grid of candidate text-classification pipelines.

    Every pipeline starts with ``CountVectorizer(ngram_range=(1, n))`` and ends with a
    classifier; the TF-IDF step and the SMOTE oversampling step are each switched on
    or off, giving four variants per classifier.

    Parameters
    ----------
    n : int, default 1
        Upper bound of the n-gram range used by the vectorizer. The default keeps
        argument-less calls such as ``list_model()`` working (unigrams only).

    Returns
    -------
    dict
        Maps a name to an unfitted ``Pipeline``. Names are, for each prefix in
        ``NB``, ``LogReg``, ``XGB`` (in that order): ``<prefix>``,
        ``<prefix>_no_smote``, ``<prefix>_smote_no_tfidf`` and
        ``<prefix>_no_smote_no_tfidf``.
    """
    # Classifier classes are instantiated per pipeline so no estimator is shared.
    classifier_factories = (
        ('NB', MultinomialNB),
        ('LogReg', LogisticRegression),
        ('XGB', XGBClassifier),
    )
    # (key suffix, include TF-IDF step, include SMOTE step)
    variants = (
        ('', True, True),
        ('_no_smote', True, False),
        ('_smote_no_tfidf', False, True),
        ('_no_smote_no_tfidf', False, False),
    )

    total = {}
    for prefix, make_classifier in classifier_factories:
        for suffix, use_tfidf, use_smote in variants:
            # strings to token integer counts
            steps = [('bow', CountVectorizer(ngram_range=(1, n)))]
            if use_tfidf:
                # integer counts to weighted TF-IDF scores
                steps.append(('tfidf', TfidfTransformer()))
            if use_smote:
                steps.append(('sampling_1', SMOTE()))
            steps.append(('classifier', make_classifier()))
            total[prefix + suffix] = Pipeline(steps)
    return total



In [17]:
def try_various_model(dict_model):
    """Score every pipeline in ``dict_model`` with 5-fold ``run_cv``.

    Each pipeline is evaluated on the notebook-level ``temp`` frame, using the
    ``eliminate_noise`` column as input and ``sentimen`` as the label.

    Returns a dict mapping each key of ``dict_model`` to the dict returned by ``run_cv``.
    """
    return {
        model_name: run_cv(5, temp['eliminate_noise'], temp['sentimen'], model)
        for model_name, model in dict_model.items()
    }

# list_model requires the n-gram upper bound; calling it with no argument raises
# TypeError, so the unigram setting this cell is meant to evaluate is passed explicitly.
varmodel = pd.DataFrame(try_various_model(list_model(1)))
varmodel.T
## ngram 1

Unnamed: 0,acc_mean,logloss_mean,logloss_std,precision_negative_mean,precision_neutral_mean,precision_positive_mean,recall_negative_mean,recall_neutral_mean,recall_positive_mean
NB,57.020448,0.918731,0.008405,48.350259,69.346396,48.882807,57.231734,57.726855,55.298549
NB_no_smote,60.223953,0.883084,0.007434,59.5724,60.333075,60.488897,39.543573,85.045001,33.302873
NB_smote_no_tfidf,56.105161,1.054965,0.023944,47.879156,68.74697,47.678295,57.017248,55.990179,55.383988
NB_no_smote_no_tfidf,59.814995,0.983186,0.023817,53.516529,65.265915,53.31737,50.699566,72.377875,44.788757
LogReg,58.033106,0.912484,0.008816,50.768626,68.606893,49.260832,55.812742,60.530549,55.42592
LogReg_no_smote,60.983447,0.871404,0.004514,59.389772,61.444174,61.30224,40.327607,83.61488,38.47188
LogReg_smote_no_tfidf,55.073028,0.97953,0.009248,48.325553,65.558093,46.630042,55.41489,56.375096,52.080875
LogReg_no_smote_no_tfidf,60.136319,0.904091,0.016509,57.186753,62.189932,56.587387,44.188348,78.096533,41.706302
XGB,57.799416,0.995475,0.004805,54.470896,59.453428,54.500281,34.138243,81.208582,37.005246
XGB_no_smote,55.618306,0.956385,0.006006,62.660306,54.096809,65.267803,17.626938,93.835748,21.065679


In [18]:
varmodel.T.describe()

Unnamed: 0,acc_mean,logloss_mean,logloss_std,precision_negative_mean,precision_neutral_mean,precision_positive_mean,recall_negative_mean,recall_neutral_mean,recall_positive_mean
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,57.895975,0.950901,0.010152,55.241628,62.394303,55.361599,42.010877,75.026068,41.123542
std,1.99825,0.05438,0.007298,5.983204,5.236347,6.518212,14.183268,14.155581,12.057442
min,55.073028,0.871404,0.002409,47.879156,54.096809,46.630042,17.626938,55.990179,21.065679
25%,56.127069,0.910386,0.005641,50.164034,59.408362,49.166325,34.165272,59.829625,35.675439
50%,57.804284,0.955068,0.007919,54.41174,61.817053,55.128039,42.257977,79.652557,40.089091
75%,59.895326,0.986258,0.011064,59.435429,66.320293,60.692233,55.514353,83.97241,52.885294
max,60.983447,1.054965,0.023944,66.426699,69.346396,65.267803,57.231734,93.995685,55.42592


In [20]:
def try_various_model(dict_model):
    """Run 5-fold ``run_cv`` for each named pipeline and collect the score dicts.

    NOTE(review): this repeats the definition given in an earlier cell of this
    notebook; one of the two can be deleted.

    Input text is ``temp['eliminate_noise']`` and the label is ``temp['sentimen']``,
    both read from the notebook-level ``temp`` frame.
    """
    return dict(
        (label, run_cv(5, temp['eliminate_noise'], temp['sentimen'], candidate))
        for label, candidate in dict_model.items()
    )

# Repeat the model comparison with unigrams + bigrams (ngram_range = (1, 2)).
varmodel = pd.DataFrame(try_various_model(list_model(n=2)))
varmodel.transpose()

Unnamed: 0,acc_mean,logloss_mean,logloss_std,precision_negative_mean,precision_neutral_mean,precision_positive_mean,recall_negative_mean,recall_neutral_mean,recall_positive_mean
NB,58.081792,0.907466,0.006498,50.494953,69.253747,49.532497,58.269569,59.46288,55.111928
NB_no_smote,56.835443,0.915546,0.014454,68.645216,54.511617,71.863871,23.345378,94.811684,17.799797
NB_smote_no_tfidf,59.298929,1.172235,0.039062,52.606045,66.18728,52.356721,54.468677,67.434004,48.297549
NB_no_smote_no_tfidf,60.360273,1.183969,0.023635,58.134861,61.052176,60.233504,43.032807,82.358083,35.389966
LogReg,59.182084,0.901768,0.010377,51.578004,68.145876,51.926463,56.060396,63.494603,53.975743
LogReg_no_smote,61.129503,0.874924,0.009562,59.771295,61.565213,61.145143,40.235683,83.909684,38.608733
LogReg_smote_no_tfidf,55.900682,0.988297,0.019512,49.329787,65.335421,47.951932,53.720942,58.106358,53.971642
LogReg_no_smote_no_tfidf,61.908471,0.891106,0.013093,61.173513,63.341785,58.055756,46.021836,79.69624,43.841668
XGB,57.692308,0.996503,0.003342,54.117031,59.171205,55.223937,31.606194,82.225756,37.241432
XGB_no_smote,55.93963,0.953138,0.003623,63.316341,54.549976,62.523984,17.809324,93.27202,23.268949


In [None]:
# Voting classifier: combine every pipeline from list_model into a single ensemble

In [21]:
def createListModel(n):
    """Return the pipelines of ``list_model(n)`` as a list of ``(name, pipeline)`` tuples.

    This is the ``estimators`` format expected by ``VotingClassifier``.
    """
    return list(list_model(n).items())

In [30]:
# Soft voting over every pipeline built with ngram_range = (1, 2).
voting_estimator = VotingClassifier(
    estimators=createListModel(2),
    voting='soft',
)

In [31]:
# 5-fold cross-validation of the voting ensemble on the fully preprocessed text.
run_cv(
    splits=5,
    X=temp['eliminate_noise'],
    Y=temp['sentimen'],
    pipeline=voting_estimator,
)

{'acc_mean': 61.94741966893865,
 'precision_negative_mean': 59.49107846951064,
 'precision_neutral_mean': 63.384330495742134,
 'precision_positive_mean': 59.84864062091047,
 'recall_negative_mean': 45.71663821604773,
 'recall_neutral_mean': 80.92232869915145,
 'recall_positive_mean': 41.9450912595738}

In [32]:
# Same ensemble with hard voting, evaluated with 5-fold cross-validation.
voting_estimator = VotingClassifier(
    estimators=createListModel(2),
    voting='hard',
)
run_cv(
    splits=5,
    X=temp['eliminate_noise'],
    Y=temp['sentimen'],
    pipeline=voting_estimator,
)

{'acc_mean': 62.15189873417721,
 'precision_negative_mean': 61.08429402819544,
 'precision_neutral_mean': 62.44360738374559,
 'precision_positive_mean': 62.509585381591805,
 'recall_negative_mean': 44.74711139766547,
 'recall_neutral_mean': 83.58721314046825,
 'recall_positive_mean': 38.427342203966234}

In [None]:
def check_ngram_soft(n):
    """Cross-validate the soft-voting ensemble for n-gram upper bounds 1 .. n-1.

    Returns a dict keyed ``'ngram (1,<upper>)'`` whose values are the score dicts
    returned by 5-fold ``run_cv`` on ``temp['eliminate_noise']`` / ``temp['sentimen']``.
    """
    def _score(upper):
        # Ensemble of every pipeline built with ngram_range = (1, upper).
        ensemble = VotingClassifier(estimators=createListModel(upper), voting='soft')
        return run_cv(5, temp['eliminate_noise'], temp['sentimen'], ensemble)

    return {'ngram (1,{})'.format(upper): _score(upper) for upper in range(1, n)}
        
# Stored under its own name: assigning the result to `check_ngram_soft` would replace
# the function with a DataFrame, so the cell could not be run a second time.
check_ngram_soft_results = pd.DataFrame(check_ngram_soft(5))
check_ngram_soft_results

In [None]:
def check_ngram_hard(n):
    """Cross-validate the hard-voting ensemble for n-gram upper bounds 1 .. n-1.

    Returns a dict keyed ``'ngram (1,<upper>)'`` whose values are the score dicts
    returned by 5-fold ``run_cv`` on ``temp['eliminate_noise']`` / ``temp['sentimen']``.
    """
    scores_by_ngram = {}
    for upper_bound in range(1, n):
        label = 'ngram (1,{})'.format(upper_bound)
        ensemble = VotingClassifier(estimators=createListModel(upper_bound), voting='hard')
        scores_by_ngram[label] = run_cv(5, temp['eliminate_noise'], temp['sentimen'], ensemble)
    return scores_by_ngram
        
# Stored under its own name: assigning the result to `check_ngram_hard` would replace
# the function with a DataFrame, so the cell could not be run a second time.
check_ngram_hard_results = pd.DataFrame(check_ngram_hard(5))
check_ngram_hard_results

In [13]:
# Candidate input columns: every column of temp except the label.
list_column = [column for column in temp.columns if column != 'sentimen']
list_column

['Tweet',
 'clean_link',
 'hapus_punc',
 'substitute_slang',
 'stemming',
 'eliminate_stop',
 'eliminate_noise']

In [14]:
def try_various_columns(list_column, pipeline):
    """Score ``pipeline`` with 5-fold ``run_cv`` once per text column.

    Each name in ``list_column`` selects a column of the notebook-level ``temp``
    frame as input; ``temp['sentimen']`` is always the label.

    Returns a dict mapping each column name to the dict returned by ``run_cv``.
    """
    return {
        column: run_cv(5, temp[column], temp['sentimen'], pipeline)
        for column in list_column
    }

# `pipeline` was never defined at notebook scope, so this call raised NameError.
# NOTE(review): a unigram TF-IDF + SMOTE + MultinomialNB pipeline is used as the
# baseline here -- confirm this is the model the column comparison was meant to use.
pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('sampling_1', SMOTE()),
    ('classifier', MultinomialNB()),
])
res = try_various_columns(list_column[:8], pipeline)

In [None]:
# One column per text column tried, one row per metric.
recap = pd.DataFrame(data=res)
recap

In [None]:
def try_various_ngram (max):
    """Compare text columns for n-gram upper bounds 1 .. max-1.

    For each upper bound a CountVectorizer -> TF-IDF -> two SMOTE('minority') passes ->
    MultinomialNB pipeline is built and scored with ``try_various_columns`` on
    ``list_column[5:9]`` (read from the notebook-level ``list_column``).

    Returns a list with one ``try_various_columns`` result per upper bound, in order.
    """
    def _build_pipeline(upper):
        return Pipeline([
            ('bow', CountVectorizer(ngram_range=(1, upper))),  # strings to token integer counts
            ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
            ('sampling_1', SMOTE(sampling_strategy='minority')),
            ('sampling_2', SMOTE(sampling_strategy='minority')),
            ('classifier', MultinomialNB()),  # Naive Bayes trained on the resampled TF-IDF vectors
        ])

    collected = []
    for upper in range(1, max):
        collected.append(try_various_columns(list_column[5:9], _build_pipeline(upper)))
        print('ngram', upper, 'done')
    return collected


In [None]:
def try_various_model(list_model):
    """Score each classifier in ``list_model`` inside a fixed bigram TF-IDF + SMOTE pipeline.

    NOTE(review): this redefines ``try_various_model`` from earlier cells with a
    different argument (an iterable of estimators instead of a dict of pipelines),
    and the parameter name hides the ``list_model`` function inside this body.

    Returns a dict keyed by the estimator objects themselves; each value is the
    dict returned by 5-fold ``run_cv`` on ``temp['eliminate_noise']`` / ``temp['sentimen']``.
    """
    scores = {}
    for estimator in list_model:
        steps = [
            ('bow', CountVectorizer(ngram_range=(1, 2))),  # strings to token integer counts
            ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
            ('sampling_1', SMOTE()),
            ('classifier', estimator),  # the classifier under evaluation
        ]
        scores[estimator] = run_cv(5, temp['eliminate_noise'], temp['sentimen'], Pipeline(steps))
        print(estimator, 'done')
    return scores