In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed train / test / validation splits in one pass.
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_1.csv",
        "../data/processed/test_1.csv",
        "../data/processed/validation_1.csv",
    )
)

In [3]:
from sklearn.model_selection import train_test_split
# Model input is the 'review' column; the target is the 'sentiment' column.
X, y = train['review'], train['sentiment']

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

print("Creating tfidf..")

# TF-IDF features over word uni-, bi- and tri-grams, with the vocabulary
# capped at 100,000 terms and no stop-word removal.
tvec = TfidfVectorizer(
    analyzer="word",
    stop_words=None,
    max_features=100000,
    ngram_range=(1, 3),
)

# Regularization penalty space
penalty = ['l1', 'l2']

# Regularization strength space: 10 log-spaced values from 1e0 to 1e4
C = np.logspace(0, 4, 10)

# Hyperparameter grid searched by GridSearchCV (10 C values x 2 penalties)
hyperparameters = dict(C=C, penalty=penalty)

# The solver is pinned to 'liblinear' because the grid contains the 'l1'
# penalty: scikit-learn's default solver since 0.22 ('lbfgs') only supports
# 'l2'/no penalty, so every 'l1' candidate would fail to fit. 'liblinear'
# handles both penalties and was the default solver in older releases.
logistic = LogisticRegression(solver='liblinear')
lr = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

Creating tfidf..


In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

def _take_rows(data, positions):
    """Select entries of `data` by integer position (pandas object or array-like)."""
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return np.asarray(data)[positions]


def lr_cv(splits, X, Y, pipeline, average_method):
    """Run stratified k-fold cross-validation of `pipeline` and print the metrics.

    For every fold the pipeline is re-fitted on the training part and evaluated
    on the held-out part. Per-class precision/recall/F1 are printed for each
    fold, followed by the mean and standard deviation (in percent) of accuracy,
    precision, recall and F1 over all folds.

    Parameters
    ----------
    splits : int
        Number of folds passed to ``StratifiedKFold``.
    X : pandas.Series or array-like
        Samples handed to ``pipeline.fit`` / ``pipeline.predict``.
    Y : pandas.Series or array-like
        Class labels aligned with ``X``.
    pipeline : estimator
        Object exposing ``fit``, ``predict`` and ``score``; it is fitted in
        place on every fold.
    average_method : str
        ``average`` argument forwarded to the precision/recall/F1 scorers for
        the aggregated values.

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    The per-fold header is hard-coded to 'negative    positive', i.e. it
    assumes a binary problem whose sorted labels are negative then positive
    -- TODO confirm against the label encoding of the data.
    """
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy, precision, recall, f1 = [], [], [], []

    for train_idx, test_idx in kfold.split(X, Y):
        # The splitter yields integer *positions*. Plain `X[idx]` on a pandas
        # Series is a label lookup, which fails or selects the wrong rows when
        # the index is not 0..n-1, so rows are taken positionally instead.
        X_train, Y_train = _take_rows(X, train_idx), _take_rows(Y, train_idx)
        X_test, Y_test = _take_rows(X, test_idx), _take_rows(Y, test_idx)

        lr_fit = pipeline.fit(X_train, Y_train)
        prediction = lr_fit.predict(X_test)

        # `score` is whatever the pipeline's final estimator reports; it is
        # labelled as accuracy in the summary below.
        accuracy.append(lr_fit.score(X_test, Y_test) * 100)
        precision.append(precision_score(Y_test, prediction, average=average_method) * 100)
        recall.append(recall_score(Y_test, prediction, average=average_method) * 100)
        f1.append(f1_score(Y_test, prediction, average=average_method) * 100)

        # Per-class breakdown for this fold (average=None -> one value per class).
        print('              negative    positive')
        print('precision:', precision_score(Y_test, prediction, average=None))
        print('recall:   ', recall_score(Y_test, prediction, average=None))
        print('f1 score: ', f1_score(Y_test, prediction, average=None))
        print('-'*50)

    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))

In [6]:
from sklearn.pipeline import Pipeline
# Baseline without any resampling: TF-IDF features fed straight into the
# grid-searched logistic regression, evaluated with 5-fold CV.
original_pipeline = Pipeline(steps=[('vectorizer', tvec), ('classifier', lr)])
lr_cv(splits=5, X=X, Y=y, pipeline=original_pipeline, average_method='macro')

              negative    positive
precision: [0.72706682 0.80921339]
recall:    [0.59389454 0.88540181]
f1 score:  [0.65376782 0.84559491]
--------------------------------------------------
              negative    positive
precision: [0.71490281 0.81443756]
recall:    [0.61239593 0.87446505]
f1 score:  [0.65969108 0.84338454]
--------------------------------------------------
              negative    positive
precision: [0.6987041  0.80779451]
recall:    [0.59851989 0.86733238]
f1 score:  [0.6447434  0.83650539]
--------------------------------------------------
              negative    positive
precision: [0.70964247 0.8119469 ]
recall:    [0.60648148 0.87256301]
f1 score:  [0.65401897 0.84116434]
--------------------------------------------------
              negative    positive
precision: [0.70819672 0.80952381]
recall:    [0.6        0.87303852]
f1 score:  [0.64962406 0.84008236]
--------------------------------------------------
accuracy: 78.21% (+/- 0.37%)
precision: 76.11

In [7]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
# Over-sampling variants: the sampler sits between the vectorizer and the
# classifier. Both pipelines are built from the same tvec / lr instances.
ROS_pipeline, SMOTE_pipeline = (
    make_pipeline(tvec, sampler_cls(random_state=777), lr)
    for sampler_cls in (RandomOverSampler, SMOTE)
)

In [8]:
# 5-fold evaluation of the random over-sampling pipeline (macro-averaged metrics).
lr_cv(splits=5, X=X, Y=y, pipeline=ROS_pipeline, average_method='macro')

              negative    positive
precision: [0.65613718 0.82947977]
recall:    [0.67252544 0.81883024]
f1 score:  [0.66423024 0.8241206 ]
--------------------------------------------------
              negative    positive
precision: [0.63301141 0.82396088]
recall:    [0.66697502 0.80123633]
f1 score:  [0.64954955 0.81243973]
--------------------------------------------------
              negative    positive
precision: [0.63101604 0.81910766]
recall:    [0.65494912 0.80313837]
f1 score:  [0.64275987 0.81104442]
--------------------------------------------------
              negative    positive
precision: [0.64674398 0.82783705]
recall:    [0.6712963  0.81169757]
f1 score:  [0.65879146 0.81968788]
--------------------------------------------------
              negative    positive
precision: [0.6362054  0.82841691]
recall:    [0.67685185 0.80123633]
f1 score:  [0.65589951 0.81459995]
--------------------------------------------------
accuracy: 76.01% (+/- 0.59%)
precision: 73.32

In [9]:
# 5-fold evaluation of the SMOTE pipeline (macro-averaged metrics).
lr_cv(splits=5, X=X, Y=y, pipeline=SMOTE_pipeline, average_method='macro')

              negative    positive
precision: [0.63734115 0.8014808 ]
recall:    [0.60314524 0.82358535]
f1 score:  [0.61977186 0.81238274]
--------------------------------------------------
              negative    positive
precision: [0.61660777 0.81335283]
recall:    [0.64569843 0.79362815]
f1 score:  [0.63081789 0.80336943]
--------------------------------------------------
              negative    positive
precision: [0.61939616 0.80679101]
recall:    [0.62627197 0.80218735]
f1 score:  [0.62281509 0.80448259]
--------------------------------------------------
              negative    positive
precision: [0.62923351 0.81853469]
recall:    [0.6537037  0.80218735]
f1 score:  [0.64123524 0.81027858]
--------------------------------------------------
              negative    positive
precision: [0.62769784 0.81554804]
recall:    [0.6462963  0.80313837]
f1 score:  [0.63686131 0.80929564]
--------------------------------------------------
accuracy: 74.73% (+/- 0.37%)
precision: 71.86

In [10]:
from imblearn.under_sampling import NearMiss, RandomUnderSampler
# Under-sampling variant: the random under-sampler sits between the
# vectorizer and the classifier.
RUS_pipeline = make_pipeline(
    tvec,
    RandomUnderSampler(random_state=777),
    lr,
)

In [11]:
# 5-fold evaluation of the random under-sampling pipeline (macro-averaged metrics).
lr_cv(splits=5, X=X, Y=y, pipeline=RUS_pipeline, average_method='macro')

              negative    positive
precision: [0.62092494 0.85951743]
recall:    [0.75763182 0.76224441]
f1 score:  [0.6825     0.80796371]
--------------------------------------------------
              negative    positive
precision: [0.60353721 0.85659551]
recall:    [0.75763182 0.74417499]
f1 score:  [0.67186218 0.79643766]
--------------------------------------------------
              negative    positive
precision: [0.59622367 0.85611511]
recall:    [0.75948196 0.73561579]
f1 score:  [0.66802278 0.79130435]
--------------------------------------------------
              negative    positive
precision: [0.58857143 0.85642176]
recall:    [0.76296296 0.72610556]
f1 score:  [0.66451613 0.7858981 ]
--------------------------------------------------
              negative    positive
precision: [0.59037037 0.84560829]
recall:    [0.73796296 0.73704232]
f1 score:  [0.65596708 0.78760163]
--------------------------------------------------
accuracy: 74.58% (+/- 0.85%)
precision: 72.74