In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from imblearn.pipeline import Pipeline as pipeline_imb
from imblearn.over_sampling import RandomOverSampler

In [2]:
import pandas as pd

# Load the train and test TSV files that live under PATH_TO_DATA.
PATH_TO_DATA = './simplesentiment/'
files = ['products_sentiment_train.tsv', 'products_sentiment_test.tsv']

# The train file is read without a header row, so the two columns are
# named explicitly here.
train_path = PATH_TO_DATA + files[0]
train = pd.read_csv(
    train_path,
    sep='\t',
    header=None,
    names=['text', 'target'],
)

# Train texts and train target.
data_samples = train.text.values
y = train.target

# The test file is read with pandas' default header handling; only its
# `text` column is used here.
test_path = PATH_TO_DATA + files[1]
test = pd.read_csv(test_path, sep='\t')
test_samples = test.text.values

In [3]:
from sklearn.model_selection import train_test_split

# split data
def split_data(X, y, rs, test_size=0.33):
    """Split samples and labels into shuffled, stratified train/held-out parts.

    Parameters
    ----------
    X : samples, forwarded to ``train_test_split`` unchanged.
    y : labels; also used as the stratification target.
    rs : seed for the split; converted with ``int()`` before use.
    test_size : forwarded to ``train_test_split``. Defaults to 0.33, the
        value that used to be hard-coded, so existing calls behave the same.

    Returns
    -------
    Whatever ``train_test_split`` returns for two input arrays:
    ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=int(rs),
        shuffle=True,
        stratify=y,
    )

In [4]:
# Fixed-seed hold-out split: x / yx are the training texts and labels,
# v / yv are the held-out texts and labels.
x, v, yx, yv = split_data(data_samples, y, rs=22)

In [5]:
# feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from nltk.tokenize import word_tokenize

# Data transformers. The count and TF-IDF vectorizers are configured
# identically: word-level 1- to 3-grams, no document-frequency cut-off,
# tokenised with NLTK's word_tokenize.
_shared_vectorizer_params = dict(
    ngram_range=(1, 3),
    min_df=1,
    tokenizer=word_tokenize,
    analyzer='word',
)

count_vectorizer = CountVectorizer(**_shared_vectorizer_params)
tf_vectorizer = TfidfVectorizer(**_shared_vectorizer_params)

In [6]:
import numpy as np
from some_funcs import checking
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [7]:
from scipy.sparse import hstack

In [39]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import VotingClassifier

In [9]:
# Base classifiers shared by every experiment below.
nb = ComplementNB(alpha=0.5)
logit = LogisticRegression(penalty='l2', C=1.0)
# probability=True makes SVC estimate class probabilities with an internal,
# randomly shuffled cross-validation. Without a fixed random_state those
# probabilities (and every roc_auc / soft-voting result that uses them)
# change from run to run, so the seed is pinned. 22 matches the seed used
# for the hold-out split.
svm = SVC(C=1.0, kernel='rbf', probability=True, random_state=22)

# BOW

In [10]:
features_name = "BOW"

# Fit each vectorizer once on the training texts and reuse the fitted
# vocabulary for the held-out texts. (Previously the vectorizer was fitted
# on `x` a second time just to transform `v`.)
data_x = [vect.fit_transform(x) for vect in [count_vectorizer]]
data_v = [vect.transform(v) for vect in [count_vectorizer]]

# Horizontally stack the per-vectorizer matrices (a single one here).
union_x = hstack(data_x)
union_v = hstack(data_v)

for cls_name, cls in zip(['Naive Bayes', 'Logistic Regression', 'SVC'],
                          [nb, logit, svm]):
    cls.fit(union_x, yx)
    pred = cls.predict(union_v)
    # Second column of predict_proba, i.e. the class at cls.classes_[1].
    prob = cls.predict_proba(union_v)[:, 1]
    # Use the built-in round(): depending on the scikit-learn version the
    # metric may be a plain Python float, which has no `.round` method,
    # while round() works for both Python and NumPy floats.
    print(f"{cls_name} with {features_name} f1: {round(f1_score(yv, pred), 4)}")
    print(f"{cls_name} with {features_name} accuracy: {round(accuracy_score(yv, pred), 4)}")
    print(f"{cls_name} with {features_name} roc_auc: {round(roc_auc_score(yv, prob), 4)}")
    print()



Naive Bayes with BOW f1: 0.8436
Naive Bayes with BOW accuracy: 0.7848
Naive Bayes with BOW roc_auc: 0.8386

Logistic Regression with BOW f1: 0.8246
Logistic Regression with BOW accuracy: 0.7576
Logistic Regression with BOW roc_auc: 0.8301

SVC with BOW f1: 0.8059
SVC with BOW accuracy: 0.6985
SVC with BOW roc_auc: 0.8217



### Naive Bayes

In [28]:
# Bag-of-words + Naive Bayes pipeline, checked on the full labelled set
# once per metric.
bow_nb_steps = [('bow', count_vectorizer), ('cls', nb)]
pipe_bow = Pipeline(steps=bow_nb_steps)
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe_bow, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8462, 0.8561, 0.8609, 0.8351, 0.8259]), 0.8448, 0.013)
done in 1.173s.

Cross-validation ...
accuracy
(array([0.79  , 0.7975, 0.815 , 0.77  , 0.765 ]), 0.7875, 0.0183)
done in 1.117s.

Cross-validation ...
roc_auc
(array([0.8571, 0.8729, 0.8626, 0.8317, 0.8338]), 0.8516, 0.0162)
done in 1.229s.



### Logistic Regression

In [29]:
# Bag-of-words + Logistic Regression pipeline, checked on the full labelled
# set once per metric.
bow_logit_steps = [('bow', count_vectorizer), ('cls', logit)]
pipe_bow = Pipeline(steps=bow_logit_steps)
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe_bow, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8309, 0.852 , 0.8466, 0.8388, 0.8364]), 0.841, 0.0075)
done in 2.451s.

Cross-validation ...
accuracy
(array([0.77  , 0.7925, 0.7925, 0.78  , 0.78  ]), 0.783, 0.0086)
done in 2.634s.

Cross-validation ...
roc_auc
(array([0.8482, 0.8359, 0.8556, 0.8518, 0.8483]), 0.848, 0.0066)
done in 2.441s.



### Support Vector Machine

In [30]:
# Bag-of-words + SVC pipeline, checked on the full labelled set once per
# metric.
bow_svm_steps = [('bow', count_vectorizer), ('cls', svm)]
pipe_bow = Pipeline(steps=bow_svm_steps)
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe_bow, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8126, 0.7947, 0.8085, 0.8066, 0.818 ]), 0.8081, 0.0078)
done in 8.434s.

Cross-validation ...
accuracy
(array([0.7175, 0.69  , 0.7075, 0.705 , 0.7275]), 0.7095, 0.0126)
done in 8.700s.

Cross-validation ...
roc_auc
(array([0.8   , 0.8134, 0.8326, 0.8217, 0.818 ]), 0.8171, 0.0107)
done in 8.638s.



For the ensemble

In [15]:
# Fresh Naive Bayes model fitted on whatever union_x currently holds; its
# held-out probabilities (second predict_proba column) are kept in prob_bow.
cls = ComplementNB(alpha=0.5)
cls.fit(union_x, yx)
held_out_proba = cls.predict_proba(union_v)
prob_bow = held_out_proba[:, 1]

# TFIDF

In [16]:
features_name = "TFIDF"

# Fit each vectorizer once on the training texts and reuse the fitted
# vocabulary / IDF weights for the held-out texts. (Previously the
# vectorizer was fitted on `x` a second time just to transform `v`.)
data_x = [vect.fit_transform(x) for vect in [tf_vectorizer]]
data_v = [vect.transform(v) for vect in [tf_vectorizer]]

# Horizontally stack the per-vectorizer matrices (a single one here).
union_x = hstack(data_x)
union_v = hstack(data_v)

for cls_name, cls in zip(['Naive Bayes', 'Logistic Regression', 'SVC'],
                          [nb, logit, svm]):
    cls.fit(union_x, yx)
    pred = cls.predict(union_v)
    # Second column of predict_proba, i.e. the class at cls.classes_[1].
    prob = cls.predict_proba(union_v)[:, 1]
    # Use the built-in round(): depending on the scikit-learn version the
    # metric may be a plain Python float, which has no `.round` method,
    # while round() works for both Python and NumPy floats.
    print(f"{cls_name} with {features_name} f1: {round(f1_score(yv, pred), 4)}")
    print(f"{cls_name} with {features_name} accuracy: {round(accuracy_score(yv, pred), 4)}")
    print(f"{cls_name} with {features_name} roc_auc: {round(roc_auc_score(yv, prob), 4)}")
    print()



Naive Bayes with TFIDF f1: 0.8221
Naive Bayes with TFIDF accuracy: 0.7364
Naive Bayes with TFIDF roc_auc: 0.8524

Logistic Regression with TFIDF f1: 0.8058
Logistic Regression with TFIDF accuracy: 0.697
Logistic Regression with TFIDF roc_auc: 0.8588

SVC with TFIDF f1: 0.7985
SVC with TFIDF accuracy: 0.6803
SVC with TFIDF roc_auc: 0.8608



## Cross validation

### Naive Bayes

In [23]:
# TF-IDF + Naive Bayes pipeline, checked on the full labelled set once per
# metric.
tf_nb_steps = [('tf', tf_vectorizer), ('cls', nb)]
pipe_tf = Pipeline(steps=tf_nb_steps)
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe_tf, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8303, 0.8204, 0.8342, 0.8202, 0.8325]), 0.8275, 0.006)
done in 3.327s.

Cross-validation ...
accuracy
(array([0.7475, 0.7275, 0.7575, 0.7325, 0.7525]), 0.7435, 0.0116)
done in 2.434s.

Cross-validation ...
roc_auc
(array([0.8668, 0.8786, 0.8571, 0.8558, 0.8578]), 0.8632, 0.0086)
done in 1.178s.



### Logistic Regression

In [24]:
# TF-IDF + Logistic Regression pipeline, checked on the full labelled set
# once per metric.
tf_logit_steps = [('tf', tf_vectorizer), ('cls', logit)]
pipe_tf = Pipeline(steps=tf_logit_steps)
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe_tf, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8104, 0.8071, 0.821 , 0.8071, 0.8176]), 0.8126, 0.0057)
done in 1.891s.

Cross-validation ...
accuracy
(array([0.7075, 0.7   , 0.7275, 0.7025, 0.72  ]), 0.7115, 0.0106)
done in 1.805s.

Cross-validation ...
roc_auc
(array([0.8533, 0.8559, 0.8741, 0.8571, 0.8678]), 0.8616, 0.0079)
done in 1.847s.



### Support Vector Machine

In [25]:
# TF-IDF + SVC pipeline, checked on the full labelled set once per metric.
tf_svm_steps = [('tf', tf_vectorizer), ('cls', svm)]
pipe_tf = Pipeline(steps=tf_svm_steps)
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe_tf, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8096, 0.7981, 0.8096, 0.8019, 0.8076]), 0.8054, 0.0046)
done in 8.900s.

Cross-validation ...
accuracy
(array([0.7025, 0.6825, 0.7025, 0.69  , 0.6975]), 0.695, 0.0077)
done in 9.240s.

Cross-validation ...
roc_auc
(array([0.8592, 0.8607, 0.8799, 0.8592, 0.8645]), 0.8647, 0.0079)
done in 9.079s.



For the ensemble

In [27]:
# Refit the shared SVC on whatever union_x currently holds; its held-out
# probabilities (second predict_proba column) are kept in prob_tf.
svm.fit(union_x, yx)
cls = svm  # fit() returns the estimator itself, so cls is the same object
held_out_proba = cls.predict_proba(union_v)
prob_tf = held_out_proba[:, 1]

# TF-IDF and BOW

In [31]:
# Build the combined BOW + TF-IDF feature matrices. Each vectorizer is
# fitted once on the training texts and then reused for the held-out texts.
# (Previously every vectorizer was fitted on `x` twice.)
data_x = [vect.fit_transform(x) for vect in [count_vectorizer, tf_vectorizer]]
data_v = [vect.transform(v) for vect in [count_vectorizer, tf_vectorizer]]
# Column-wise concatenation: count features first, then TF-IDF features.
union_x = hstack(data_x)
union_v = hstack(data_v)

## Cross validation

### Naive Bayes

In [33]:
# Count and TF-IDF features side by side, fed to Naive Bayes; checked on the
# full labelled set once per metric.
feature_union = FeatureUnion(
    transformer_list=[('bow', count_vectorizer), ('tfidf', tf_vectorizer)],
    n_jobs=-1,
)
pipe = Pipeline(steps=[('union', feature_union), ('cls', nb)])
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8451, 0.8434, 0.8582, 0.8345, 0.8303]), 0.8423, 0.0096)
done in 4.856s.

Cross-validation ...
accuracy
(array([0.78  , 0.7725, 0.8025, 0.765 , 0.765 ]), 0.777, 0.0139)
done in 4.309s.

Cross-validation ...
roc_auc
(array([0.8542, 0.867 , 0.8569, 0.827 , 0.8316]), 0.8474, 0.0154)
done in 2.655s.



### Logistic Regression

In [34]:
# Count and TF-IDF features side by side, fed to Logistic Regression;
# checked on the full labelled set once per metric.
feature_union = FeatureUnion(
    transformer_list=[('bow', count_vectorizer), ('tfidf', tf_vectorizer)],
    n_jobs=-1,
)
pipe = Pipeline(steps=[('union', feature_union), ('cls', logit)])
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8339, 0.852 , 0.8466, 0.8388, 0.8364]), 0.8416, 0.0067)
done in 6.610s.

Cross-validation ...
accuracy
(array([0.775 , 0.7925, 0.7925, 0.78  , 0.78  ]), 0.784, 0.0072)
done in 6.470s.

Cross-validation ...
roc_auc
(array([0.8488, 0.8366, 0.8565, 0.8527, 0.849 ]), 0.8487, 0.0067)
done in 6.700s.



### Support Vector Machine

In [35]:
# Count and TF-IDF features side by side, fed to the SVC; checked on the
# full labelled set once per metric.
feature_union = FeatureUnion(
    transformer_list=[('bow', count_vectorizer), ('tfidf', tf_vectorizer)],
    n_jobs=-1,
)
pipe = Pipeline(steps=[('union', feature_union), ('cls', svm)])
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8126, 0.7947, 0.8072, 0.8066, 0.818 ]), 0.8078, 0.0078)
done in 19.398s.

Cross-validation ...
accuracy
(array([0.7175, 0.69  , 0.705 , 0.705 , 0.7275]), 0.709, 0.0127)
done in 19.571s.

Cross-validation ...
roc_auc
(array([0.8021, 0.8144, 0.8335, 0.8229, 0.8192]), 0.8184, 0.0103)
done in 18.244s.



# Ensemble

In [42]:
# The three member pipelines of the ensemble:
#   pipe_bow - count features -> Naive Bayes
#   pipe_tf  - TF-IDF features -> SVC
#   pipe     - count + TF-IDF features -> Logistic Regression
pipe_bow = Pipeline(steps=[('bow', count_vectorizer), ('cls', nb)])
pipe_tf = Pipeline(steps=[('tf', tf_vectorizer), ('cls', svm)])

feature_union = FeatureUnion(
    transformer_list=[('bow', count_vectorizer), ('tfidf', tf_vectorizer)],
    n_jobs=-1,
)
pipe = Pipeline(steps=[('union', feature_union), ('cls', logit)])

In [43]:
# Soft-voting ensemble over the three pipelines, checked on the full
# labelled set once per metric.
ensemble_members = [('bow', pipe_bow), ('tfidf', pipe_tf), ('union', pipe)]
pipe_vote = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    n_jobs=-1,
)
for metric in ('f1', 'accuracy', 'roc_auc'):
    checking(pipe_vote, data_samples, y, 10, n_jobs=-1, scoring=metric, cv=5)

Cross-validation ...
f1
(array([0.8626, 0.8577, 0.8593, 0.8415, 0.8305]), 0.8503, 0.0123)
done in 15.971s.

Cross-validation ...
accuracy
(array([0.8175, 0.805 , 0.8125, 0.7825, 0.7775]), 0.799, 0.0161)
done in 16.271s.

Cross-validation ...
roc_auc
(array([0.8708, 0.8682, 0.8816, 0.8627, 0.8624]), 0.8691, 0.007)
done in 16.387s.

