In [1]:
from text_clearance import clean_and_fix, fix, clean_text

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from imblearn.pipeline import Pipeline as pipeline_imb
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import numpy as np
from some_funcs import checking
from sklearn.metrics import classification_report

In [2]:
import pandas as pd

# load data
PATH_TO_DATA = './simplesentiment/'
files = ['products_sentiment_train.tsv', 'products_sentiment_test.tsv']
train =  pd.read_csv(PATH_TO_DATA+files[0], sep = '\t', header = None, names = ['text', 'target'])
# train, taget, test
data_samples = train.text.values
y = train.target
test = pd.read_csv(PATH_TO_DATA+files[1], sep = '\t')
test_samples = test.text.values

In [3]:
data_samples_cleared = clean_and_fix(data_samples)

In [4]:
# split data to train/validation sets
r_seed = 22
X_train, X_val, y_train, y_val = train_test_split(
    data_samples_cleared, y, train_size=0.8, random_state=r_seed)

In [5]:
# feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from nltk.tokenize import word_tokenize

# data transformer - vectorizer
bow_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    min_df=1,
    tokenizer=word_tokenize,
    analyzer='word',
)

tf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=1,
    tokenizer=word_tokenize,
    analyzer='word',
)

char_bow_vectorizer = CountVectorizer(
    ngram_range=(3, 5),
    min_df=1,
    tokenizer=word_tokenize,
    analyzer='char',
)

char_tf_vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    min_df=1,
    tokenizer=word_tokenize,
    analyzer='char',
)

In [6]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import VotingClassifier

import numpy as np
from some_funcs import checking
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier

In [7]:
nb = ComplementNB(alpha=0.4)
logit = LogisticRegression(penalty='l2', C=1.0)
svm = SVC(C=1.0, kernel='rbf', probability=True)
dtree = ExtraTreesClassifier(random_state=0, n_jobs=-1)

# BOW - word

### Naive Bayes

In [8]:
pipe_bow = pipeline_imb(steps=[
    ('bow', bow_vectorizer), 
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', nb)
])
checking(pipe_bow,
         data_samples_cleared,
         y,
         10,
         n_jobs=-1,
         cv=5,
         data_model_name="BOW")

ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with BOW model F1 score:
([0.835, 0.8469, 0.8516, 0.8311, 0.8411], 0.8411, 0.0075)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with BOW model ACCURACY score:
([0.7875, 0.7975, 0.8075, 0.7825, 0.795], 0.794, 0.0086)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with BOW model ROC_AUC score:
([0.8535, 0.8696, 0.8648, 0.8516, 0.8553], 0.859, 0.007)



In [9]:
model = pipe_bow
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.71      0.63      0.67       141
           1       0.81      0.86      0.83       259

    accuracy                           0.78       400
   macro avg       0.76      0.74      0.75       400
weighted avg       0.77      0.78      0.77       400



### Logistic Regression

In [10]:
pipe_bow = pipeline_imb(
    steps=[('bow', bow_vectorizer), 
           ("balance", RandomOverSampler(random_state=0)), 
           ('cls', logit)])
checking(pipe_bow,
         data_samples_cleared,
         y,
         10,
         n_jobs=-1,
         cv=5,
         data_model_name="BOW")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with BOW model F1 score:
([0.8371, 0.8439, 0.8484, 0.8391, 0.8473], 0.8432, 0.0044)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with BOW model ACCURACY score:
([0.785, 0.79, 0.8025, 0.79, 0.8], 0.7935, 0.0066)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto

In [11]:
model = pipe_bow
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.71      0.63      0.67       141
           1       0.81      0.86      0.84       259

    accuracy                           0.78       400
   macro avg       0.76      0.75      0.75       400
weighted avg       0.78      0.78      0.78       400



### Support Vector Machine

In [12]:
pipe_bow = pipeline_imb(steps=[
    ('bow', bow_vectorizer),
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', svm)])
checking(pipe_bow, data_samples_cleared, y, 10, n_jobs=-1, cv=5, data_model_name="BOW")

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with BOW model F1 score:
([0.8276, 0.8278, 0.8348, 0.8253, 0.8423], 0.8316, 0.0062)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with BOW model ACCURACY score:
([0.7625, 0.765, 0.7725, 0.765, 0.7875], 0.7705, 0.0091)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with BOW model ROC_AUC score:
([0.8199, 0.8338, 0.8422, 0.8428, 0.8528], 0.8383, 

In [13]:
model = pipe_bow
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.74      0.60      0.66       141
           1       0.80      0.89      0.84       259

    accuracy                           0.79       400
   macro avg       0.77      0.74      0.75       400
weighted avg       0.78      0.79      0.78       400



# TFIDF

### Naive Bayes

In [14]:
pipe_tf = pipeline_imb(
    steps=[('tf', tf_vectorizer),
           ("balance", RandomOverSampler(random_state=0)),
           ('cls', nb)])
checking(pipe_tf,
         data_samples_cleared,
         y,
         10,
         n_jobs=-1,
         cv=5,
         data_model_name="TFIDF")

ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with TFIDF model F1 score:
([0.8226, 0.8465, 0.8406, 0.832, 0.8381], 0.8359, 0.0081)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with TFIDF model ACCURACY score:
([0.78, 0.805, 0.8, 0.79, 0.8], 0.795, 0.0089)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with TFIDF model ROC_AUC score:
([0.8674, 0.8807, 0.8785, 0.8698, 0.8891], 0.8771, 0.0078)



In [15]:
model = pipe_tf
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.70      0.70      0.70       141
           1       0.84      0.84      0.84       259

    accuracy                           0.79       400
   macro avg       0.77      0.77      0.77       400
weighted avg       0.79      0.79      0.79       400



### Logistic Regression

In [16]:
pipe_tf = pipeline_imb(steps=[('tf', tf_vectorizer),
                          ("balance", RandomOverSampler(random_state=0)),
                          ('cls', logit)])
checking(pipe_tf, data_samples_cleared, y, 10,
         n_jobs=-1, cv=5, data_model_name="TFIDF")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with TFIDF model F1 score:
([0.8265, 0.8362, 0.8388, 0.8247, 0.8493], 0.8351, 0.0089)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with TFIDF model ACCURACY score:
([0.7775, 0.7825, 0.7925, 0.7725, 0.8075], 0.7865, 0.0124)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi

In [17]:
model = pipe_tf
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.68      0.64      0.66       141
           1       0.81      0.84      0.82       259

    accuracy                           0.77       400
   macro avg       0.75      0.74      0.74       400
weighted avg       0.76      0.77      0.77       400



### Support Vector Machine

In [18]:
pipe_tf = pipeline_imb(steps=[('tf', tf_vectorizer), 
                              ("balance", RandomOverSampler(random_state=0)),
                              ('cls', svm)])
checking(pipe_tf, data_samples_cleared, y, 10,
         n_jobs=-1, cv=5, data_model_name="TFIDF")

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with TFIDF model F1 score:
([0.8395, 0.8291, 0.8581, 0.8436, 0.8468], 0.8434, 0.0095)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with TFIDF model ACCURACY score:
([0.7725, 0.7475, 0.7975, 0.7775, 0.7775], 0.7745, 0.016)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with TFIDF model ROC_AUC score:
([0.8619, 0.8649, 0.8769, 0.8643, 0.888], 0.

In [19]:
model = pipe_tf
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.81      0.46      0.59       141
           1       0.76      0.94      0.84       259

    accuracy                           0.77       400
   macro avg       0.79      0.70      0.72       400
weighted avg       0.78      0.77      0.75       400



# CHAR BOW

### Naive Bayes

In [20]:
pipe_bow = pipeline_imb(steps=[('bow', char_bow_vectorizer),
                               ("balance", RandomOverSampler(random_state=0)), 
                               ('cls', nb)])
checking(pipe_bow, data_samples_cleared, y, 10,
         n_jobs=-1, cv=5, data_model_name="CHAR BOW")

ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with CHAR BOW model F1 score:
([0.8442, 0.855, 0.844, 0.8333, 0.8337], 0.842, 0.008)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with CHAR BOW model ACCURACY score:
([0.8025, 0.81, 0.805, 0.78, 0.7825], 0.796, 0.0123)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with CHAR BOW model ROC_AUC score:
([0.8374, 0.8456, 0.8524, 0.8363, 0.8071], 0.8358, 0.0155)



In [21]:
model = pipe_bow
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.70      0.67      0.68       141
           1       0.82      0.85      0.83       259

    accuracy                           0.78       400
   macro avg       0.76      0.76      0.76       400
weighted avg       0.78      0.78      0.78       400



### Logistic Regression

In [22]:
pipe_bow = pipeline_imb(steps=[('bow', char_bow_vectorizer),
                               ("balance", RandomOverSampler(random_state=0)),
                               ('cls', logit)])
checking(pipe_bow, data_samples_cleared, y, 10,
         n_jobs=-1, cv=5, data_model_name="CHAR BOW")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with CHAR BOW model F1 score:
([0.8275, 0.8371, 0.8008, 0.8454, 0.8197], 0.8261, 0.0153)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with CHAR BOW model ACCURACY score:
([0.78, 0.785, 0.7475, 0.8025, 0.7625], 0.7755, 0.0189)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   mu

In [23]:
model = pipe_bow
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.65      0.62      0.63       141
           1       0.80      0.82      0.81       259

    accuracy                           0.75       400
   macro avg       0.72      0.72      0.72       400
weighted avg       0.74      0.75      0.75       400



### Support Vector Machine

In [24]:
pipe_bow = pipeline_imb(steps=[('bow', char_bow_vectorizer),
                               ("balance", RandomOverSampler(random_state=0)), 
                               ('cls', svm)])
checking(pipe_bow, data_samples_cleared, y, 10,
         n_jobs=-1, cv=5, data_model_name="CHAR BOW")

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with CHAR BOW model F1 score:
([0.8308, 0.8355, 0.8173, 0.8453, 0.8343], 0.8326, 0.0091)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with CHAR BOW model ACCURACY score:
([0.775, 0.7775, 0.7675, 0.795, 0.7825], 0.7795, 0.0091)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with CHAR BOW model ROC_AUC score:
([0.8449, 0.8463, 0.8508, 0.8567, 0.

In [25]:
model = pipe_bow
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.71      0.60      0.65       141
           1       0.80      0.87      0.83       259

    accuracy                           0.77       400
   macro avg       0.75      0.73      0.74       400
weighted avg       0.77      0.77      0.77       400



# CHAR TFIDF

### Naive Bayes

In [26]:
pipe_tf = pipeline_imb(steps=[('bow', char_tf_vectorizer),
                              ("balance", RandomOverSampler(random_state=0)),
                              ('cls', nb)])
checking(pipe_tf, data_samples_cleared, y, 10,
         n_jobs=-1, cv=5, data_model_name="CHAR TFIDF")

ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with CHAR TFIDF model F1 score:
([0.8147, 0.8521, 0.8358, 0.8154, 0.8233], 0.8282, 0.0141)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with CHAR TFIDF model ACCURACY score:
([0.7725, 0.8125, 0.8025, 0.7725, 0.78], 0.788, 0.0165)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with CHAR TFIDF model ROC_AUC score:
([0.8635, 0.8734, 0.8685, 0.8708, 0.8573], 0.8667, 0.0057)



In [27]:
model = pipe_tf
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.68      0.73      0.70       141
           1       0.85      0.81      0.83       259

    accuracy                           0.78       400
   macro avg       0.76      0.77      0.77       400
weighted avg       0.79      0.78      0.78       400



### Logistic Regression

In [28]:
pipe_tf = pipeline_imb(steps=[('bow', char_tf_vectorizer),
                              ("balance", RandomOverSampler(random_state=0)),
                              ('cls', logit)])
checking(pipe_tf, data_samples_cleared, y, 10,
         n_jobs=-1, cv=5, data_model_name="CHAR TFIDF")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with CHAR TFIDF model F1 score:
([0.835, 0.8415, 0.8214, 0.8323, 0.8396], 0.834, 0.0071)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with CHAR TFIDF model ACCURACY score:
([0.7925, 0.7975, 0.7825, 0.7875, 0.7975], 0.7915, 0.0058)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                

In [29]:
model = pipe_tf
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.67      0.68      0.68       141
           1       0.82      0.82      0.82       259

    accuracy                           0.77       400
   macro avg       0.75      0.75      0.75       400
weighted avg       0.77      0.77      0.77       400



### Support Vector Machine

In [30]:
pipe_tf = pipeline_imb(steps=[('bow', char_tf_vectorizer),
                              ("balance", RandomOverSampler(random_state=0)),
                              ('cls', svm)])
checking(pipe_tf, data_samples_cleared, y, 10,
         n_jobs=-1, cv=5, data_model_name="CHAR TFIDF")

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with CHAR TFIDF model F1 score:
([0.8457, 0.853, 0.8457, 0.8494, 0.8551], 0.8498, 0.0038)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with CHAR TFIDF model ACCURACY score:
([0.7875, 0.7975, 0.7975, 0.7925, 0.8], 0.795, 0.0045)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with CHAR TFIDF model ROC_AUC score:
([0.8715, 0.8646, 0.87, 0.8743, 0

In [31]:
model = pipe_tf
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.75      0.54      0.63       141
           1       0.78      0.90      0.84       259

    accuracy                           0.77       400
   macro avg       0.76      0.72      0.73       400
weighted avg       0.77      0.77      0.76       400



# TF-IDF and BOW

### Naive Bayes

In [32]:
pipe_union = pipeline_imb(steps=[('union',
                                  FeatureUnion(transformer_list=[
                                      ('bow', bow_vectorizer),
                                      ('tfidf', tf_vectorizer)],
                                      n_jobs=-1)),
                                 ('cls', nb)])
checking(pipe_union, data_samples_cleared, y, data_model_name="TFIDF and BOW")

ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with TFIDF and BOW model F1 score:
([0.8462, 0.852, 0.8556, 0.8382, 0.8509], 0.8486, 0.006)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with TFIDF and BOW model ACCURACY score:
([0.79, 0.7925, 0.8025, 0.78, 0.795], 0.792, 0.0073)
ComplementNB(alpha=0.4, class_prior=None, fit_prior=True, norm=False) with TFIDF and BOW model ROC_AUC score:
([0.8549, 0.8725, 0.8605, 0.8445, 0.849], 0.8563, 0.0097)



In [33]:
model = pipe_union
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.80      0.59      0.68       141
           1       0.80      0.92      0.86       259

    accuracy                           0.80       400
   macro avg       0.80      0.75      0.77       400
weighted avg       0.80      0.80      0.79       400



### Logistic Regression

In [34]:
pipe_union = pipeline_imb(steps=[('union',
                                  FeatureUnion(transformer_list=[
                                      ('bow', bow_vectorizer),
                                      ('tfidf', tf_vectorizer)],
                                      n_jobs=-1)),
                                 ('cls', logit)])
checking(pipe_union, data_samples_cleared, y, data_model_name="TFIDF and BOW")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with TFIDF and BOW model F1 score:
([0.8333, 0.8425, 0.8444, 0.8417, 0.845], 0.8414, 0.0042)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) with TFIDF and BOW model ACCURACY score:
([0.775, 0.785, 0.79, 0.7875, 0.79], 0.7855, 0.0056)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
               

In [35]:
model = pipe_union
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.74      0.61      0.67       141
           1       0.81      0.88      0.84       259

    accuracy                           0.79       400
   macro avg       0.77      0.75      0.75       400
weighted avg       0.78      0.79      0.78       400



### Support Vector Machine

In [36]:
pipe_union = pipeline_imb(steps=[('union',
                                  FeatureUnion(transformer_list=[
                                      ('bow', bow_vectorizer),
                                      ('tfidf', tf_vectorizer)],
                                      n_jobs=-1)),
                                 ('cls', svm)])
checking(pipe_union, data_samples_cleared, y, data_model_name="TFIDF and BOW")

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with TFIDF and BOW model F1 score:
([0.8146, 0.8083, 0.8196, 0.8227, 0.8458], 0.8222, 0.0128)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with TFIDF and BOW model ACCURACY score:
([0.7325, 0.7225, 0.7325, 0.7425, 0.7775], 0.7415, 0.0191)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False) with TFIDF and BOW model ROC_AUC score:
([0.7985, 0.8145, 0

In [37]:
model = pipe_union
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.70      0.35      0.47       141
           1       0.72      0.92      0.81       259

    accuracy                           0.72       400
   macro avg       0.71      0.64      0.64       400
weighted avg       0.72      0.72      0.69       400



# Ансамбль моделей

In [51]:
pipe_bow_nb = pipeline_imb(steps=[
    ('bow', bow_vectorizer),
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', nb)])

pipe_bow_logit = pipeline_imb(steps=[
    ('bow', bow_vectorizer),
    ("balance", RandomOverSampler(random_state=0)),
    ('cls', logit)])

pipe_bow_svm = pipeline_imb(steps=[
    ('bow', bow_vectorizer),
    ("balance", RandomOverSampler(random_state=0)),
    ('cls', svm)])

pipe_tf_nb = pipeline_imb(steps=[
    ('tf', tf_vectorizer),
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', nb)])

pipe_tf_svm = pipeline_imb(steps=[
    ('tf', tf_vectorizer),
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', svm)])

pipe_tf_logit = pipeline_imb(steps=[
    ('tf', tf_vectorizer),
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', logit)])

pipe_tf_nb_char = pipeline_imb(steps=[
    ('tf', char_tf_vectorizer),
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', nb)])

pipe_tf_logit_char = pipeline_imb(steps=[
    ('tf', char_tf_vectorizer),
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', logit)])

pipe_tf_svm_char = pipeline_imb(steps=[
    ('tf', char_tf_vectorizer),
    ("balance", RandomOverSampler(random_state=0)), 
    ('cls', svm)])

pipe_union_nb = Pipeline(steps=[('union',
                              FeatureUnion(transformer_list=[
                                  ('bow', bow_vectorizer),
                                  ('tfidf', tf_vectorizer)],
                                  n_jobs=-1)),
                             ('cls', nb)])

pipe_union_logit = Pipeline(steps=[('union',
                              FeatureUnion(transformer_list=[
                                  ('bow', bow_vectorizer),
                                  ('tfidf', tf_vectorizer)],
                                  n_jobs=-1)),
                             ('cls', logit)])


In [52]:
pipe_vote = VotingClassifier(estimators=[
#     ('bow_nb', pipe_bow_nb),
#     ('bow_svm', pipe_bow_svm),
#     ('bow_logit', pipe_bow_logit),
    
    ('tfidf_nb', pipe_tf_nb),
#     ('tfidf_logit', pipe_tf_logit),
#     ('tfidf_svm', pipe_tf_svm),

    ('pipe_tf_nb_char', pipe_tf_nb_char),
#     ('pipe_tf_logit_char', pipe_tf_logit_char),
    
#     ('pipe_tf_svm_char', pipe_tf_svm_char),
    ('union_nb', pipe_union_nb),
#     ('union_logit', pipe_union_logit),
],
    voting='soft',
    n_jobs=-1)
checking(pipe_vote, data_samples_cleared, y, data_model_name="Voting", r_seed = 22)

tfidf_nb+pipe_tf_nb_char+union_nb with Voting model F1 score:
([0.8539, 0.8473, 0.8496, 0.8368, 0.8449], 0.8465, 0.0057)
tfidf_nb+pipe_tf_nb_char+union_nb with Voting model ACCURACY score:
([0.805, 0.79, 0.8, 0.7825, 0.7925], 0.794, 0.0078)
tfidf_nb+pipe_tf_nb_char+union_nb with Voting model ROC_AUC score:
([0.8763, 0.8865, 0.8861, 0.8788, 0.8772], 0.881, 0.0044)



In [53]:
model = pipe_vote
model.fit(X_train, y_train)
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.78      0.64      0.70       141
           1       0.82      0.90      0.86       259

    accuracy                           0.81       400
   macro avg       0.80      0.77      0.78       400
weighted avg       0.80      0.81      0.80       400

