In [69]:
# from os import chdir
# from google.colab import drive

# drive.mount("/content/drive", force_remount=True)
# chdir("/content/drive/MyDrive/Eliott/files/")

In [70]:
# IMPORTS

import numpy as np
import pandas as pd
import preprocessing as pp
import json


from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline


%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
file_name_train = 'Corona_NLP_train.csv'
file_name_test = 'Corona_NLP_test.csv'
X_train, y_train = pp.prepare_dataframe(file_name_train,lemmatising=False)
X_test, y_test = pp.prepare_dataframe(file_name_test,lemmatising=False)

On enlève : 
- Les URLS
- Hashtags
- Mentions
- Mots réservés
- Emojis et smileys

- 35525 mots sans lemmatisation
- 30794 avec lemmatisation

### TFIDF

On commence par vectoriser les données textuelles sous forme de tfidf. On se retrouve avec une sorte de one-hot vector pour chaque mot présent dans le corpus  total de tweets. La valeur pour chaque mot est le nombre de fois que le mot apparait dans le tweet divisé par le nombre de fois dans tous les tweets (importance locale / fréquence totale). 

In [72]:
vectorizer = TfidfVectorizer()
X_vect = vectorizer.fit(X_train)
X_vect = vectorizer.transform(X_train)
X_vect_test = vectorizer.transform(X_test)

X_vect.shape

(41157, 35525)

### SMOTE

On va augmenter syntétiquement les données pour équilibrer les classes et améliorer les performances globales. 

In [73]:
print(y_train.value_counts())
print(y_test.value_counts())

 1    18046
-1    15398
 0     7713
Name: Sentiment_Number, dtype: int64
-1    1633
 1    1546
 0     619
Name: Sentiment_Number, dtype: int64


In [74]:
oversample = SMOTE(random_state=42)
X_train_smote, y_train_smote = oversample.fit_resample(X_vect, y_train)

# Recherche d'Hyperparamètres

In [75]:
model_sgd = SGDClassifier()
model_gb = GradientBoostingClassifier()
model_rf = RandomForestClassifier()
model_lr = LogisticRegression(max_iter=1000)
model_per = Perceptron()
model_svc = LinearSVC(max_iter=10000)


In [76]:
models_and_params = {
    "SGD" : { "model" : model_sgd,
              "params" : {
                    'penalty': ['l2', 'l1', 'elasticnet'],
                    'alpha': np.linspace(1e-7, 1e-4, 20),  
                        }
    },
    "GB" : { "model" : model_gb,
              "params" : {
                    "learning_rate": [0.01, 0.1, 0.2],
                    "n_estimators":[10,50,100,200,400,800]
                        }
    },
    "RF" : { "model" : model_rf,
              "params" : {
                    'n_estimators': [10, 100],   
                    'max_depth': [10 ,150,300,600],
                    'min_samples_leaf': [1, 2, 3],   
                    'min_samples_split': [4, 8, 16, 32],
                    'max_features': ['log2', 'sqrt'],
                    'criterion': ['gini', 'entropy'],
                    'warm_start': [True, False] 
                        }
    },
    "LR" : { "model" : model_lr,
              "params" : {
                    'C': [100, 80, 40,20,10, 1.0],
                    'tol': np.linspace(1e-8,1e-4,15)
                        }
    },
    "PER" : { "model" : model_per,
              "params" : {
                    'penalty': ['l2', 'l1', 'elasticnet'],
                    'alpha': np.linspace(1e-8, 1e-4, 20),
                        }
    },
    "SVC" : { "model" : model_svc,
              "params" : {
                    'penalty': ['l2', 'l1', 'elasticnet'],
                    'loss': ['hinge', 'squared_hinge'],
                    'dual' : [False,True]
                        }
    }
    

}

In [78]:
def grid_search(model_name,X_search,Y_search):

    model = models_and_params[model_name]["model"]
    parameters = models_and_params[model_name]["params"]

    grid_clf = GridSearchCV(model, parameters,  scoring='accuracy', verbose=1 ,n_jobs=-1)
    
    grid_clf.fit(X_search, Y_search)

    print("Best Score: ", grid_clf.best_score_)
    print("Best Params: ", grid_clf.best_params_)

    return grid_clf


### Saving Hyperparameters in JSON

We will first load old weights and updates only if needed. Then we can use the model with best params in the end.

In [79]:
with open('data.json') as json_file:
    dico = json.load(json_file)

### SGD

In [10]:
%%time
grid_sgd = grid_search("SGD",X_vect,y_train) 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8786112322291502
Best Params:  {'alpha': 1.5873684210526315e-05, 'penalty': 'l1'}
CPU times: user 1.96 s, sys: 3.3 s, total: 5.26 s
Wall time: 27 s


In [11]:
%%time
grid_sgd_smote = grid_search("SGD",X_train_smote,y_train_smote)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8908717001978491
Best Params:  {'alpha': 1.0615789473684212e-05, 'penalty': 'l1'}
CPU times: user 2.49 s, sys: 3.18 s, total: 5.66 s
Wall time: 31.6 s


In [12]:
dico['SGD'] = grid_sgd_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Gradient boosting

In [82]:
%%time
grid_gb = grid_search("GB",X_vect[:1000],y_train[:1000]) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Score:  0.602
Best Params:  {'learning_rate': 0.2, 'n_estimators': 800}
CPU times: user 18.5 s, sys: 15.5 ms, total: 18.5 s
Wall time: 1min 27s


In [13]:
%%time
grid_gb = grid_search("GB",X_vect[:100],y_train[:100]) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Score:  0.8167019453701956
Best Params:  {'learning_rate': 0.2, 'n_estimators': 800}
CPU times: user 9min 13s, sys: 22.9 ms, total: 9min 13s
Wall time: 1h 35min 34s


In [83]:
GB_param = {
    "gb_clf__learning_rate": [0.01, 0.1, 0.2],
    "gb_clf__n_estimators":[10,50,100,200,400,800]
    }

pipeline_gradient_boosting = Pipeline([('tfidf', TfidfVectorizer()),
                     ('gb_clf', GradientBoostingClassifier()),
])


gb_clf = GridSearchCV(pipeline_gradient_boosting, GB_param, scoring='accuracy', verbose=1 ,n_jobs=-1)

gb_best = gb_clf.fit(X_train[:1000], y_train[:1000])    

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [84]:
print("Best Score: ", gb_clf.best_score_)
print("Best Params: ", gb_clf.best_params_)

Best Score:  0.5900000000000001
Best Params:  {'gb_clf__learning_rate': 0.2, 'gb_clf__n_estimators': 800}


In [14]:
%%time
grid_gb_smote = grid_search("GB",X_train_smote,y_train_smote)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Score:  0.824263698727705
Best Params:  {'learning_rate': 0.2, 'n_estimators': 800}
CPU times: user 11min 48s, sys: 230 ms, total: 11min 48s
Wall time: 1h 43min 57s


In [15]:
dico['GB'] = grid_gb_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Random Forest

In [16]:
%%time
grid_rf = grid_search("RF",X_vect,y_train) 

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
Best Score:  0.6936605598380768
Best Params:  {'criterion': 'gini', 'max_depth': 600, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 100, 'warm_start': True}
CPU times: user 19 s, sys: 1.2 s, total: 20.2 s
Wall time: 1h 7min 28s


In [17]:
%%time
grid_rf_smote = grid_search("RF",X_train_smote,y_train_smote)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
Best Score:  0.748994306773667
Best Params:  {'criterion': 'entropy', 'max_depth': 600, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 32, 'n_estimators': 100, 'warm_start': False}
CPU times: user 22.6 s, sys: 1.39 s, total: 24 s
Wall time: 1h 27min 53s


In [18]:
dico['RF'] = grid_rf_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)


### Logistic Regression

In [19]:
%%time
grid_lr = grid_search("LR",X_vect,y_train) 

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best Score:  0.8231163380186238
Best Params:  {'C': 10, 'tol': 1e-08}
CPU times: user 1min 12s, sys: 3min 38s, total: 4min 51s
Wall time: 34min 26s


In [20]:
%%time
grid_lr_smote = grid_search("LR",X_train_smote,y_train_smote)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Score:  0.8546311385678151
Best Params:  {'C': 20, 'tol': 1e-08}
CPU times: user 2min 15s, sys: 7min 5s, total: 9min 20s
Wall time: 40min 9s


In [21]:
dico['LR'] = grid_lr_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Perceptron

In [22]:
%%time
grid_per = grid_search("PER",X_vect,y_train) 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8184510039849113
Best Params:  {'alpha': 5.272631578947369e-06, 'penalty': 'l1'}
CPU times: user 1.61 s, sys: 3.2 s, total: 4.81 s
Wall time: 12.4 s


In [23]:
%%time
grid_per_smote = grid_search("PER",X_train_smote,y_train_smote)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8470214003772055
Best Params:  {'alpha': 1e-08, 'penalty': 'l2'}
CPU times: user 1.68 s, sys: 3.29 s, total: 4.96 s
Wall time: 14.6 s


In [24]:
dico['Perceptron'] = grid_per_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

#### Observation 

La regression logistique n'est qu'un perceptron avec une sigmoid en fonction d'activation.
On voit que la Regression Logistique a de meilleures performances à l'issue de la recherche d'hyperparamètres mais pas de loin. Par ailleurs le temps d'entrainement est considérablement plus élevé pour la regression logistique (du au calcul de l'exponentiel). Nous verrons par la suite quel modèle il est préférable de conserver. 

### LinearSVC

In [25]:
%%time
grid_svc = grid_search("SVC",X_vect,y_train) 

Fitting 5 folds for each of 12 candidates, totalling 60 fits


40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/svm/_base.py", line 1185, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "/home/eliott/envs/RNenv/lib64/pyt

Best Score:  0.8673372513001938
Best Params:  {'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}
CPU times: user 4.5 s, sys: 37.6 ms, total: 4.54 s
Wall time: 16.7 s


In [26]:
%%time
grid_svc_smote = grid_search("SVC",X_train_smote,y_train_smote)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/svm/_base.py", line 1185, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "/home/eliott/envs/RNenv/lib64/pyt

Best Score:  0.8840003539564666
Best Params:  {'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}
CPU times: user 3.15 s, sys: 53.9 ms, total: 3.2 s
Wall time: 51.4 s


In [27]:
dico['SVC'] = grid_svc_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Charging the best parameters for training

In [28]:
# Opening JSON file
with open('data.json') as json_file:
    data = json.load(json_file)


data_sgd = data['SGD']
data_rf = data['RF']
data_gb = data['GB']
data_lr = data['LR']
data_per = data['Perceptron']
data_svc = data['SVC']



### Training on all training data and testing on test data

In [29]:
def testing(model,X_train_pred,y_train_pred):
    predictions_train = model.predict(X_train_pred)
    predictions_test = model.predict(X_vect_test)
    accuracy_train = accuracy_score(y_train_pred,predictions_train )
    accuracy_test = accuracy_score(y_test,predictions_test )
    
    print(f"train_accuracy : {accuracy_train} \ntest_accuracy : {accuracy_test}  ")




### SGD

In [30]:
model_sgd_best = SGDClassifier()
model_sgd_best.set_params(**data_sgd,n_jobs=-1) 

model_sgd_best_smote = SGDClassifier()
model_sgd_best_smote.set_params(**data_sgd,n_jobs=-1) 


SGDClassifier(alpha=1.0615789473684212e-05, n_jobs=-1, penalty='l1')

In [31]:
%%time
model_sgd_best.fit(X_vect,y_train)


CPU times: user 877 ms, sys: 905 ms, total: 1.78 s
Wall time: 286 ms


SGDClassifier(alpha=1.0615789473684212e-05, n_jobs=-1, penalty='l1')

In [32]:
%%time
model_sgd_best_smote.fit(X_train_smote,y_train_smote)

CPU times: user 937 ms, sys: 973 ms, total: 1.91 s
Wall time: 318 ms


SGDClassifier(alpha=1.0615789473684212e-05, n_jobs=-1, penalty='l1')

In [33]:
testing(model_sgd_best,X_vect,y_train)

train_accuracy : 0.9402774740627354 
test_accuracy : 0.8807266982622433  


In [34]:
testing(model_sgd_best_smote,X_train_smote,y_train_smote)

train_accuracy : 0.9344083638110016 
test_accuracy : 0.8741442864665614  


### Random Forest

In [35]:
model_rf_best = RandomForestClassifier()
model_rf_best.set_params(**data_rf,n_jobs=-1)

model_rf_best_smote = RandomForestClassifier()
model_rf_best_smote.set_params(**data_rf,n_jobs=-1)

RandomForestClassifier(criterion='entropy', max_depth=600, max_features='log2',
                       min_samples_split=32, n_jobs=-1)

In [36]:
%%time

model_rf_best.fit(X_vect,y_train)

CPU times: user 20.4 s, sys: 142 ms, total: 20.5 s
Wall time: 1.48 s


RandomForestClassifier(criterion='entropy', max_depth=600, max_features='log2',
                       min_samples_split=32, n_jobs=-1)

In [37]:
%%time

model_rf_best_smote.fit(X_train_smote,y_train_smote)

CPU times: user 31.4 s, sys: 170 ms, total: 31.6 s
Wall time: 2.22 s


RandomForestClassifier(criterion='entropy', max_depth=600, max_features='log2',
                       min_samples_split=32, n_jobs=-1)

In [38]:
testing(model_rf_best,X_vect,y_train)

train_accuracy : 0.9864664577107175 
test_accuracy : 0.690626645602949  


In [39]:
testing(model_rf_best_smote,X_train_smote,y_train_smote)

train_accuracy : 0.974214045587203 
test_accuracy : 0.6837809373354397  


### Gradient Boosting 

In [40]:
model_gb_best = GradientBoostingClassifier()
model_gb_best.set_params(**data_gb)

model_gb_best_smote = GradientBoostingClassifier()
model_gb_best_smote.set_params(**data_gb)

GradientBoostingClassifier(learning_rate=0.2, n_estimators=800)

In [41]:
%%time
model_gb_best.fit(X_vect,y_train)

CPU times: user 8min 59s, sys: 0 ns, total: 8min 59s
Wall time: 8min 59s


GradientBoostingClassifier(learning_rate=0.2, n_estimators=800)

In [42]:
%%time
model_gb_best_smote.fit(X_train_smote,y_train_smote)

CPU times: user 11min 51s, sys: 0 ns, total: 11min 51s
Wall time: 11min 51s


GradientBoostingClassifier(learning_rate=0.2, n_estimators=800)

In [43]:
testing(model_gb_best,X_vect,y_train)

train_accuracy : 0.9167334839759944 
test_accuracy : 0.8299104791995787  


In [44]:
testing(model_gb_best_smote,X_train_smote,y_train_smote)

train_accuracy : 0.9134803649931656 
test_accuracy : 0.8243812532912059  


### Logistic Regression

In [45]:
model_lr_best = LogisticRegression(max_iter=1000)
model_lr_best.set_params(**data_lr,n_jobs=-1)

model_lr_best_smote = LogisticRegression(max_iter=1000)
model_lr_best_smote.set_params(**data_lr, n_jobs=-1)

LogisticRegression(C=20, max_iter=1000, n_jobs=-1, tol=1e-08)

In [46]:
%%time

model_lr_best.fit(X_vect,y_train)

CPU times: user 94.8 ms, sys: 221 ms, total: 316 ms
Wall time: 13.3 s


LogisticRegression(C=20, max_iter=1000, n_jobs=-1, tol=1e-08)

In [47]:
%%time
model_lr_best_smote.fit(X_train_smote,y_train_smote)

CPU times: user 10.1 ms, sys: 8.27 ms, total: 18.4 ms
Wall time: 16.9 s


LogisticRegression(C=20, max_iter=1000, n_jobs=-1, tol=1e-08)

In [48]:
testing(model_lr_best,X_vect,y_train)

train_accuracy : 0.9846927618631096 
test_accuracy : 0.8299104791995787  


In [49]:
testing(model_lr_best_smote,X_train_smote,y_train_smote)

train_accuracy : 0.9814548006945214 
test_accuracy : 0.8275408109531333  


### Perceptron

In [50]:
model_per_best = Perceptron()
model_per_best.set_params(**data_per,n_jobs=-1)

model_per_best_smote = Perceptron()
model_per_best_smote.set_params(**data_per,n_jobs=-1)

Perceptron(alpha=1e-08, n_jobs=-1, penalty='l2')

In [51]:
%%time
model_per_best.fit(X_vect,y_train)

CPU times: user 443 ms, sys: 1.24 s, total: 1.68 s
Wall time: 141 ms


Perceptron(alpha=1e-08, n_jobs=-1, penalty='l2')

In [52]:
%%time
model_per_best_smote.fit(X_train_smote,y_train_smote)

CPU times: user 484 ms, sys: 945 ms, total: 1.43 s
Wall time: 162 ms


Perceptron(alpha=1e-08, n_jobs=-1, penalty='l2')

In [53]:
testing(model_per_best,X_vect,y_train)

train_accuracy : 0.9813640449984207 
test_accuracy : 0.80173775671406  


In [54]:
testing(model_per_best_smote,X_train_smote,y_train_smote)

train_accuracy : 0.9819165835457534 
test_accuracy : 0.7877830437072143  


### SVC

In [55]:
model_svc_best = LinearSVC(max_iter=10000)
model_svc_best.set_params(**data_svc)

model_svc_best_smote = LinearSVC(max_iter=10000)
model_svc_best_smote.set_params(**data_svc)

LinearSVC(dual=False, max_iter=10000, penalty='l1')

In [56]:
%%time

model_svc_best.fit(X_vect,y_train)

CPU times: user 4.29 s, sys: 1.92 ms, total: 4.29 s
Wall time: 4.29 s


LinearSVC(dual=False, max_iter=10000, penalty='l1')

In [57]:
%%time

model_svc_best_smote.fit(X_train_smote,y_train_smote)

CPU times: user 2.87 s, sys: 4.81 ms, total: 2.88 s
Wall time: 2.88 s


LinearSVC(dual=False, max_iter=10000, penalty='l1')

In [58]:
testing(model_svc_best,X_vect,y_train)

train_accuracy : 0.9481983623684914 
test_accuracy : 0.8730911005792522  


In [59]:
testing(model_svc_best_smote,X_train_smote,y_train_smote)


train_accuracy : 0.9525287228933467 
test_accuracy : 0.8659820958399157  


### Voting Classifier

In [60]:
from sklearn.ensemble import VotingClassifier


In [61]:
clf1 = model_sgd_best_smote

clf2 = model_rf_best_smote

clf3 = model_gb_best_smote

clf4 = model_lr_best_smote

clf5 = model_per_best_smote

clf6 = model_svc_best_smote


eclf1 = VotingClassifier(
     estimators=[('sgd', clf1), ('rf', clf2), ('gb', clf3), ('lr', clf4), ('per', clf5), ('svc', clf6)],
     voting='hard')

eclf2 = VotingClassifier(
     estimators=[('sgd', clf1), ('lr', clf4), ('per', clf5), ('svc', clf6)],
     voting='hard')


In [62]:
%%time

eclf1.fit(X_train_smote,y_train_smote)

CPU times: user 11min 46s, sys: 2.48 s, total: 11min 48s
Wall time: 12min 4s


VotingClassifier(estimators=[('sgd',
                              SGDClassifier(alpha=1.0615789473684212e-05,
                                            n_jobs=-1, penalty='l1')),
                             ('rf',
                              RandomForestClassifier(criterion='entropy',
                                                     max_depth=600,
                                                     max_features='log2',
                                                     min_samples_split=32,
                                                     n_jobs=-1)),
                             ('gb',
                              GradientBoostingClassifier(learning_rate=0.2,
                                                         n_estimators=800)),
                             ('lr',
                              LogisticRegression(C=20, max_iter=1000, n_jobs=-1,
                                                 tol=1e-08)),
                             ('per',
                    

In [63]:
testing(eclf1,X_train_smote,y_train_smote)

train_accuracy : 0.9753038531161107 
test_accuracy : 0.8659820958399157  


In [64]:
%%time

eclf2.fit(X_train_smote,y_train_smote)

CPU times: user 4.25 s, sys: 2.19 s, total: 6.44 s
Wall time: 20.1 s


VotingClassifier(estimators=[('sgd',
                              SGDClassifier(alpha=1.0615789473684212e-05,
                                            n_jobs=-1, penalty='l1')),
                             ('lr',
                              LogisticRegression(C=20, max_iter=1000, n_jobs=-1,
                                                 tol=1e-08)),
                             ('per',
                              Perceptron(alpha=1e-08, n_jobs=-1, penalty='l2')),
                             ('svc',
                              LinearSVC(dual=False, max_iter=10000,
                                        penalty='l1'))])

In [65]:
testing(eclf2,X_train_smote,y_train_smote)

train_accuracy : 0.9697624589013263 
test_accuracy : 0.8570300157977883  


### Tests perso

In [66]:
tests_perso = ["I think covid is the most horrible threat we ever faced",
                "I love covid, thanks to it I can see my family much more often and I don't have to comute as much",
                "I would love to come to your birthday party, but I got covid, I have to stay confined"
]

In [67]:
test_perso_vect = vectorizer.transform(tests_perso)
predictions_out = eclf2.predict(test_perso_vect)


In [68]:
predictions_out

array([-1,  1,  1])