In [1]:
# from os import chdir
# from google.colab import drive

# drive.mount("/content/drive", force_remount=True)
# chdir("/content/drive/MyDrive/Eliott/files/")

In [2]:
# IMPORTS

import numpy as np
import pandas as pd
import preprocessing as pp
import json


from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score


%load_ext autoreload
%autoreload 2


2022-03-29 21:27:18.780100: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-29 21:27:18.780170: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
file_name_train = 'Corona_NLP_train.csv'
file_name_test = 'Corona_NLP_test.csv'
X_train, y_train = pp.prepare_dataframe(file_name_train,lemmatising=False)
X_test, y_test = pp.prepare_dataframe(file_name_test,lemmatising=False)

On enlève : 
- Les URLS
- Hashtags
- Mentions
- Mots réservés
- Emojis et smileys

- 35525 mots sans lemmatisation
- 30794 avec lemmatisation

### TFIDF

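On commence par vectoriser les données textuelles sous forme de TF-IDF. Chaque tweet devient un vecteur creux dont la dimension est la taille du vocabulaire du corpus d'entraînement (une composante par mot). La valeur associée à un mot est sa fréquence dans le tweet (TF), pondérée par l'inverse de sa fréquence documentaire (IDF), c'est-à-dire le logarithme de l'inverse de la proportion de tweets qui contiennent ce mot : un mot fréquent dans le tweet mais rare dans le corpus reçoit un poids élevé (importance locale / fréquence globale).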

In [28]:
# Learn the vocabulary and IDF weights on the training tweets only, then
# reuse the fitted vectorizer to project the test tweets into the same space.
# fit_transform replaces the former fit()/transform() pair, which briefly bound
# X_vect to the vectorizer object itself before overwriting it.
vectorizer = TfidfVectorizer()
X_vect = vectorizer.fit_transform(X_train)
X_vect_test = vectorizer.transform(X_test)

X_vect.shape

(41157, 35525)

### SMOTE

On va augmenter synthétiquement les données d'entraînement pour équilibrer les classes, dans le but d'améliorer les performances globales.

In [5]:
print(y_train.value_counts())
print(y_test.value_counts())

 1    18046
-1    15398
 0     7713
Name: Sentiment_Number, dtype: int64
-1    1633
 1    1546
 0     619
Name: Sentiment_Number, dtype: int64


In [6]:
# Oversample the TF-IDF training matrix with SMOTE so the classes are balanced;
# the seed is fixed so the synthetic samples are the same on every run.
# Only the training data is resampled, the test set is left untouched.
# NOTE(review): the resampled matrix is later passed as a whole to
# GridSearchCV, so synthetic points interpolated from a validation fold's
# samples can land in the training folds. The cross-validation scores obtained
# on X_train_smote are therefore probably optimistic; consider resampling
# inside an imblearn Pipeline so SMOTE is fitted per fold. TODO confirm.
oversample = SMOTE(random_state=42)
X_train_smote, y_train_smote = oversample.fit_resample(X_vect, y_train)

# Recherche d'Hyperparamètres

In [7]:
# Base estimators handed to the grid search. Each one receives the same seed
# as the SMOTE step so that shuffling, bootstrapping and feature sub-sampling
# give the same scores after a kernel restart.
RANDOM_STATE = 42

model_sgd = SGDClassifier(random_state=RANDOM_STATE)
model_gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
model_rf = RandomForestClassifier(random_state=RANDOM_STATE)
# max_iter is raised so the solver has room to converge on the sparse TF-IDF matrix.
model_lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
model_per = Perceptron(random_state=RANDOM_STATE)
model_svc = LinearSVC(max_iter=10000, random_state=RANDOM_STATE)


In [8]:
# Search space per model: key -> {"model": base estimator, "params": grid
# passed to GridSearchCV (a dict, or a list of dicts for disjoint sub-grids)}.
models_and_params = {
    "SGD": {
        "model": model_sgd,
        "params": {
            'penalty': ['l2', 'l1', 'elasticnet'],
            'alpha': np.linspace(1e-7, 1e-4, 20),
        },
    },
    "GB": {
        "model": model_gb,
        "params": {
            "learning_rate": [0.01, 0.1, 0.2],
            "n_estimators": [10, 50, 100, 200, 400, 800],
        },
    },
    "RF": {
        "model": model_rf,
        # 'warm_start' is intentionally not searched: GridSearchCV fits a fresh
        # clone for every candidate, so the flag cannot change the score and
        # only doubled the number of fits.
        "params": {
            'n_estimators': [10, 100],
            'max_depth': [10, 150, 300, 600],
            'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [4, 8, 16, 32],
            'max_features': ['log2', 'sqrt'],
            'criterion': ['gini', 'entropy'],
        },
    },
    "LR": {
        "model": model_lr,
        "params": {
            'C': [100, 80, 40, 20, 10, 1.0],
            'tol': np.linspace(1e-8, 1e-4, 15),
        },
    },
    "PER": {
        "model": model_per,
        "params": {
            'penalty': ['l2', 'l1', 'elasticnet'],
            'alpha': np.linspace(1e-8, 1e-4, 20),
        },
    },
    "SVC": {
        "model": model_svc,
        # LinearSVC only accepts 'l1' or 'l2' penalties (no 'elasticnet') and
        # rejects several penalty/loss/dual combinations, so the grid lists
        # just the supported ones instead of a full cross product whose
        # invalid candidates would all fail to fit.
        "params": [
            {'penalty': ['l2'], 'loss': ['hinge'], 'dual': [True]},
            {'penalty': ['l2'], 'loss': ['squared_hinge'], 'dual': [False, True]},
            {'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False]},
        ],
    },
}

In [10]:
def grid_Search(model_name, X_search, Y_search, scoring='accuracy', cv=None):
    """Run an exhaustive grid search for one entry of ``models_and_params``.

    Parameters
    ----------
    model_name : str
        Key of ``models_and_params``; selects the base estimator ("model")
        and its hyperparameter grid ("params").
    X_search, Y_search
        Features and labels forwarded unchanged to ``GridSearchCV.fit``.
    scoring : str, default 'accuracy'
        Metric used to rank the candidates. The default keeps the previous
        behaviour; another scorer name can be passed when accuracy is not
        the metric of interest.
    cv : optional
        Forwarded to ``GridSearchCV``; ``None`` keeps scikit-learn's default
        cross-validation splitter.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_params_``, ``best_score_``, ...).

    Raises
    ------
    KeyError
        If ``model_name`` is not a key of ``models_and_params``.
    """
    try:
        entry = models_and_params[model_name]
    except KeyError:
        # Same exception type as before, but tell the caller what is valid.
        raise KeyError(
            f"Unknown model name {model_name!r}; "
            f"expected one of {sorted(models_and_params)}"
        ) from None

    grid_clf = GridSearchCV(
        entry["model"],
        entry["params"],
        verbose=1,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,  # use every available core
    )
    grid_clf.fit(X_search, Y_search)

    print("Best Score: ", grid_clf.best_score_)
    print("Best Params: ", grid_clf.best_params_)

    return grid_clf


### Saving Hyperparameters in JSON

We first load the previously saved hyperparameters and update an entry only when its search is re-run. The final models are then built from the best saved parameters.

In [11]:
# Reload the hyperparameters saved by a previous run so that only the searches
# re-run in this session overwrite their entry. On a first run the file does
# not exist yet: start from an empty dict instead of raising.
try:
    with open('data.json') as json_file:
        dico = json.load(json_file)
except FileNotFoundError:
    dico = {}

### SGD

In [12]:
%%time
grid_sgd = grid_Search("SGD",X_vect,y_train) 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8782469129068223
Best Params:  {'alpha': 1.5873684210526315e-05, 'penalty': 'l1'}
CPU times: user 2.13 s, sys: 3.31 s, total: 5.44 s
Wall time: 28.8 s


In [13]:
%%time
grid_sgd_smote = grid_Search("SGD",X_train_smote,y_train_smote)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8910565975844229
Best Params:  {'alpha': 5.357894736842105e-06, 'penalty': 'l1'}
CPU times: user 3.06 s, sys: 3.1 s, total: 6.15 s
Wall time: 33.2 s


In [15]:
dico['SGD'] = grid_sgd_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Gradient boosting

In [10]:
%%time
grid_gb = grid_Search("GB",X_vect,y_train) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits


KeyboardInterrupt: 

In [None]:
%%time
grid_gb_smote = grid_Search("GB",X_train_smote,y_train_smote)

In [None]:
dico['GB'] = grid_gb_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Random Forest

In [22]:
%%time
grid_rf = grid_Search("RF",X_vect,y_train) 

In [None]:
%%time
grid_rf_smote = grid_Search("RF",X_train_smote,y_train_smote)

In [None]:
dico['RF'] = grid_rf_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)


### Logistic Regression

In [186]:
%%time
grid_lr = grid_Search("LR",X_vect,y_train) 

In [187]:
%%time
grid_lr_smote = grid_Search("LR",X_train_smote,y_train_smote)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
CPU times: user 1min 55s, sys: 4min 27s, total: 6min 23s
Wall time: 32min 56s


In [None]:
dico['LR'] = grid_lr_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Perceptron

In [25]:
%%time
grid_per = grid_Search("PER",X_vect,y_train) 

In [189]:
%%time
grid_per_smote = grid_Search("PER",X_train_smote,y_train_smote)

In [None]:
dico['Perceptron'] = grid_per_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

#### Observation 

La régression logistique n'est qu'un perceptron avec une sigmoïde comme fonction d'activation.
On voit que la régression logistique a de meilleures performances à l'issue de la recherche d'hyperparamètres, mais de peu. Par ailleurs, le temps d'entraînement est considérablement plus élevé pour la régression logistique (dû au calcul de l'exponentielle). Nous verrons par la suite quel modèle il est préférable de conserver.

### LinearSVC

In [19]:
%%time
grid_svc = grid_Search("SVC",X_vect,y_train) 

In [20]:
%%time
grid_svc_smote = grid_Search("SVC",X_train_smote,y_train_smote)

In [None]:
dico['SVC'] = grid_svc_smote.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Loading the best parameters for training

In [16]:
# Opening JSON file
with open('data.json') as json_file:
    data = json.load(json_file)


data_sgd = data['SGD']
data_rf = data['RF']
data_gb = data['GB']
data_lr = data['LR']
data_per = data['Perceptron']
data_svc = data['SVC']



### Training on all training data and testing on test data

In [26]:
def testing(model, X_train_pred, y_train_pred, X_test_pred=None, y_test_pred=None):
    """Print the train and test accuracy of an already fitted model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train_pred, y_train_pred
        Training features and labels the train accuracy is computed on.
    X_test_pred, y_test_pred : optional
        Held-out features and labels. When omitted, the notebook-level
        ``X_vect_test`` and ``y_test`` are used, as before.

    Returns
    -------
    None
        The two accuracies are only printed.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if X_test_pred is None:
        X_test_pred = X_vect_test
    if y_test_pred is None:
        y_test_pred = y_test

    accuracy_train = accuracy_score(y_train_pred, model.predict(X_train_pred))
    accuracy_test = accuracy_score(y_test_pred, model.predict(X_test_pred))

    print(f"train_accuracy : {accuracy_train} \ntest_accuracy : {accuracy_test}  ")




### SGD

In [18]:
model_sgd_best = SGDClassifier()
model_sgd_best.set_params(**data_sgd,n_jobs=-1) 

model_sgd_best_smote = SGDClassifier()
model_sgd_best_smote.set_params(**data_sgd,n_jobs=-1) 


SGDClassifier(alpha=5.357894736842105e-06, n_jobs=-1, penalty='l1')

In [19]:
%%time
model_sgd_best.fit(X_vect,y_train)


CPU times: user 1.1 s, sys: 904 ms, total: 2.01 s
Wall time: 382 ms


SGDClassifier(alpha=5.357894736842105e-06, n_jobs=-1, penalty='l1')

In [20]:
%%time
model_sgd_best_smote.fit(X_train_smote,y_train_smote)

CPU times: user 1.36 s, sys: 882 ms, total: 2.24 s
Wall time: 442 ms


SGDClassifier(alpha=5.357894736842105e-06, n_jobs=-1, penalty='l1')

In [30]:
testing(model_sgd_best,X_vect,y_train)

train_accuracy : 0.9675632334718274 
test_accuracy : 0.869141653501843  


In [33]:
testing(model_sgd_best_smote,X_train_smote,y_train_smote)

train_accuracy : 0.9629280727030921 
test_accuracy : 0.8686150605581885  


### Random Forest

In [80]:
model_rf_best = RandomForestClassifier()
model_rf_best.set_params(**data_rf,n_jobs=-1)

model_rf_best_smote = RandomForestClassifier()
model_rf_best_smote.set_params(**data_rf,n_jobs=-1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('rf_clf',
                 RandomForestClassifier(criterion='entropy', max_depth=300,
                                        max_features='sqrt', min_samples_leaf=2,
                                        min_samples_split=32, n_jobs=-1,
                                        verbose=1, warm_start=True))])

In [81]:
%%time

model_rf_best.fit(X_vect,y_train)

CPU times: user 669 ms, sys: 378 µs, total: 670 ms
Wall time: 681 ms


  warn(


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('rf_clf',
                 RandomForestClassifier(criterion='entropy', max_depth=300,
                                        max_features='sqrt', min_samples_leaf=2,
                                        min_samples_split=32, n_jobs=-1,
                                        verbose=1, warm_start=True))])

In [None]:
%%time

model_rf_best_smote.fit(X_train_smote,y_train_smote)

In [82]:
testing(model_rf_best,X_vect,y_train)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.


train_accuracy : 0.8653934932089317 
test_accuracy : 0.6908899420747762  


[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


In [None]:
testing(model_rf_best_smote,X_train_smote,y_train_smote)

### Gradient Boosting 

In [85]:
model_gb_best = GradientBoostingClassifier()
model_gb_best.set_params(**data_gb)

model_gb_best_smote = GradientBoostingClassifier()
model_gb_best_smote.set_params(**data_gb)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('gb_clf',
                 GradientBoostingClassifier(learning_rate=0.2, n_estimators=800,
                                            verbose=1))])

In [38]:
%%time
model_gb_best.fit(X_vect,y_train)

CPU times: user 9min 33s, sys: 9.32 ms, total: 9min 33s
Wall time: 9min 34s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('gb_clf',
                 GradientBoostingClassifier(learning_rate=0.2,
                                            n_estimators=800))])

In [None]:
%%time
model_gb_best_smote.fit(X_train_smote,y_train_smote)

In [39]:
testing(model_gb_best,X_vect,y_train)

train_accuracy : 0.9136963335520082 
test_accuracy : 0.8156924697209057  


In [None]:
testing(model_gb_best_smote,X_train_smote,y_train_smote)

### Logistic Regression

In [86]:
model_lr_best = LogisticRegression(max_iter=1000)
model_lr_best.set_params(**data_lr,n_jobs=-1)

model_lr_best_smote = LogisticRegression(max_iter=1000)
model_lr_best_smote.set_params(**data_lr, n_jobs=-1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('lr_clf',
                 LogisticRegression(C=10, max_iter=1000, n_jobs=-1, tol=1e-08,
                                    verbose=1))])

In [87]:
%%time

model_lr_best.fit(X_vect,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        92385     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.52156D+04    |proj g|=  6.00600D+03


 This problem is unconstrained.



At iterate   50    f=  1.72399D+04    |proj g|=  4.05238D+01

At iterate  100    f=  1.38656D+04    |proj g|=  1.69239D+02

At iterate  150    f=  1.31815D+04    |proj g|=  5.81206D+01

At iterate  200    f=  1.29607D+04    |proj g|=  7.76878D+01

At iterate  250    f=  1.28956D+04    |proj g|=  6.04622D+00

At iterate  300    f=  1.28799D+04    |proj g|=  3.58044D+00

At iterate  350    f=  1.28758D+04    |proj g|=  1.48195D+00

At iterate  400    f=  1.28742D+04    |proj g|=  4.92087D-01

At iterate  450    f=  1.28739D+04    |proj g|=  1.60575D+00

At iterate  500    f=  1.28738D+04    |proj g|=  1.04958D+00

At iterate  550    f=  1.28737D+04    |proj g|=  2.51328D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F   

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   14.1s finished


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('lr_clf',
                 LogisticRegression(C=10, max_iter=1000, n_jobs=-1, tol=1e-08,
                                    verbose=1))])

In [None]:
%%time
model_lr_best_smote.fit(X_train_smote,y_train_smote)

In [88]:
testing(model_lr_best,X_vect,y_train)

train_accuracy : 0.959277887115193 
test_accuracy : 0.8230647709320695  


In [None]:
testing(model_lr_best_smote,X_train_smote,y_train_smote)

### Perceptron

In [89]:
model_per_best = Perceptron()
model_per_best.set_params(**data_per,n_jobs=-1)

model_per_best_smote = Perceptron()
model_per_best_smote.set_params(**data_per,n_jobs=-1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('per_clf',
                 Perceptron(alpha=5.272631578947369e-06, n_jobs=-1,
                            penalty='l1', verbose=1))])

In [90]:
%%time
model_per_best.fit(X_vect,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 59.23, NNZs: 10382, Bias: 0.150000, T: 41157, Avg. loss: 0.049036
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 67.12, NNZs: 12019, Bias: -0.280000, T: 41157, Avg. loss: 0.059772Norm: 65.96, NNZs: 11652, Bias: -0.330000, T: 41157, Avg. loss: 0.056884
Total training time: 0.05 seconds.
-- Epoch 2

Total training time: 0.07 seconds.
-- Epoch 2
Norm: 79.38, NNZs: 8014, Bias: 0.140000, T: 82314, Avg. loss: 0.026856
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 88.65, NNZs: 8964, Bias: -0.300000, T: 82314, Avg. loss: 0.031492
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 86.97, NNZs: 8612, Bias: -0.360000, T: 82314, Avg. loss: 0.029703
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 96.96, NNZs: 5766, Bias: 0.150000, T: 123471, Avg. loss: 0.019641
Total training time: 0.09 seconds.
-- Epoch 4
Norm: 106.18, NNZs: 6571, Bias: -0.270000, T: 123471, Avg. loss: 0.022868
Total training time: 0.10 seconds.
-- Epoch 4
Norm: 104.62, NNZs: 

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('per_clf',
                 Perceptron(alpha=5.272631578947369e-06, n_jobs=-1,
                            penalty='l1', verbose=1))])

In [None]:
%%time
model_per_best_smote.fit(X_train_smote,y_train_smote)

In [91]:
testing(model_per_best,X_vect,y_train)

train_accuracy : 0.8576669825303107 
test_accuracy : 0.7814639283833597  


In [None]:
testing(model_per_best_smote,X_train_smote,y_train_smote)

### SVC

In [94]:
model_svc_best = LinearSVC(max_iter=10000)
model_svc_best.set_params(**data_svc)

model_svc_best_smote = LinearSVC(max_iter=10000)
model_svc_best_smote.set_params(**data_svc)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('svc_clf',
                 LinearSVC(dual=False, max_iter=10000, penalty='l1',
                           verbose=1))])

In [95]:
%%time

model_svc_best.fit(X_vect,y_train)

[LibLinear]...........*..........*....*
optimization finished, #iter = 257
Objective value = 13387.596363
#nonzeros/#features = 4675/30795
.............*...........*...*
optimization finished, #iter = 271
Objective value = 12866.620558
#nonzeros/#features = 4813/30795
...........*..........*...*.
optimization finished, #iter = 250
Objective value = 13900.984310
CPU times: user 6.94 s, sys: 13.1 ms, total: 6.95 s
Wall time: 6.92 s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('svc_clf',
                 LinearSVC(dual=False, max_iter=10000, penalty='l1',
                           verbose=1))])

#nonzeros/#features = 4850/30795


In [None]:
%%time

model_svc_best_smote.fit(X_train_smote,y_train_smote)

In [96]:
testing(model_svc_best,X_vect,y_train)

train_accuracy : 0.9274728478752096 
test_accuracy : 0.8496577145866245  


In [None]:
testing(model_svc_best_smote,X_train_smote,y_train_smote)


### Voting Classifier

In [49]:
from sklearn.ensemble import VotingClassifier


In [97]:
# Members of the ensembles: the `_smote` estimator objects of each tuned model.
clf1 = model_sgd_best_smote
clf2 = model_rf_best_smote
clf3 = model_gb_best_smote
clf4 = model_lr_best_smote
clf5 = model_per_best_smote
clf6 = model_svc_best_smote

# Hard (majority) vote over all six models.
all_members = [('sgd', clf1), ('rf', clf2), ('gb', clf3), ('lr', clf4), ('per', clf5), ('svc', clf6)]
eclf1 = VotingClassifier(estimators=all_members, voting='hard')

# Hard vote without the two tree ensembles (random forest, gradient boosting).
reduced_members = [('sgd', clf1), ('lr', clf4), ('per', clf5), ('svc', clf6)]
eclf2 = VotingClassifier(estimators=reduced_members, voting='hard')


In [98]:
%%time

eclf1.fit(X_train_smote,y_train_smote)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 360.88, NNZs: 5102, Bias: 0.947182, T: 41157, Avg. loss: 0.486335
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 414.01, NNZs: 5851, Bias: -1.909098, T: 41157, Avg. loss: 0.605988
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 406.08, NNZs: 5633, Bias: -1.600516, T: 41157, Avg. loss: 0.579718
Total training time: 0.03 seconds.
Norm: 420.66, NNZs: 4001, Bias: -1.523806, T: 82314, Avg. loss: 0.268741
Total training time: 0.05 seconds.-- Epoch 2
Norm: 369.55, NNZs: 3544, Bias: 1.069173, T: 82314, Avg. loss: 0.234354
Total training time: 0.06 seconds.
-- Epoch 3

-- Epoch 3
Norm: 412.70, NNZs: 3927, Bias: -1.462631, T: 82314, Avg. loss: 0.262655
Total training time: 0.06 seconds.
-- Epoch 3
Norm: 374.69, NNZs: 3254, Bias: 1.027108, T: 123471, Avg. loss: 0.214995
Total training time: 0.09 seconds.
-- Epoch 4
Norm: 416.32, NNZs: 3547, Bias: -1.238832, T: 123471, Avg. loss: 0.238725
Total training time: 0.08 seconds.
-- Epoch 4
Norm: 424.34, NN

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished


      Iter       Train Loss   Remaining Time 
         1           1.0120            9.95m
         2           0.9888            9.90m
         3           0.9711            9.89m
         4           0.9563            9.90m
         5           0.9428            9.90m
         6           0.9318            9.88m
         7           0.9217            9.86m
         8           0.9126            9.85m
         9           0.9040            9.83m
        10           0.8965            9.82m
        20           0.8405            9.62m
        30           0.8040            9.44m
        40           0.7767            9.29m
        50           0.7543            9.14m
        60           0.7351            9.00m
        70           0.7185            8.87m
        80           0.7038            8.74m
        90           0.6908            8.62m
       100           0.6784            8.50m
       200           0.5897            7.40m
       300           0.5325            6.22m
       40

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        92385     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.52156D+04    |proj g|=  6.00600D+03


 This problem is unconstrained.



At iterate   50    f=  1.72399D+04    |proj g|=  4.05238D+01

At iterate  100    f=  1.38656D+04    |proj g|=  1.69239D+02

At iterate  150    f=  1.31815D+04    |proj g|=  5.81206D+01

At iterate  200    f=  1.29607D+04    |proj g|=  7.76878D+01

At iterate  250    f=  1.28956D+04    |proj g|=  6.04622D+00

At iterate  300    f=  1.28799D+04    |proj g|=  3.58044D+00

At iterate  350    f=  1.28758D+04    |proj g|=  1.48195D+00

At iterate  400    f=  1.28742D+04    |proj g|=  4.92087D-01

At iterate  450    f=  1.28739D+04    |proj g|=  1.60575D+00

At iterate  500    f=  1.28738D+04    |proj g|=  1.04958D+00

At iterate  550    f=  1.28737D+04    |proj g|=  2.51328D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F   

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   13.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 59.23, NNZs: 10382, Bias: 0.150000, T: 41157, Avg. loss: 0.049036
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 65.96, NNZs: 11652, Bias: -0.330000, T: 41157, Avg. loss: 0.056884
Total training time: 0.04 seconds.
-- Epoch 2
Norm: 67.12, NNZs: 12019, Bias: -0.280000, T: 41157, Avg. loss: 0.059772
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 79.38, NNZs: 8014, Bias: 0.140000, T: 82314, Avg. loss: 0.026856
Total training time: 0.07 seconds.
-- Epoch 3
Norm: 86.97, NNZs: 8612, Bias: -0.360000, T: 82314, Avg. loss: 0.029703
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 88.65, NNZs: 8964, Bias: -0.300000, T: 82314, Avg. loss: 0.031492
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 96.96, NNZs: 5766, Bias: 0.150000, T: 123471, Avg. loss: 0.019641
Total training time: 0.09 seconds.
-- Epoch 4
Norm: 104.62, NNZs: 6373, Bias: -0.300000, T: 123471, Avg. loss: 0.022644
Total training time: 0.10 seconds.
-- Epoch 4
Norm: 106.18, NNZs: 

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


[LibLinear]...........*..........*....**
optimization finished, #iter = 254
Objective value = 13387.596365
#nonzeros/#features = 4668/30795
.............*..........*...*
optimization finished, #iter = 266
Objective value = 12866.624502
#nonzeros/#features = 4818/30795
...........*..........*....*
optimization finished, #iter = 251
Objective value = 13900.999692
#nonzeros/#features = 4853/30795
CPU times: user 10min 8s, sys: 2.38 s, total: 10min 10s
Wall time: 10min 24s


VotingClassifier(estimators=[('sgd',
                              Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                              ('clf',
                                               SGDClassifier(alpha=1.5873684210526315e-05,
                                                             n_jobs=-1,
                                                             penalty='l1',
                                                             verbose=1))])),
                             ('rf',
                              Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                              ('rf_clf',
                                               RandomForestClassifier(criterion='entropy',
                                                                      max_depth=300,
                                                                      max_features='sqrt',
                                                                      min_samples_

In [99]:
testing(eclf1,X_train_smote,y_train_smote)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


train_accuracy : 0.937944942537114 
test_accuracy : 0.8499210110584519  


In [100]:
%%time

eclf2.fit(X_train_smote,y_train_smote)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 357.60, NNZs: 4998, Bias: 0.891696, T: 41157, Avg. loss: 0.484172
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 366.71, NNZs: 3474, Bias: 0.954372, T: 82314, Avg. loss: 0.235496
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 412.60, NNZs: 5878, Bias: -1.926124, T: 41157, Avg. loss: 0.598335
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 404.48, NNZs: 5595, Bias: -1.558111, T: 41157, Avg. loss: 0.571228
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 371.66, NNZs: 3172, Bias: 1.002502, T: 123471, Avg. loss: 0.215340
Total training time: 0.05 seconds.
-- Epoch 4
Norm: 375.23, NNZs: 3087, Bias: 0.988975, T: 164628, Avg. loss: 0.207595
Total training time: 0.07 seconds.
-- Epoch 5
Norm: 419.40, NNZs: 4059, Bias: -1.519952, T: 82314, Avg. loss: 0.269418
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 377.89, NNZs: 3030, Bias: 1.001752, T: 205785, Avg. loss: 0.202337
Total training time: 0.09 seconds.
-- Epoch 6
Norm: 411.28, NNZ

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        92385     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.52156D+04    |proj g|=  6.00600D+03


 This problem is unconstrained.



At iterate   50    f=  1.72399D+04    |proj g|=  4.05238D+01

At iterate  100    f=  1.38656D+04    |proj g|=  1.69239D+02

At iterate  150    f=  1.31815D+04    |proj g|=  5.81206D+01

At iterate  200    f=  1.29607D+04    |proj g|=  7.76878D+01

At iterate  250    f=  1.28956D+04    |proj g|=  6.04622D+00

At iterate  300    f=  1.28799D+04    |proj g|=  3.58044D+00

At iterate  350    f=  1.28758D+04    |proj g|=  1.48195D+00

At iterate  400    f=  1.28742D+04    |proj g|=  4.92087D-01

At iterate  450    f=  1.28739D+04    |proj g|=  1.60575D+00

At iterate  500    f=  1.28738D+04    |proj g|=  1.04958D+00

At iterate  550    f=  1.28737D+04    |proj g|=  2.51328D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F   

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   13.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 59.23, NNZs: 10382, Bias: 0.150000, T: 41157, Avg. loss: 0.049036
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 67.12, NNZs: 12019, Bias: -0.280000, T: 41157, Avg. loss: 0.059772
Total training time: 0.03 seconds.
Norm: 65.96, NNZs: 11652, Bias: -0.330000, T: 41157, Avg. loss: 0.056884-- Epoch 2
Norm: 79.38, NNZs: 8014, Bias: 0.140000, T: 82314, Avg. loss: 0.026856
Total training time: 0.03 seconds.
-- Epoch 3

Total training time: 0.02 seconds.
-- Epoch 2
Norm: 88.65, NNZs: 8964, Bias: -0.300000, T: 82314, Avg. loss: 0.031492
Total training time: 0.05 seconds.
-- Epoch 3
Norm: 96.96, NNZs: 5766, Bias: 0.150000, T: 123471, Avg. loss: 0.019641
Total training time: 0.05 seconds.
-- Epoch 4
Norm: 112.52, NNZs: 4770, Bias: 0.130000, T: 164628, Avg. loss: 0.015376
Total training time: 0.07 seconds.
-- Epoch 5
Norm: 106.18, NNZs: 6571, Bias: -0.270000, T: 123471, Avg. loss: 0.022868
Total training time: 0.08 seconds.
-- Epoch 4
Norm: 126.18, NNZs:

VotingClassifier(estimators=[('sgd',
                              Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                              ('clf',
                                               SGDClassifier(alpha=1.5873684210526315e-05,
                                                             n_jobs=-1,
                                                             penalty='l1',
                                                             verbose=1))])),
                             ('lr',
                              Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                              ('lr_clf',
                                               LogisticRegression(C=10,
                                                                  max_iter=1000,
                                                                  n_jobs=-1,
                                                                  tol=1e-08,
                                          

In [101]:
testing(eclf2,X_train_smote,y_train_smote)

train_accuracy : 0.9355881138081007 
test_accuracy : 0.8522906793048973  


### Tests perso

In [57]:
tests_perso = ["I think covid is the most horrible threat we ever faced",
                "I love covid, thanks to it I can see my family much more often and I don't have to comute as much",
                "I would love to come to your birthday party, but I got covid, I have to stay confined"
]

In [58]:
test_perso_vect = vectorizer.transform(tests_perso)
predictions_out = eclf2.predict(test_perso_vect)


In [59]:
predictions_out

array([-1,  1,  1])