In [1]:
# from os import chdir
# from google.colab import drive

# drive.mount("/content/drive", force_remount=True)
# chdir("/content/drive/MyDrive/Eliott/files/")

In [27]:
# IMPORTS

import numpy as np
import pandas as pd
import preprocessing as pp
import json


from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import GradientBoostingClassifier

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
file_name_train = 'Corona_NLP_train.csv'
file_name_test = 'Corona_NLP_test.csv'
X_train, y_train = pp.prepare_dataframe(file_name_train,lemmatising=False)
X_test, y_test = pp.prepare_dataframe(file_name_test,lemmatising=False)

On enlève : 
- Les URLS
- Hashtags
- Mentions
- Mots réservés
- Emojis et smileys

- 35525 mots sans lemmatisation
- 30794 avec lemmatisation

### TFIDF

On commence par vectoriser les données textuelles sous forme de tfidf. On se retrouve avec une sorte de one-hot vector pour chaque mot présent dans le corpus  total de tweets. La valeur pour chaque mot est le nombre de fois que le mot apparait dans le tweet divisé par le nombre de fois dans tous les tweets (importance locale / fréquence totale). 

In [5]:
vectorizer = TfidfVectorizer()
X_vect = vectorizer.fit_transform(X_train)

X_vect.shape

(41157, 35525)

### SMOTE

On va augmenter syntétiquement les données pour équilibrer les classes et améliorer les performances globales. 

In [9]:
print(y_train.value_counts())
print(y_test.value_counts())

 1    18046
-1    15398
 0     7713
Name: Sentiment_Number, dtype: int64
-1    1633
 1    1546
 0     619
Name: Sentiment_Number, dtype: int64


In [10]:
oversample = SMOTE(random_state=42)
X_train_smote, y_train_smote = oversample.fit_resample(X_vect, y_train)

# Recherche d'Hyperparamètres

In [30]:
model_sgd = SGDClassifier()
model_gb = GradientBoostingClassifier()


In [31]:
models_and_params = {
    "SGD" : { "model" : model_sgd,
              "params" : {
                    'penalty': ['l2', 'l1', 'elasticnet'],
                    'alpha': np.linspace(1e-7, 1e-4, 20),  
                        }
    },
    "GB" : { "model" : model_gb,
              "params" : {
                    "gb_clf__learning_rate": [0.01, 0.1, 0.2],
                    "gb_clf__n_estimators":[10,50,100,200,400,800]
                        }
    },
    "SGD" : { "model" : model_gb,
              "params" : {
                    'penalty': ['l2', 'l1', 'elasticnet'],
                    'alpha': np.linspace(1e-7, 1e-4, 20),  
                        }
    }
    

}

In [21]:
def grid_Search(model_name,X_search,Y_search):

    model = models_and_params[model_name]["model"]
    params = models_and_params[model_name]["params"]

    grid_clf = GridSearchCV(model, params, verbose=1, scoring='accuracy' ,n_jobs=-1)
    
    grid_clf.fit(X_search, Y_search)

    print("Best Score: ", grid_clf.best_score_)
    print("Best Params: ", grid_clf.best_params_)

    return grid_clf


### SGD

In [26]:
%%time
grid_sgd = grid_Search("SGD",X_vect,y_train) 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8782468302592571
Best Params:  {'alpha': 1.5873684210526315e-05, 'penalty': 'l1'}
CPU times: user 2.05 s, sys: 2.86 s, total: 4.91 s
Wall time: 30.4 s


In [25]:
%%time
grid_sgd_smote = grid_Search("SGD",X_train_smote,y_train_smote)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8910936070869632
Best Params:  {'alpha': 5.357894736842105e-06, 'penalty': 'l1'}
CPU times: user 2.64 s, sys: 3.15 s, total: 5.79 s
Wall time: 32.7 s


### Gradient boosting

In [32]:
%%time
grid_gb = grid_Search("GB",X_vect,y_train) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
%%time
grid_gb_smote = grid_Search("GB",X_train_smote,y_train_smote)

In [179]:
%%time
subset = X_train.shape[0]

gb_best = gb_clf.fit(X_train[:subset], y_train[:subset])    

Fitting 5 folds for each of 18 candidates, totalling 90 fits
CPU times: user 9min 29s, sys: 258 ms, total: 9min 29s
Wall time: 46min 55s


In [180]:
print("Best Score: ", gb_clf.best_score_)
print("Best Params: ", gb_clf.best_params_)

Best Score:  0.8194232669897714
Best Params:  {'gb_clf__learning_rate': 0.2, 'gb_clf__n_estimators': 800}


### Random Forest

In [22]:
randForest_param = {
    'rf_clf__n_estimators': [10, 100],   
    'rf_clf__max_depth': [10 ,150,300,600],
    'rf_clf__min_samples_leaf': [1, 2, 3],   
    'rf_clf__min_samples_split': [4, 8, 16, 32],
    'rf_clf__max_features': ['log2', 'sqrt'],
    'rf_clf__criterion': ['gini', 'entropy'],
    'rf_clf__warm_start': [True, False] 
}



pipeline_random_forest = Pipeline([('tfidf', TfidfVectorizer()),
                     ('rf_clf', RandomForestClassifier()),
])


rf_clf = GridSearchCV(pipeline_random_forest, randForest_param, scoring='accuracy', verbose=1 ,n_jobs=-1)


In [182]:
%%time
subset = X_train.shape[0]

rf_best = rf_clf.fit(X_train[:subset], y_train[:subset])



Fitting 5 folds for each of 768 candidates, totalling 3840 fits
CPU times: user 1min 34s, sys: 8.71 s, total: 1min 43s
Wall time: 42min 30s


In [183]:
print("Best Score: ", rf_clf.best_score_)
print("Best Params: ", rf_clf.best_params_)

Best Score:  0.6925428813940141
Best Params:  {'rf_clf__criterion': 'entropy', 'rf_clf__max_depth': 300, 'rf_clf__max_features': 'sqrt', 'rf_clf__min_samples_leaf': 2, 'rf_clf__min_samples_split': 32, 'rf_clf__n_estimators': 100, 'rf_clf__warm_start': True}


### Logistic Regression

In [184]:
LogisticRegression_param = {
    'lr_clf__C': [100, 80, 40,20,10, 1.0],
    'lr_clf__tol': np.linspace(1e-8,1e-4,15)
}


In [23]:
pipeline_lr = Pipeline([('tfidf', TfidfVectorizer()),
                       
                     ('lr_clf', LogisticRegression(max_iter=1000)),
 ])

In [186]:
lr_clf = GridSearchCV(pipeline_lr, LogisticRegression_param,  scoring='accuracy', verbose=1, n_jobs=-1)

In [187]:
%%time
lr_best = lr_clf.fit(X_train[:subset], y_train[:subset])

Fitting 5 folds for each of 90 candidates, totalling 450 fits
CPU times: user 1min 55s, sys: 4min 27s, total: 6min 23s
Wall time: 32min 56s


In [188]:
print("Best Score: ", lr_clf.best_score_)
print("Best Params: ", lr_clf.best_params_)

Best Score:  0.8230920602963577
Best Params:  {'lr_clf__C': 10, 'lr_clf__tol': 1e-08}


##### SMOTE

In [31]:
LogisticRegression_param = {
    'C': [100, 80, 40,20,10, 1.0],
    'tol': np.linspace(1e-8,1e-4,15)
}

In [32]:
modelsololr = LogisticRegression(max_iter=1000)

In [33]:
lr_clf_solo = GridSearchCV(modelsololr, LogisticRegression_param,verbose=1, scoring='accuracy' ,n_jobs=-1)

In [34]:
%%time
lr_best_solo = lr_clf_solo.fit(X_new, yNew)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 1min 51s, sys: 5min 43s, total: 7min 35s
Wall time: 50min 37s


In [35]:
print("Best Score: ", lr_clf_solo.best_score_)
print("Best Params: ", lr_clf_solo.best_params_)

Best Score:  0.8546311385678151
Best Params:  {'C': 20, 'tol': 1e-08}


### Perceptron

In [25]:
from sklearn.linear_model import Perceptron


In [189]:
per_params = {
  'per_clf__penalty': ['l2', 'l1', 'elasticnet'],
  'per_clf__alpha': np.linspace(1e-8, 1e-4, 20),
}

In [26]:
pipeline_perceptron = Pipeline([('tfidf', TfidfVectorizer()),
                       
                     ('per_clf', Perceptron()),
 ])

In [191]:
per_clf = GridSearchCV(pipeline_perceptron, per_params,  scoring='accuracy', verbose=1, n_jobs=-1)

In [192]:
%%time
per_best = per_clf.fit(X_train[:subset], y_train[:subset])

Fitting 5 folds for each of 60 candidates, totalling 300 fits
CPU times: user 7.8 s, sys: 3.47 s, total: 11.3 s
Wall time: 41.6 s


In [193]:
print("Best Score: ", per_clf.best_score_)
print("Best Params: ", per_clf.best_params_)

Best Score:  0.8128628567555942
Best Params:  {'per_clf__alpha': 5.272631578947369e-06, 'per_clf__penalty': 'l1'}


#### Observation 

La regression logistique n'est qu'un perceptron avec une sigmoid en fonction d'activation.
On voit que la Regression Logistique a de meilleures performances à l'issue de la recherche d'hyperparamètres mais pas de loin. Par ailleurs le temps d'entrainement est considérablement plus élevé pour la regression logistique (du au calcul de l'exponentiel). Nous verrons par la suite quel modèle il est préférable de conserver. 

### LinearSVC

In [19]:
from sklearn.svm import LinearSVC


In [20]:
svc_params = {
  'svc_clf__penalty': ['l2', 'l1', 'elasticnet'],
  'svc_clf__loss': ['hinge', 'squared_hinge'],
  'svc_clf__dual' : [False,True]
}

In [21]:
pipeline_svc = Pipeline([('tfidf', TfidfVectorizer()),
                       
                     ('svc_clf', LinearSVC(max_iter=10000)),
 ])

In [22]:
svc_clf = GridSearchCV(pipeline_svc, svc_params,  scoring='accuracy', verbose=1, n_jobs=-1)

In [23]:
%%time
svc_best = svc_clf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/s

CPU times: user 7.45 s, sys: 812 ms, total: 8.26 s
Wall time: 32.2 s


In [24]:
print("Best Score: ", svc_clf.best_score_)
print("Best Params: ", svc_clf.best_params_)

Best Score:  0.8689408679104181
Best Params:  {'svc_clf__dual': False, 'svc_clf__loss': 'squared_hinge', 'svc_clf__penalty': 'l1'}


##### SMOTE

In [26]:

svc_params = {
  'penalty': ['l2', 'l1', 'elasticnet'],
  'loss': ['hinge', 'squared_hinge'],
  'dual' : [False,True]
}

In [27]:
modelsolosvc = LinearSVC(max_iter=10000)

In [28]:
svc_clf_solo = GridSearchCV(modelsolosvc, svc_params,verbose=1, scoring='accuracy' ,n_jobs=-1)

In [29]:
%%time
svc_best_solo = svc_clf_solo.fit(X_new, yNew)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/home/eliott/envs/RNenv/lib64/python3.10/site-packages/sklearn/svm/_base.py", line 1185, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "/home/eliott/envs/RNenv/lib64/pyt

CPU times: user 4.14 s, sys: 173 ms, total: 4.31 s
Wall time: 1min 17s


In [30]:
print("Best Score: ", svc_clf_solo.best_score_)
print("Best Params: ", svc_clf_solo.best_params_)

Best Score:  0.883981883324771
Best Params:  {'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}


### Saving Hyperparameters in JSON

In [9]:
with open('data.json') as json_file:
    dico = json.load(json_file)

In [10]:
dico['SGD'] = sgd_clf.best_params_


In [None]:
dico['RF'] = rf_clf.best_params_


In [None]:
dico['GB'] = gb_clf.best_params_

In [None]:
dico['LR'] = lr_clf.best_params_

In [None]:
dico['Perceptron'] = per_clf.best_params_

In [None]:
dico['SVC'] = svc_clf.best_params_

In [12]:
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Charging the best parameters for training

In [15]:
# Opening JSON file
with open('data.json') as json_file:
    data = json.load(json_file)


data_sgd = data['SGD']
data_rf = data['RF']
data_gb = data['GB']
data_lr = data['LR']
data_per = data['Perceptron']
data_svc = data['SVC']



### Training on all training data and testing on test data

In [16]:
from sklearn.metrics import accuracy_score, f1_score

In [17]:
def testing(model):
    predictions_train = model.predict(X_train)
    predictions_test = model.predict(X_test)
    accuracy_train = accuracy_score(y_train,predictions_train )
    accuracy_test = accuracy_score(y_test,predictions_test )
    
    print(f"train_accuracy : {accuracy_train} \ntest_accuracy : {accuracy_test}  ")




### SGD

In [73]:
model_sgd = pipeline_clf
model_sgd.set_params(**data_sgd,clf__verbose=1,clf__n_jobs=-1) # clf__max_iter=1000

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 SGDClassifier(alpha=1.5873684210526315e-05, n_jobs=-1,
                               penalty='l1', verbose=1))])

In [74]:
%%time

model_sgd.fit(X_train,y_train)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 405.71, NNZs: 5790, Bias: -1.648232, T: 41157, Avg. loss: 0.580203Norm: 359.38, NNZs: 5075, Bias: 1.019153, T: 41157, Avg. loss: 0.496203Norm: 418.90, NNZs: 5942, Bias: -1.727436, T: 41157, Avg. loss: 0.609151
Total training time: 0.05 seconds.
-- Epoch 2
Total training time: 0.06 seconds.
-- Epoch 2


Total training time: 0.05 seconds.
-- Epoch 2
Norm: 367.78, NNZs: 3565, Bias: 0.959986, T: 82314, Avg. loss: 0.236487
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 424.82, NNZs: 4057, Bias: -1.548911, T: 82314, Avg. loss: 0.272227
Total training time: 0.07 seconds.
-- Epoch 3
Norm: 412.36, NNZs: 3999, Bias: -1.368071, T: 82314, Avg. loss: 0.262424
Total training time: 0.10 seconds.
-- Epoch 3
Norm: 372.77, NNZs: 3265, Bias: 0.949299, T: 123471, Avg. loss: 0.217699
Total training time: 0.10 seconds.
-- Epoch 4
Norm: 428.20, NNZs: 3584, Bias: -1.361592, T: 123471, Avg. loss: 0.246139
Total training time: 0.09 seconds.
Norm: 416.00, NNZs: 3534, B

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.4s finished


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 SGDClassifier(alpha=1.5873684210526315e-05, n_jobs=-1,
                               penalty='l1', verbose=1))])

In [20]:
testing(model_sgd)

train_accuracy : 0.9031027528731443 
test_accuracy : 0.8612427593470248  


In [None]:
f1score = f1_score(y_test, ad_best_ss.predict(X_test_ss), average='weighted')


### Random Forest

In [93]:
model_svc.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'tfidf', 'svc_clf', 'tfidf__analyzer', 'tfidf__binary', 'tfidf__decode_error', 'tfidf__dtype', 'tfidf__encoding', 'tfidf__input', 'tfidf__lowercase', 'tfidf__max_df', 'tfidf__max_features', 'tfidf__min_df', 'tfidf__ngram_range', 'tfidf__norm', 'tfidf__preprocessor', 'tfidf__smooth_idf', 'tfidf__stop_words', 'tfidf__strip_accents', 'tfidf__sublinear_tf', 'tfidf__token_pattern', 'tfidf__tokenizer', 'tfidf__use_idf', 'tfidf__vocabulary', 'svc_clf__C', 'svc_clf__class_weight', 'svc_clf__dual', 'svc_clf__fit_intercept', 'svc_clf__intercept_scaling', 'svc_clf__loss', 'svc_clf__max_iter', 'svc_clf__multi_class', 'svc_clf__penalty', 'svc_clf__random_state', 'svc_clf__tol', 'svc_clf__verbose'])

In [80]:
model_rf = pipeline_random_forest
model_rf.set_params(**data_rf,rf_clf__verbose=1,rf_clf__n_jobs=-1) # clf__max_iter=1000

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('rf_clf',
                 RandomForestClassifier(criterion='entropy', max_depth=300,
                                        max_features='sqrt', min_samples_leaf=2,
                                        min_samples_split=32, n_jobs=-1,
                                        verbose=1, warm_start=True))])

In [81]:
%%time

model_rf.fit(X_train,y_train)

CPU times: user 669 ms, sys: 378 µs, total: 670 ms
Wall time: 681 ms


  warn(


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('rf_clf',
                 RandomForestClassifier(criterion='entropy', max_depth=300,
                                        max_features='sqrt', min_samples_leaf=2,
                                        min_samples_split=32, n_jobs=-1,
                                        verbose=1, warm_start=True))])

In [82]:
testing(model_rf)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.


train_accuracy : 0.8653934932089317 
test_accuracy : 0.6908899420747762  


[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


### Gradient Boosting 

In [85]:
model_gb = pipeline_gradient_boosting
model_gb.set_params(**data_gb,gb_clf__verbose=1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('gb_clf',
                 GradientBoostingClassifier(learning_rate=0.2, n_estimators=800,
                                            verbose=1))])

In [38]:
%%time

model_gb.fit(X_train,y_train)

CPU times: user 9min 33s, sys: 9.32 ms, total: 9min 33s
Wall time: 9min 34s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('gb_clf',
                 GradientBoostingClassifier(learning_rate=0.2,
                                            n_estimators=800))])

In [39]:
testing(model_gb)

train_accuracy : 0.9136963335520082 
test_accuracy : 0.8156924697209057  


### Logistic Regression

In [86]:
model_lr = pipeline_lr
model_lr.set_params(**data_lr,lr_clf__verbose=1,lr_clf__n_jobs=-1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('lr_clf',
                 LogisticRegression(C=10, max_iter=1000, n_jobs=-1, tol=1e-08,
                                    verbose=1))])

In [87]:
%%time

model_lr.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        92385     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.52156D+04    |proj g|=  6.00600D+03


 This problem is unconstrained.



At iterate   50    f=  1.72399D+04    |proj g|=  4.05238D+01

At iterate  100    f=  1.38656D+04    |proj g|=  1.69239D+02

At iterate  150    f=  1.31815D+04    |proj g|=  5.81206D+01

At iterate  200    f=  1.29607D+04    |proj g|=  7.76878D+01

At iterate  250    f=  1.28956D+04    |proj g|=  6.04622D+00

At iterate  300    f=  1.28799D+04    |proj g|=  3.58044D+00

At iterate  350    f=  1.28758D+04    |proj g|=  1.48195D+00

At iterate  400    f=  1.28742D+04    |proj g|=  4.92087D-01

At iterate  450    f=  1.28739D+04    |proj g|=  1.60575D+00

At iterate  500    f=  1.28738D+04    |proj g|=  1.04958D+00

At iterate  550    f=  1.28737D+04    |proj g|=  2.51328D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F   

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   14.1s finished


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('lr_clf',
                 LogisticRegression(C=10, max_iter=1000, n_jobs=-1, tol=1e-08,
                                    verbose=1))])

In [88]:
testing(model_lr)

train_accuracy : 0.959277887115193 
test_accuracy : 0.8230647709320695  


### Perceptron

In [89]:
model_per = pipeline_perceptron
model_per.set_params(**data_per,per_clf__verbose=1,per_clf__n_jobs=-1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('per_clf',
                 Perceptron(alpha=5.272631578947369e-06, n_jobs=-1,
                            penalty='l1', verbose=1))])

In [90]:
%%time

model_per.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 59.23, NNZs: 10382, Bias: 0.150000, T: 41157, Avg. loss: 0.049036
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 67.12, NNZs: 12019, Bias: -0.280000, T: 41157, Avg. loss: 0.059772Norm: 65.96, NNZs: 11652, Bias: -0.330000, T: 41157, Avg. loss: 0.056884
Total training time: 0.05 seconds.
-- Epoch 2

Total training time: 0.07 seconds.
-- Epoch 2
Norm: 79.38, NNZs: 8014, Bias: 0.140000, T: 82314, Avg. loss: 0.026856
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 88.65, NNZs: 8964, Bias: -0.300000, T: 82314, Avg. loss: 0.031492
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 86.97, NNZs: 8612, Bias: -0.360000, T: 82314, Avg. loss: 0.029703
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 96.96, NNZs: 5766, Bias: 0.150000, T: 123471, Avg. loss: 0.019641
Total training time: 0.09 seconds.
-- Epoch 4
Norm: 106.18, NNZs: 6571, Bias: -0.270000, T: 123471, Avg. loss: 0.022868
Total training time: 0.10 seconds.
-- Epoch 4
Norm: 104.62, NNZs: 

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('per_clf',
                 Perceptron(alpha=5.272631578947369e-06, n_jobs=-1,
                            penalty='l1', verbose=1))])

In [91]:
testing(model_per)

train_accuracy : 0.8576669825303107 
test_accuracy : 0.7814639283833597  


### SVC

In [94]:
model_svc = pipeline_svc
model_svc.set_params(**data_svc,svc_clf__verbose=1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('svc_clf',
                 LinearSVC(dual=False, max_iter=10000, penalty='l1',
                           verbose=1))])

In [95]:
%%time

model_svc.fit(X_train,y_train)

[LibLinear]...........*..........*....*
optimization finished, #iter = 257
Objective value = 13387.596363
#nonzeros/#features = 4675/30795
.............*...........*...*
optimization finished, #iter = 271
Objective value = 12866.620558
#nonzeros/#features = 4813/30795
...........*..........*...*.
optimization finished, #iter = 250
Objective value = 13900.984310
CPU times: user 6.94 s, sys: 13.1 ms, total: 6.95 s
Wall time: 6.92 s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('svc_clf',
                 LinearSVC(dual=False, max_iter=10000, penalty='l1',
                           verbose=1))])

#nonzeros/#features = 4850/30795


In [96]:
testing(model_svc)

train_accuracy : 0.9274728478752096 
test_accuracy : 0.8496577145866245  


### Voting Classifier

In [49]:
from sklearn.ensemble import VotingClassifier


In [97]:
clf1 = model_sgd

clf2 = model_rf

clf3 = model_gb

clf4 = model_lr

clf5 = model_per

clf6 = model_svc


eclf1 = VotingClassifier(
     estimators=[('sgd', clf1), ('rf', clf2), ('gb', clf3), ('lr', clf4), ('per', clf5), ('svc', clf6)],
     voting='hard')

eclf2 = VotingClassifier(
     estimators=[('sgd', clf1), ('lr', clf4), ('per', clf5), ('svc', clf6)],
     voting='hard')


In [98]:
%%time

eclf1.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 360.88, NNZs: 5102, Bias: 0.947182, T: 41157, Avg. loss: 0.486335
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 414.01, NNZs: 5851, Bias: -1.909098, T: 41157, Avg. loss: 0.605988
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 406.08, NNZs: 5633, Bias: -1.600516, T: 41157, Avg. loss: 0.579718
Total training time: 0.03 seconds.
Norm: 420.66, NNZs: 4001, Bias: -1.523806, T: 82314, Avg. loss: 0.268741
Total training time: 0.05 seconds.-- Epoch 2
Norm: 369.55, NNZs: 3544, Bias: 1.069173, T: 82314, Avg. loss: 0.234354
Total training time: 0.06 seconds.
-- Epoch 3

-- Epoch 3
Norm: 412.70, NNZs: 3927, Bias: -1.462631, T: 82314, Avg. loss: 0.262655
Total training time: 0.06 seconds.
-- Epoch 3
Norm: 374.69, NNZs: 3254, Bias: 1.027108, T: 123471, Avg. loss: 0.214995
Total training time: 0.09 seconds.
-- Epoch 4
Norm: 416.32, NNZs: 3547, Bias: -1.238832, T: 123471, Avg. loss: 0.238725
Total training time: 0.08 seconds.
-- Epoch 4
Norm: 424.34, NN

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished


      Iter       Train Loss   Remaining Time 
         1           1.0120            9.95m
         2           0.9888            9.90m
         3           0.9711            9.89m
         4           0.9563            9.90m
         5           0.9428            9.90m
         6           0.9318            9.88m
         7           0.9217            9.86m
         8           0.9126            9.85m
         9           0.9040            9.83m
        10           0.8965            9.82m
        20           0.8405            9.62m
        30           0.8040            9.44m
        40           0.7767            9.29m
        50           0.7543            9.14m
        60           0.7351            9.00m
        70           0.7185            8.87m
        80           0.7038            8.74m
        90           0.6908            8.62m
       100           0.6784            8.50m
       200           0.5897            7.40m
       300           0.5325            6.22m
       40

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        92385     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.52156D+04    |proj g|=  6.00600D+03


 This problem is unconstrained.



At iterate   50    f=  1.72399D+04    |proj g|=  4.05238D+01

At iterate  100    f=  1.38656D+04    |proj g|=  1.69239D+02

At iterate  150    f=  1.31815D+04    |proj g|=  5.81206D+01

At iterate  200    f=  1.29607D+04    |proj g|=  7.76878D+01

At iterate  250    f=  1.28956D+04    |proj g|=  6.04622D+00

At iterate  300    f=  1.28799D+04    |proj g|=  3.58044D+00

At iterate  350    f=  1.28758D+04    |proj g|=  1.48195D+00

At iterate  400    f=  1.28742D+04    |proj g|=  4.92087D-01

At iterate  450    f=  1.28739D+04    |proj g|=  1.60575D+00

At iterate  500    f=  1.28738D+04    |proj g|=  1.04958D+00

At iterate  550    f=  1.28737D+04    |proj g|=  2.51328D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F   

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   13.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 59.23, NNZs: 10382, Bias: 0.150000, T: 41157, Avg. loss: 0.049036
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 65.96, NNZs: 11652, Bias: -0.330000, T: 41157, Avg. loss: 0.056884
Total training time: 0.04 seconds.
-- Epoch 2
Norm: 67.12, NNZs: 12019, Bias: -0.280000, T: 41157, Avg. loss: 0.059772
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 79.38, NNZs: 8014, Bias: 0.140000, T: 82314, Avg. loss: 0.026856
Total training time: 0.07 seconds.
-- Epoch 3
Norm: 86.97, NNZs: 8612, Bias: -0.360000, T: 82314, Avg. loss: 0.029703
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 88.65, NNZs: 8964, Bias: -0.300000, T: 82314, Avg. loss: 0.031492
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 96.96, NNZs: 5766, Bias: 0.150000, T: 123471, Avg. loss: 0.019641
Total training time: 0.09 seconds.
-- Epoch 4
Norm: 104.62, NNZs: 6373, Bias: -0.300000, T: 123471, Avg. loss: 0.022644
Total training time: 0.10 seconds.
-- Epoch 4
Norm: 106.18, NNZs: 

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


[LibLinear]...........*..........*....**
optimization finished, #iter = 254
Objective value = 13387.596365
#nonzeros/#features = 4668/30795
.............*..........*...*
optimization finished, #iter = 266
Objective value = 12866.624502
#nonzeros/#features = 4818/30795
...........*..........*....*
optimization finished, #iter = 251
Objective value = 13900.999692
#nonzeros/#features = 4853/30795
CPU times: user 10min 8s, sys: 2.38 s, total: 10min 10s
Wall time: 10min 24s


VotingClassifier(estimators=[('sgd',
                              Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                              ('clf',
                                               SGDClassifier(alpha=1.5873684210526315e-05,
                                                             n_jobs=-1,
                                                             penalty='l1',
                                                             verbose=1))])),
                             ('rf',
                              Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                              ('rf_clf',
                                               RandomForestClassifier(criterion='entropy',
                                                                      max_depth=300,
                                                                      max_features='sqrt',
                                                                      min_samples_

In [99]:
testing(eclf1)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


train_accuracy : 0.937944942537114 
test_accuracy : 0.8499210110584519  


In [100]:
%%time

eclf2.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 357.60, NNZs: 4998, Bias: 0.891696, T: 41157, Avg. loss: 0.484172
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 366.71, NNZs: 3474, Bias: 0.954372, T: 82314, Avg. loss: 0.235496
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 412.60, NNZs: 5878, Bias: -1.926124, T: 41157, Avg. loss: 0.598335
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 404.48, NNZs: 5595, Bias: -1.558111, T: 41157, Avg. loss: 0.571228
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 371.66, NNZs: 3172, Bias: 1.002502, T: 123471, Avg. loss: 0.215340
Total training time: 0.05 seconds.
-- Epoch 4
Norm: 375.23, NNZs: 3087, Bias: 0.988975, T: 164628, Avg. loss: 0.207595
Total training time: 0.07 seconds.
-- Epoch 5
Norm: 419.40, NNZs: 4059, Bias: -1.519952, T: 82314, Avg. loss: 0.269418
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 377.89, NNZs: 3030, Bias: 1.001752, T: 205785, Avg. loss: 0.202337
Total training time: 0.09 seconds.
-- Epoch 6
Norm: 411.28, NNZ

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        92385     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.52156D+04    |proj g|=  6.00600D+03


 This problem is unconstrained.



At iterate   50    f=  1.72399D+04    |proj g|=  4.05238D+01

At iterate  100    f=  1.38656D+04    |proj g|=  1.69239D+02

At iterate  150    f=  1.31815D+04    |proj g|=  5.81206D+01

At iterate  200    f=  1.29607D+04    |proj g|=  7.76878D+01

At iterate  250    f=  1.28956D+04    |proj g|=  6.04622D+00

At iterate  300    f=  1.28799D+04    |proj g|=  3.58044D+00

At iterate  350    f=  1.28758D+04    |proj g|=  1.48195D+00

At iterate  400    f=  1.28742D+04    |proj g|=  4.92087D-01

At iterate  450    f=  1.28739D+04    |proj g|=  1.60575D+00

At iterate  500    f=  1.28738D+04    |proj g|=  1.04958D+00

At iterate  550    f=  1.28737D+04    |proj g|=  2.51328D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F   

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   13.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 59.23, NNZs: 10382, Bias: 0.150000, T: 41157, Avg. loss: 0.049036
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 67.12, NNZs: 12019, Bias: -0.280000, T: 41157, Avg. loss: 0.059772
Total training time: 0.03 seconds.
Norm: 65.96, NNZs: 11652, Bias: -0.330000, T: 41157, Avg. loss: 0.056884-- Epoch 2
Norm: 79.38, NNZs: 8014, Bias: 0.140000, T: 82314, Avg. loss: 0.026856
Total training time: 0.03 seconds.
-- Epoch 3

Total training time: 0.02 seconds.
-- Epoch 2
Norm: 88.65, NNZs: 8964, Bias: -0.300000, T: 82314, Avg. loss: 0.031492
Total training time: 0.05 seconds.
-- Epoch 3
Norm: 96.96, NNZs: 5766, Bias: 0.150000, T: 123471, Avg. loss: 0.019641
Total training time: 0.05 seconds.
-- Epoch 4
Norm: 112.52, NNZs: 4770, Bias: 0.130000, T: 164628, Avg. loss: 0.015376
Total training time: 0.07 seconds.
-- Epoch 5
Norm: 106.18, NNZs: 6571, Bias: -0.270000, T: 123471, Avg. loss: 0.022868
Total training time: 0.08 seconds.
-- Epoch 4
Norm: 126.18, NNZs:

VotingClassifier(estimators=[('sgd',
                              Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                              ('clf',
                                               SGDClassifier(alpha=1.5873684210526315e-05,
                                                             n_jobs=-1,
                                                             penalty='l1',
                                                             verbose=1))])),
                             ('lr',
                              Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                              ('lr_clf',
                                               LogisticRegression(C=10,
                                                                  max_iter=1000,
                                                                  n_jobs=-1,
                                                                  tol=1e-08,
                                          

In [101]:
testing(eclf2)

train_accuracy : 0.9355881138081007 
test_accuracy : 0.8522906793048973  


### Tests perso

In [57]:
tests_perso = ["I think covid is the most horrible threat we ever faced",
                "I love covid, thanks to it I can see my family much more often and I don't have to comute as much",
                "I would love to come to your birthday party, but I got covid, I have to stay confined"
]

In [58]:
predictions_out = eclf2.predict(tests_perso)


In [59]:
predictions_out

array([-1,  1,  1])