In [None]:
from os import chdir
from google.colab import drive

drive.mount("/content/drive", force_remount=True)
chdir("/content/drive/MyDrive/Eliott/files/")

!pip install tweet-preprocessor

Mounted at /content/drive


In [None]:
# IMPORTS

import numpy as np
import pandas as pd
import preprocessing as pp
import json


from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline


%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
file_name_train = 'Corona_NLP_train.csv'
file_name_test = 'Corona_NLP_test.csv'
X_train, y_train = pp.prepare_dataframe(file_name_train,lemmatising=False)
X_test, y_test = pp.prepare_dataframe(file_name_test,lemmatising=False)

On enlève : 
- Les URLS
- Hashtags
- Mentions
- Mots réservés
- Emojis et smileys

- 35525 mots sans lemmatisation
- 30794 avec lemmatisation

# Recherche d'Hyperparamètres

In [None]:
pipeline_sgd = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', SGDClassifier()),
])

pipeline_gb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', GradientBoostingClassifier()),
])

pipeline_rf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier()),
])

pipeline_lr = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression(max_iter=1000)),
])

pipeline_per = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', Perceptron()),
])

pipeline_svc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC(max_iter=10000)),
])


In [None]:
models_and_params = {
    "SGD" : { "model" : pipeline_sgd,
              "params" : {
                    'clf__penalty': ['l2', 'l1', 'elasticnet'],
                    'clf__alpha': np.linspace(1e-10, 1e-4, 40),  
                        }
    },
    "GB" : { "model" : pipeline_gb,
              "params" : {
                    "clf__learning_rate": [0.1, 0.2,0.4,0.8],
                    "clf__n_estimators":[1600]#,3200]
                        }
    },
    "RF" : { "model" : pipeline_rf,
              "params" : {
                    'clf__n_estimators': [100, 200],   
                    'clf__max_depth': [300,600,None],
                    'clf__min_samples_leaf': [1, 2, 3],   
                    'clf__min_samples_split': [16, 32, 64],
                    'clf__max_features': ['log2', 'sqrt'],
                    'clf__criterion': ['gini', 'entropy']
                        }
    },
    "LR" : { "model" : pipeline_lr,
              "params" : {
                    'clf__C': [20,10, 1],
                    'clf__tol': np.linspace(1e-12,1e-6,20)
                        }
    },
    "PER" : { "model" : pipeline_per,
              "params" : {
                    'clf__penalty': ['l2', 'l1', 'elasticnet'],
                    'clf__alpha': np.linspace(1e-8, 1e-4, 100),
                        }
    },
    "SVC" : { "model" : pipeline_svc,
              "params" : {
                    'clf__penalty': ['l2', 'l1', 'elasticnet'],
                    'clf__loss': ['hinge', 'squared_hinge'],
                    'clf__dual' : [False,True]
                        }
    }
    

}

In [None]:
def grid_Search(model_name,subset=-1):

    model = models_and_params[model_name]["model"]
    parameters = models_and_params[model_name]["params"]

    grid_clf = GridSearchCV(model, parameters,  scoring='accuracy', verbose=1 ,n_jobs=-1)
    
    if subset==-1:
        grid_clf.fit(X_train, y_train)
    else:
        grid_clf.fit(X_train[:subset], y_train[:subset])


    print("Best Score: ", grid_clf.best_score_)
    print("Best Params: ", grid_clf.best_params_)

    return grid_clf


### Saving Hyperparameters in JSON

We will first load old weights and updates only if needed. Then we can use the model with best params in the end.

In [None]:
with open('data.json') as json_file:
    dico = json.load(json_file)

### SGD

In [None]:
%%time
grid_sgd = grid_Search("SGD") 

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best Score:  0.8807008312810172
Best Params:  {'clf__alpha': 1.28206e-05, 'clf__penalty': 'l1'}
CPU times: user 29 s, sys: 4.25 s, total: 33.2 s
Wall time: 7min 37s


In [None]:
dico['SGD'] = grid_sgd.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Gradient boosting

In [None]:
%%time
grid_gb = grid_Search("GB") 

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Score:  0.8391769441865644
Best Params:  {'clf__learning_rate': 0.4, 'clf__n_estimators': 1600}
CPU times: user 33min 38s, sys: 7.7 s, total: 33min 46s
Wall time: 3h 40min 28s


In [None]:
dico['GB'] = grid_gb.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Random Forest

In [None]:
%%time
grid_rf = grid_Search("RF") 

Fitting 5 folds for each of 216 candidates, totalling 1080 fits




Best Score:  0.7047644196092446
Best Params:  {'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 32, 'clf__n_estimators': 200}
CPU times: user 2min 29s, sys: 14 s, total: 2min 43s
Wall time: 2h 30min 23s


In [None]:
dico['RF'] = grid_rf.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)


### Logistic Regression

In [None]:
%%time
grid_lr = grid_Search("LR") 

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Score:  0.8230677619122002
Best Params:  {'clf__C': 10, 'clf__tol': 1e-12}
CPU times: user 1min 20s, sys: 2min 15s, total: 3min 35s
Wall time: 36min 42s


In [None]:
dico['LR'] = grid_lr.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Perceptron

In [None]:
%%time
grid_per = grid_Search("PER") 

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Best Score:  0.818353925564533
Best Params:  {'clf__alpha': 3.04e-06, 'clf__penalty': 'l1'}
CPU times: user 1min 10s, sys: 7.51 s, total: 1min 17s
Wall time: 16min 16s


In [None]:
dico['Perceptron'] = grid_per.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

#### Observation 

La regression logistique n'est qu'un perceptron avec une sigmoid en fonction d'activation.
On voit que la Regression Logistique a de meilleures performances à l'issue de la recherche d'hyperparamètres mais pas de loin. Par ailleurs le temps d'entrainement est considérablement plus élevé pour la regression logistique (du au calcul de l'exponentiel). Nous verrons par la suite quel modèle il est préférable de conserver. 

### LinearSVC

In [None]:
%%time
grid_svc = grid_Search("SVC") 

Fitting 5 folds for each of 12 candidates, totalling 60 fits


40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/svm/_classes.py", line 272, in fit
    sample_weight=sample_weight,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/svm/_base.py", line 1185, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_c

Best Score:  0.8689408679104181
Best Params:  {'clf__dual': False, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l1'}
CPU times: user 11.8 s, sys: 273 ms, total: 12.1 s
Wall time: 1min 7s


In [None]:
dico['SVC'] = grid_svc.best_params_
with open('data.json', 'w') as fp:
    json.dump(dico, fp,  indent=4)

### Charging the best parameters for training

In [None]:
# Opening JSON file
with open('data.json') as json_file:
    data = json.load(json_file)


data_sgd = data['SGD']
data_rf = data['RF']
data_gb = data['GB']
data_lr = data['LR']
data_per = data['Perceptron']
data_svc = data['SVC']



### Training on all training data and testing on test data

In [None]:
def testing(model):
    predictions_train = model.predict(X_train)
    predictions_test = model.predict(X_test)
    accuracy_train = accuracy_score(y_train,predictions_train )
    accuracy_test = accuracy_score(y_test,predictions_test )
    
    print(f"train_accuracy : {accuracy_train} \ntest_accuracy : {accuracy_test}  ")




### SGD

In [None]:
model_sgd_best = pipeline_sgd
model_sgd_best.set_params(**data_sgd,clf__n_jobs=-1) 

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 SGDClassifier(alpha=1.28206e-05, n_jobs=-1, penalty='l1'))])

In [None]:
%%time

"""
Si on a fait la recherche avant, on prend le best modèle,

Sinon, on reconstruit le modèle avec les best paramètres et on le fit à toutes les données d'entrainement. 

"""

model_sgd_best.fit(X_train,y_train)


CPU times: user 2.38 s, sys: 292 ms, total: 2.67 s
Wall time: 1.67 s


In [None]:
testing(model_sgd_best)

train_accuracy : 0.9327453410112496 
test_accuracy : 0.8773038441284887  


### Random Forest

In [None]:
model_rf_best = pipeline_rf
model_rf_best.set_params(**data_rf,clf__n_jobs=-1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 RandomForestClassifier(criterion='entropy',
                                        max_features='log2',
                                        min_samples_split=32, n_estimators=200,
                                        n_jobs=-1))])

In [None]:
%%time

model_rf_best.fit(X_train,y_train)

CPU times: user 1min 22s, sys: 1.39 s, total: 1min 24s
Wall time: 22.4 s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 RandomForestClassifier(criterion='entropy',
                                        max_features='log2',
                                        min_samples_split=32, n_estimators=200,
                                        n_jobs=-1))])

In [None]:
testing(model_rf_best)

train_accuracy : 0.996039555847122 
test_accuracy : 0.7074776197998947  


### Gradient Boosting 

In [None]:
model_gb_best = pipeline_gb
model_gb_best.set_params(**data_gb)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 GradientBoostingClassifier(learning_rate=0.4,
                                            n_estimators=1600))])

In [61]:
%%time
model_gb_best.fit(X_train,y_train)

CPU times: user 33min 41s, sys: 3.8 s, total: 33min 45s
Wall time: 33min 40s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 GradientBoostingClassifier(learning_rate=0.4,
                                            n_estimators=1600))])

In [62]:
testing(model_gb_best)

train_accuracy : 0.9979104405082975 
test_accuracy : 0.8462348604528699  


### Logistic Regression

In [63]:
model_lr_best = pipeline_lr
model_lr_best.set_params(**data_lr,clf__n_jobs=-1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 LogisticRegression(C=10, max_iter=1000, n_jobs=-1,
                                    tol=1e-12))])

In [64]:
%%time

model_lr_best.fit(X_train,y_train)

CPU times: user 1.21 s, sys: 188 ms, total: 1.4 s
Wall time: 24.2 s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 LogisticRegression(C=10, max_iter=1000, n_jobs=-1,
                                    tol=1e-12))])

In [65]:
testing(model_lr_best)

train_accuracy : 0.9755813105911509 
test_accuracy : 0.833070036861506  


### Perceptron

In [66]:
model_per_best = pipeline_per
model_per_best.set_params(**data_per,clf__n_jobs=-1)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', Perceptron(alpha=3.04e-06, n_jobs=-1, penalty='l1'))])

In [67]:
%%time
model_per_best.fit(X_train,y_train)

CPU times: user 1.92 s, sys: 230 ms, total: 2.15 s
Wall time: 1.55 s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', Perceptron(alpha=3.04e-06, n_jobs=-1, penalty='l1'))])

In [68]:
testing(model_per_best)

train_accuracy : 0.941662414656073 
test_accuracy : 0.8217482885729331  


### SVC

In [69]:
model_svc_best = pipeline_svc
model_svc_best.set_params(**data_svc)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', LinearSVC(dual=False, max_iter=10000, penalty='l1'))])

In [70]:
%%time

model_svc_best.fit(X_train,y_train)

CPU times: user 9.03 s, sys: 16.8 ms, total: 9.05 s
Wall time: 9.03 s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', LinearSVC(dual=False, max_iter=10000, penalty='l1'))])

In [71]:
testing(model_svc_best)

train_accuracy : 0.9481983623684914 
test_accuracy : 0.8730911005792522  


### Voting Classifier

In [72]:
from sklearn.ensemble import VotingClassifier


In [73]:
clf1 = model_sgd_best

clf2 = model_rf_best

clf3 = model_gb_best

clf4 = model_lr_best

clf5 = model_per_best

clf6 = model_svc_best


eclf1 = VotingClassifier(
     estimators=[('sgd', clf1), ('rf', clf2), ('gb', clf3), ('lr', clf4), ('per', clf5), ('svc', clf6)],
     voting='hard')

eclf2 = VotingClassifier(
     estimators=[('sgd', clf1), ('lr', clf4), ('per', clf5), ('svc', clf6)],
     voting='hard')


In [None]:
%%time

eclf1.fit(X_train,y_train)

In [None]:
testing(eclf1)

In [None]:
%%time

eclf2.fit(X_train,y_train)

In [None]:
testing(eclf2)

### Tests perso

In [None]:
tests_perso = ["I think covid is the most horrible threat we ever faced",
                "I love covid, thanks to it I can see my family much more often and I don't have to comute as much",
                "I would love to come to your birthday party, but I got covid, I have to stay confined"
]

In [None]:
predictions_out = eclf2.predict(tests_perso)


In [None]:
predictions_out