### Import libraries

In [42]:
import glob
import os

import numpy as np
import pandas as pd

### Read data

In [43]:
path = "inputs/"

# glob.glob returns matches in arbitrary, OS-dependent order. Sorting fixes the
# row order of the combined frame, which the seeded train/test split performed
# later in this notebook depends on to be reproducible.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Fail with an explicit message instead of pd.concat's
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Read all videos
li = []
for filename in all_files:
    df_video = pd.read_csv(filename)
    # Parse the video name from the base name only, so that a '-' or '.' in the
    # directory part of the path cannot corrupt it.
    # Assumes file names look like "<prefix>-<video>.csv" -- TODO confirm.
    df_video["VIDEO"] = os.path.basename(filename).split('-')[1].split('.')[0]
    li.append(df_video)

# Create one unique dataframe (rows renumbered 0..n-1)
df = pd.concat(li, axis=0, ignore_index=True)

# Move column "VIDEO" to the first position
first_column = df.pop('VIDEO')
df.insert(0, 'VIDEO', first_column)

### Training

In [44]:
# Train/Test split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
# Inputs are the "CONTENT" column, targets are the "CLASS" column.
X_train, X_test, y_train, y_test = train_test_split(
    df["CONTENT"], df["CLASS"], test_size=0.2, random_state=42
)

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

# Two-step pipeline: token-count features fed into a seeded random forest.
# make_pipeline names each step after its lowercased class name
# ('countvectorizer', 'randomforestclassifier').
vectorizer = CountVectorizer()
classifier = RandomForestClassifier(random_state=42)
pipe = make_pipeline(vectorizer, classifier)

# Fit on the training split; as the last expression this also displays the pipeline.
pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=42))])

### Evaluation

In [46]:
# Score the fitted pipeline on the held-out test split.
pipe.score(X=X_test, y=y_test)

0.9566326530612245

In [47]:
# Inference

# Predict each hand-written comment on its own and print the predicted class array.
for comment in (
    "hey subscribe to my youtube channel please ! have fun",
    "Wow he is the best!",
):
    print(pipe.predict([comment]))

[1]
[0]


#### Cross validation

In [48]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of the pipeline on the training split only.
scores = cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=3)

# Print the per-fold scores, then their mean, median and standard deviation.
for summary in (scores, scores.mean(), np.median(scores), scores.std()):
    print(summary)

[0.94827586 0.96353167 0.9462572 ]
0.9526882432104484
0.9482758620689655
0.007711622297193711


### Tuning hyperparameters

In [49]:
# Hyperparameter grid for the pipeline. Each key is "<step name>__<parameter>",
# and each value lists the candidates to try for that parameter.
parameters = dict(
    # Vocabulary size cap (None = unlimited).
    countvectorizer__max_features=(None, 1000, 2000),
    # Unigrams only, or unigrams plus bigrams.
    countvectorizer__ngram_range=((1, 1), (1, 2)),
    # Keep all tokens, or drop the built-in English stop words.
    countvectorizer__stop_words=(None, 'english'),
    # Accent handling applied during preprocessing.
    countvectorizer__strip_accents=(None, 'ascii', 'unicode'),
    # Number of trees in the forest.
    randomforestclassifier__n_estimators=(50, 100, 200),
)


In [50]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `parameters` with 3-fold cross-validation.
# n_jobs=-1 uses all available CPU cores.
search_model = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)

In [51]:
# Run the grid search on the training split.
search_model.fit(X_train, y_train)

# Report the best cross-validated score, then the pipeline configuration that achieved it.
print(search_model.best_score_, search_model.best_estimator_, sep="\n")

0.9584402722929429
Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, ngram_range=(1, 2),
                                 stop_words='english',
                                 strip_accents='unicode')),
                ('randomforestclassifier',
                 RandomForestClassifier(n_estimators=50, random_state=42))])


### Saving the model

In [54]:
import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs("models", exist_ok=True)

# Persist the best pipeline found by the grid search. As the last expression,
# this also displays the list of file names that were written.
joblib.dump(search_model.best_estimator_, "models/spam_detector_model.pkl")

['models/spam_detector_model.pkl']

### Loading the model

In [55]:
# Reload the persisted pipeline from disk.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
loaded_model = joblib.load(filename="models/spam_detector_model.pkl")

In [58]:
# New inference

# Check the reloaded pipeline on two hand-written comments, one prediction per line.
for comment in (
    "hey guys, come to see my video on my youtube channel please",
    "Thank you for this video",
):
    print(loaded_model.predict([comment]))

[1]
[0]
