imports

In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix


from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

import joblib

[nltk_data] Downloading package stopwords to C:\Users\Abdel
[nltk_data]     Alim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing function

In [26]:
stop_words = set(stopwords.words('english')) #stop words are wor
def remove_stop(x):
    return " ".join([word for word in str(x).split() if word not in stop_words])
def remove_multiple_spaces(x):
    return " ".join([word for word in str(x).split()])
def preprocessing(data):
    data = data.drop(['COMMENT_ID', 'AUTHOR', 'DATE'], axis=1)
    data['CONTENT'] = data['CONTENT'].str.lower()
    data['CONTENT'] = data['CONTENT'].str.replace(r'.com/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),])+',' ')
    data['CONTENT'] = data['CONTENT'].str.replace(r'[^\w\s]',' ')
    data['CONTENT'] = data['CONTENT'].apply(lambda x : remove_multiple_spaces(x))
    data['CONTENT'] = data['CONTENT'].apply(lambda x : remove_stop(x))
    data = data.drop_duplicates(subset='CONTENT', keep='first')
    x = data['CONTENT']
    y = data['CLASS']
    return(x,y)

hyperparameter tuning with one hidden layer

In [130]:
def one_hidden_layer(X_main,y_main,iter,noeuds_hidden_layers, alphas):
    meanAll = []
    for lay in noeuds_hidden_layers:
        accuracyvalid= []
        for alpha in alphas:
            mlp = MLPClassifier(solver="lbfgs", random_state=1,hidden_layer_sizes=[lay], alpha=alpha)
            for i in range(iter):
                X_train, X_validat, y_train, y_validat = train_test_split(X_main,y_main ,
                                test_size = 0.16,
                                shuffle = True)
                mlp.fit(X_train, y_train)
                accuracy_valid = mlp.score(X_validat,y_validat)
                accuracyvalid.append(accuracy_valid)
            
            f = np.array(accuracyvalid)
        
            a = [lay,alpha,f.mean()]
            meanAll.append(a)
    pm = np.array(meanAll)
    best_number_of_noeds = np.where(pm[:,-1] ==max(pm[:,-1]))[0]
    return meanAll[best_number_of_noeds[0]]

hyperparameter tuning with two hidden layer

In [131]:

def two_hidden_layer(X_main,y_main,iter,noeuds_hidden_layers, alphas):
    meanAll = []
    for lay1 in noeuds_hidden_layers:
        for lay2 in noeuds_hidden_layers:
            accuracyvalid= []
            for alpha in alphas:
                mlp = MLPClassifier(solver="lbfgs", random_state=1,hidden_layer_sizes=[lay1,lay2], alpha=alpha)
                for i in range(iter):
                    X_train, X_validat, y_train, y_validat = train_test_split(X_main,y_main ,
                                    test_size = 0.16,
                                    shuffle = True)
                    mlp.fit(X_train, y_train)
                    accuracy_valid = mlp.score(X_validat,y_validat)
                    accuracyvalid.append(accuracy_valid)
                
                f = np.array(accuracyvalid)
            
                a = [lay1,lay2,alpha,f.mean()]
                meanAll.append(a)
    pm = np.array(meanAll)
    best_number_of_noeds = np.where(pm[:,-1] ==max(pm[:,-1]))[0]
    return meanAll[best_number_of_noeds[0]]

hyperparameter tuning with three hidden layer

In [132]:
def three_hidden_layer(X_main,y_main,iter,noeuds_hidden_layers, alphas):
    meanAll = []
    for lay1 in noeuds_hidden_layers:
        for lay2 in noeuds_hidden_layers:
            for lay3 in noeuds_hidden_layers:
                accuracyvalid= []
                for alpha in alphas:
                    mlp = MLPClassifier(solver="lbfgs", random_state=1,hidden_layer_sizes=[lay1,lay2,lay3], alpha=alpha)
                    for i in range(iter):
                        X_train, X_validat, y_train, y_validat = train_test_split(X_main,y_main ,
                                        test_size = 0.16,
                                        shuffle = True)
                        mlp.fit(X_train, y_train)
                        accuracy_valid = mlp.score(X_validat,y_validat)
                        accuracyvalid.append(accuracy_valid)
                    
                    f = np.array(accuracyvalid)
                
                    a = [lay1,lay2,lay3,alpha,f.mean()]
                    meanAll.append(a)
    pm = np.array(meanAll)
    best_number_of_noeds = np.where(pm[:,-1] ==max(pm[:,-1]))[0]
    return meanAll[best_number_of_noeds[0]]

In [133]:
def which_hidden_layer_we_choose(X_main,y_main,best_one_hidden_layer,best_two_hidden_layer,best_three_hidden_layer):
    X_train, X_validat, y_train, y_validat = train_test_split(X_main,y_main ,
                            test_size = 0.16,
                            shuffle = True)
    
    a = best_one_hidden_layer
    b = best_two_hidden_layer
    c = best_three_hidden_layer

    modle = MLPClassifier(solver="lbfgs", random_state=1,hidden_layer_sizes=[a[0]], alpha=a[1])
    modle.fit(X_train, y_train)
    a[-1] = modle.score(X_validat,y_validat)
    modle = MLPClassifier(solver="lbfgs", random_state=1,hidden_layer_sizes=[b[0],b[1]], alpha=b[2])
    modle.fit(X_train, y_train)
    b[-1] = modle.score(X_validat,y_validat)
    modle = MLPClassifier(solver="lbfgs", random_state=1,hidden_layer_sizes=[c[0],c[1],c[2]], alpha=c[3])
    modle.fit(X_train, y_train)
    c[-1] = modle.score(X_validat,y_validat)

    list2 = [a[-1], b[-1], c[-1]]
    max1 = max(list2)
    index = list2.index(max1)
    if index == 0:
        return a
    if index == 1:
        return b
    if index == 2:
        return c
    

Main function

In [135]:
data = pd.read_csv('Youtube01-Psy.csv')
x, y = preprocessing(data)

vectorizer = CountVectorizer( )
featuers = vectorizer.fit_transform(x)


X_main, X_test, y_main, y_test = train_test_split(featuers,y ,
                                   test_size = 0.2, 
                                   shuffle = True)

noeuds_hidden_layers = [1,3,5,10,15,20]
alphas = [0.001,0.01,0.5]
iter = 100


X_train, X_validat, y_train, y_validat = train_test_split(X_main,y_main ,
                            test_size = 0.16,
                            shuffle = True)

a = one_hidden_layer(X_main,y_main,iter,noeuds_hidden_layers, alphas)
b = two_hidden_layer(X_main,y_main,iter,noeuds_hidden_layers, alphas)
c = three_hidden_layer(X_main,y_main,iter,noeuds_hidden_layers, alphas)

best = which_hidden_layer_we_choose(X_main, y_main, a, b, c)
print(best[:-1])
modle = MLPClassifier(solver="lbfgs", random_state=1,hidden_layer_sizes=best[:-2], alpha=best[-2])
modle.fit(X_train, y_train)
print("accuracy test", modle.score(X_test,y_test))
pred_test = modle.predict(X_test)
print(confusion_matrix(y_test,pred_test))




[10, 0.001, 0.9636363636363636]
[15, 10, 0.001, 0.9818181818181818]
[10, 3, 10, 0.001, 0.9818181818181818]
[15, 10, 0.001]
accuracy test 0.9117647058823529
[[32  5]
 [ 1 30]]


In [139]:
#save the model
modle = joblib.load('myModel.joblib')

FileNotFoundError: [Errno 2] No such file or directory: 'myModel.joblib'