In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
from utils.preprocessing_cleaned_data import *
from utils.swSets import *
from sklearn import naive_bayes, metrics
from sklearn.model_selection import train_test_split, cross_val_score

# Classifier des épisodes dans la bonne série avec Naive Bayes

## Avec 20 séries

In [None]:
path = "/Vrac/PLDAC_reco/data20" #20 TV-shows, 3108 episodes
new_path = "/Vrac/PLDAC_reco/cleaned_data20"

createCleanedData(path, new_path)

Les séries de notre corpus qui contient 20 séries pour un total de 3108 episodes:

In [None]:
path = "/Vrac/PLDAC_reco/cleaned_data20"

d_info, d_name = getDicts(path)
d_name

On construit la matrice de TF-IDF où chaque ligne représente un épisode.

In [None]:
corpus = get_corpus(path, texts_as="episode")
X = getTfidfSparseMat(corpus, my_stopwords = stopwords_set)

On labelise chaque épisode par l'identifiant de la série auquel il appartient.

In [None]:
#Labelisation
nb_eps_per_show = []
for i in range(1, len(d_info.keys())+1):
    nb_eps_per_show.append(sum(d_info[i].values()))
Y = []
nb_shows = len(nb_eps_per_show)
somme_cumul = np.cumsum(nb_eps_per_show)
indMin = 0
for i in range(0, nb_shows):
    indMax = somme_cumul[i]
    for j in range(indMin, indMax):
        Y.append(i+1)
    indMin = indMax

On instancie un classifieur bayesien naif.

In [None]:
#Naives Bayes classifier instantiation
nb_clf = naive_bayes.MultinomialNB()

On utilise la cross-validation pour évaluer le classifieur.

5 folds :

In [None]:
cv = 5
scores = cross_val_score(nb_clf, X, Y, cv=cv)
scores

Accuracy moyenne:

In [None]:
np.mean(scores)

Ecart-type:

In [None]:
np.std(scores)

10 folds :

In [None]:
cv = 10
scores = cross_val_score(nb_clf, X, Y, cv=cv)
scores

Accuracy moyenne:

In [None]:
np.mean(scores)

Ecart-type:

In [None]:
np.std(scores)

## Avec 50 séries, 5630 épisodes

In [35]:
nb_eps = 0
for info in d_info.values():
    nb_eps += sum(list(info.values()))
        
nb_eps

5630

In [36]:
path = "/Vrac/PLDAC_reco/cleaned_data50"

d_info, d_name = getDicts(path)
d_name

{1: '10___Smallville',
 2: '1149___Downton_Abbey',
 3: '119___Robin_Hood',
 4: '121___Gossip_Girl',
 5: '122___Malcolm_In_The_Middle',
 6: '12___Doctor_Who',
 7: '132___NCIS',
 8: '152___Star_Trek__The_Animated_Series',
 9: '15___House',
 10: '16___Desperate_Housewives',
 11: '182___Friends',
 12: '186___American_Dad',
 13: '1___Lost',
 14: '2381___Scandal',
 15: '240___Breaking_Bad',
 16: '24___Scrubs',
 17: '28___Bones',
 18: '2964___Vikings',
 19: '2___Heroes',
 20: '30___Grey_s_Anatomy',
 21: '3103___House_of_Cards_(2013)',
 22: '32___Veronica_Mars',
 23: '334___Buffy_The_Vampire_Slayer',
 24: '364___Legend_of_the_Seeker',
 25: '366___True_Blood',
 26: '376___The_Mentalist',
 27: '384___H2O__Just_Add_Water',
 28: '3861___Orange_is_the_New_Black',
 29: '3990___Peaky_Blinders',
 30: '406___Agatha_Christie__Poirot',
 31: '4679___Outlander',
 32: '46___The_Tudors',
 33: '4___Prison_Break',
 34: '51___How_I_Met_Your_Mother',
 35: '5423___The_Last_Kingdom',
 36: '54___Seinfeld',
 37: '56

In [37]:
corpus = get_corpus(path, texts_as="episode")
X = getTfidfSparseMat(corpus, my_stopwords = stopwords_set)

In [38]:
#Labelisation
nb_eps_per_show = []
for i in range(1, len(d_info.keys())+1):
    nb_eps_per_show.append(sum(d_info[i].values()))
Y = []
nb_shows = len(nb_eps_per_show)
somme_cumul = np.cumsum(nb_eps_per_show)
indMin = 0
for i in range(0, nb_shows):
    indMax = somme_cumul[i]
    for j in range(indMin, indMax):
        Y.append(i+1)
    indMin = indMax

In [40]:
#Naives Bayes classifier instantiation
nb_clf = naive_bayes.MultinomialNB()

In [41]:
cv = 5
scores = cross_val_score(nb_clf, X, Y, cv=cv)
scores



array([0.65734266, 0.75395431, 0.77886323, 0.79121864, 0.77034358])

In [42]:
np.mean(scores)

0.7503444828574721

In [43]:
cv = 10
scores = cross_val_score(nb_clf, X, Y, cv=cv)
scores



array([0.67581475, 0.73195876, 0.77526132, 0.79824561, 0.80884956,
       0.81105169, 0.82014388, 0.82789855, 0.82234432, 0.78373383])

In [44]:
np.mean(scores)

0.7855302287385441

In [45]:
np.std(scores)

0.045520698672970476

# Tentative de recommandation avec Naive Bayes

<b>Idée :</b> 
- Entrainer le classifieur d'épisodes sur un ensemble de séries.  
- Prendre une série hors de cet ensemble.
- Tenter de classifier ses épisodes dans les séries aprises par le classifieur.
- Renvoyer la série apparaissant le plus dans les classifications comme étant la plus proche.

In [55]:
path_test = "/Vrac/PLDAC_reco/cleaned_data_reco_bayes"

corpus = get_corpus(path_test, texts_as="episode")

In [62]:
d_info, d_name = getDicts(path_test)

In [63]:
nb_eps = 0
for i in range(50, 58):
    nb_eps += sum(list(d_info[i].values()))
        
nb_eps    

457

In [81]:
c1 = list(corpus[:-457])
c2 = list(corpus[-457:])

In [74]:
d_info, d_name = getDicts(path_test)

for i in range(50, 58):
    del d_info[i]

In [76]:
#Labelisation
nb_eps_per_show = []
for i in range(1, len(d_info.keys())+1):
    nb_eps_per_show.append(sum(d_info[i].values()))
Y = []
nb_shows = len(nb_eps_per_show)
somme_cumul = np.cumsum(nb_eps_per_show)
indMin = 0
for i in range(0, nb_shows):
    indMax = somme_cumul[i]
    for j in range(indMin, indMax):
        Y.append(i+1)
    indMin = indMax

In [83]:
X = getTfidfSparseMat(c1, my_stopwords = stopwords_set)

In [84]:
#Naives Bayes classifier instantiation
nb_clf = naive_bayes.MultinomialNB()

nb_clf.fit(X, Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [85]:
X_test_reco = getTfidfSparseMat(c2, my_stopwords = stopwords_set)

In [88]:
X

<5639x121597 sparse matrix of type '<class 'numpy.float64'>'
	with 20708046 stored elements in Compressed Sparse Row format>

In [87]:
X_test_reco

<457x31672 sparse matrix of type '<class 'numpy.float64'>'
	with 1282148 stored elements in Compressed Sparse Row format>

In [None]:
nb_clf.predict(X_test_reco)

In [None]:
#predict on test
predictions = nb_clf.predict(X_test_reco)