In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
from utils.preprocessing_cleaned_data import *
from utils.swSets import *
from sklearn import naive_bayes, metrics
from sklearn.model_selection import train_test_split, cross_val_score

# Classifier des épisodes dans la bonne série avec Naive Bayes

## Avec 20 séries

In [2]:
# One-off preprocessing step, left commented out. Presumably it was run once to
# produce the cleaned_data20 directory loaded in the next cell
# (TODO confirm what createCleanedData does).
#path = "/Vrac/PLDAC_reco/data20" #20 TV-shows, 3108 episodes
#new_dir = "/Vrac/PLDAC_reco/cleaned_data20"

#createCleanedData(path, new_dir)

Les séries de notre corpus, qui contient 20 séries pour un total de 3108 épisodes :

In [3]:
# Directory of the cleaned 20-show corpus.
path = "/Vrac/PLDAC_reco/cleaned_data20"

# getDicts is not defined in this notebook (presumably it comes from the star imports).
# d_name maps a show index to its folder name, as the displayed dict shows.
# d_info is used later as {show index: {season: episode count}} — TODO confirm.
d_info, d_name = getDicts(path)
d_name

{1: '10___Smallville',
 2: '122___Malcolm_In_The_Middle',
 3: '12___Doctor_Who',
 4: '132___NCIS',
 5: '15___House',
 6: '16___Desperate_Housewives',
 7: '186___American_Dad',
 8: '2381___Scandal',
 9: '24___Scrubs',
 10: '28___Bones',
 11: '30___Grey_s_Anatomy',
 12: '3103___House_of_Cards_(2013)',
 13: '32___Veronica_Mars',
 14: '366___True_Blood',
 15: '51___How_I_Met_Your_Mother',
 16: '57___CSI__Crime_Scene_Investigation',
 17: '615___The_Good_Wife',
 18: '66___Ugly_Betty',
 19: '71___The_Wire',
 20: '880___Pretty_Little_Liars'}

On construit la matrice de TF-IDF où chaque ligne représente un épisode.

In [4]:
# Build the corpus with one text per episode, then the TF-IDF matrix X
# (one row per episode). stopwords_set is passed as the stop-word list;
# helper semantics are assumed from their names and arguments — TODO confirm.
corpus = get_corpus(path, texts_as="episode")
X = getTfidfSparseMat(corpus, my_stopwords = stopwords_set)

On labellise chaque épisode avec l'identifiant de la série à laquelle il appartient.

In [5]:
# Labelling: build Y with one label per episode, the label being the index of
# the show (1..nb_shows) repeated once per episode of that show.
# Assumes the rows of X follow the same show order as d_info, and that the keys
# of d_info are exactly 1..nb_shows — TODO confirm against get_corpus/getDicts.
nb_shows = len(d_info)
nb_eps_per_show = [
    sum(d_info[show_id].values()) for show_id in range(1, nb_shows + 1)
]
Y = [
    show_id
    for show_id, nb_eps_show in enumerate(nb_eps_per_show, start=1)
    for _episode in range(nb_eps_show)
]

On instancie un classifieur bayésien naïf.

In [6]:
# Instantiate the naive Bayes classifier (scikit-learn MultinomialNB, default parameters)
nb_clf = naive_bayes.MultinomialNB()

On utilise la cross-validation pour évaluer le classifieur.

5 folds :

In [7]:
# 5-fold cross-validation of nb_clf on (X, Y); `scores` holds one score per fold.
cv = 5
scores = cross_val_score(nb_clf, X, Y, cv=cv)
scores

array([0.74562798, 0.76038339, 0.8016129 , 0.79902755, 0.75974026])

Accuracy moyenne:

In [8]:
# Mean of the per-fold scores
np.mean(scores)

0.773278416628773

Écart-type :

In [9]:
# Standard deviation of the per-fold scores
np.std(scores)

0.022715506705476406

10 folds :

In [10]:
# Same evaluation with 10 folds; `scores` is overwritten with the 10 per-fold scores.
cv = 10
scores = cross_val_score(nb_clf, X, Y, cv=cv)
scores

array([0.7672956 , 0.7672956 , 0.77777778, 0.77070064, 0.84076433,
       0.82315113, 0.83116883, 0.81639344, 0.79207921, 0.75827815])

Accuracy moyenne:

In [11]:
# Mean of the per-fold scores (10 folds)
np.mean(scores)

0.7944904693709065

Écart-type :

In [12]:
# Standard deviation of the per-fold scores (10 folds)
np.std(scores)

0.029033230657270728

## Avec 50 séries, 5630 épisodes

In [13]:
# One-off preprocessing step, left commented out. Presumably it was run once to
# produce the cleaned_data50 directory loaded in the next cell
# (TODO confirm what createCleanedData does).
#path = "/Vrac/PLDAC_reco/data50"
#new_dir = "/Vrac/PLDAC_reco/cleaned_data50"

#createCleanedData(path, new_dir)

In [14]:
# Directory of the cleaned 50-show corpus; `path`, `d_info` and `d_name` from the
# 20-show section are overwritten here.
path = "/Vrac/PLDAC_reco/cleaned_data50"

# d_name maps a show index to its folder name (see the displayed dict).
d_info, d_name = getDicts(path)
d_name

{1: '10___Smallville',
 2: '1149___Downton_Abbey',
 3: '119___Robin_Hood',
 4: '121___Gossip_Girl',
 5: '122___Malcolm_In_The_Middle',
 6: '12___Doctor_Who',
 7: '132___NCIS',
 8: '152___Star_Trek__The_Animated_Series',
 9: '15___House',
 10: '16___Desperate_Housewives',
 11: '182___Friends',
 12: '186___American_Dad',
 13: '1___Lost',
 14: '2381___Scandal',
 15: '240___Breaking_Bad',
 16: '24___Scrubs',
 17: '28___Bones',
 18: '2964___Vikings',
 19: '2___Heroes',
 20: '30___Grey_s_Anatomy',
 21: '3103___House_of_Cards_(2013)',
 22: '32___Veronica_Mars',
 23: '334___Buffy_The_Vampire_Slayer',
 24: '364___Legend_of_the_Seeker',
 25: '366___True_Blood',
 26: '376___The_Mentalist',
 27: '384___H2O__Just_Add_Water',
 28: '3861___Orange_is_the_New_Black',
 29: '3990___Peaky_Blinders',
 30: '406___Agatha_Christie__Poirot',
 31: '4679___Outlander',
 32: '46___The_Tudors',
 33: '4___Prison_Break',
 34: '51___How_I_Met_Your_Mother',
 35: '5423___The_Last_Kingdom',
 36: '54___Seinfeld',
 37: '56

In [15]:
# Total number of episodes in the corpus: sum the values of each show's dict
# in d_info, then sum over all shows.
nb_eps = sum(sum(show_info.values()) for show_info in d_info.values())

nb_eps

5630

In [16]:
# Rebuild the corpus (one text per episode) and the TF-IDF matrix X for the
# 50-show corpus; both names are overwritten.
corpus = get_corpus(path, texts_as="episode")
X = getTfidfSparseMat(corpus, my_stopwords = stopwords_set)

In [17]:
# Labelling: build Y with one label per episode, the label being the index of
# the show (1..nb_shows) repeated once per episode of that show.
# Assumes the rows of X follow the same show order as d_info, and that the keys
# of d_info are exactly 1..nb_shows — TODO confirm against get_corpus/getDicts.
nb_shows = len(d_info)
nb_eps_per_show = [
    sum(d_info[show_id].values()) for show_id in range(1, nb_shows + 1)
]
Y = [
    show_id
    for show_id, nb_eps_show in enumerate(nb_eps_per_show, start=1)
    for _episode in range(nb_eps_show)
]

In [18]:
# Instantiate a fresh naive Bayes classifier (scikit-learn MultinomialNB, default parameters)
nb_clf = naive_bayes.MultinomialNB()

In [19]:
# 5-fold cross-validation on the 50-show corpus; `scores` holds one score per fold.
cv = 5
scores = cross_val_score(nb_clf, X, Y, cv=cv)
scores



array([0.6451049 , 0.69859402, 0.74156306, 0.74103943, 0.71066908])

In [20]:
# Mean of the per-fold scores (5 folds, 50 shows)
np.mean(scores)

0.7073940958105228

In [21]:
# Same evaluation with 10 folds; `scores` is overwritten with the 10 per-fold scores.
cv = 10
scores = cross_val_score(nb_clf, X, Y, cv=cv)
scores



array([0.65523156, 0.67525773, 0.70383275, 0.7122807 , 0.76460177,
       0.75222816, 0.74820144, 0.75362319, 0.74542125, 0.7245841 ])

In [22]:
# Mean of the per-fold scores (10 folds, 50 shows)
np.mean(scores)

0.723526265731068

In [45]:
# Standard deviation of the per-fold scores (10 folds, 50 shows)
np.std(scores)

0.045520698672970476

L'accuracy est moins bonne avec 50 séries, mais certaines séries de ce corpus contiennent trop peu d'épisodes (par rapport au corpus de 20 séries).

# Tentative de recommandation avec Naive Bayes

<b>Idée :</b> 
- Entraîner le classifieur d'épisodes sur un ensemble de séries.  
- Prendre une série hors de cet ensemble.
- Tenter de classifier ses épisodes parmi les séries apprises par le classifieur.
- Renvoyer la série apparaissant le plus souvent dans les classifications comme étant la plus proche.

In [23]:
# One-off preprocessing step, left commented out. Presumably it was run once to
# produce the cleaned_data_reco_bayes directory loaded in the next cell
# (TODO confirm what createCleanedData does).
#path_test = "/Vrac/PLDAC_reco/data_reco_bayes"
#new_dir_test = "/Vrac/PLDAC_reco/cleaned_data_reco_bayes"

#createCleanedData(path_test, new_dir_test)

In [2]:
# Directory of the cleaned corpus used for the recommendation experiment
# (training shows plus the held-out shows).
path_test = "/Vrac/PLDAC_reco/cleaned_data_reco_bayes"

# One text per episode; `corpus` from the classification part is overwritten.
corpus = get_corpus(path_test, texts_as="episode")

In [3]:
# Load the show dictionaries for the recommendation corpus
# (d_info: per-show episode counts, d_name: show index -> folder name).
d_info, d_name = getDicts(path_test)

In [4]:
# Total number of episodes of the held-out shows (indices 50 to 57 in d_info)
nb_eps = sum(sum(d_info[show_id].values()) for show_id in range(50, 58))

nb_eps

457

In [5]:
# TF-IDF matrix over the whole recommendation corpus (training and held-out
# episodes together, so both share the same vocabulary columns).
X = getTfidfSparseMat(corpus, my_stopwords = stopwords_set)

In [6]:
# Display the sparse matrix summary (shape and number of stored elements)
X

<6096x125664 sparse matrix of type '<class 'numpy.float64'>'
	with 4371841 stored elements in Compressed Sparse Row format>

In [7]:
# Split the TF-IDF rows into the training shows (X1) and the held-out shows (X2).
# The split point is derived from nb_eps, the episode count of shows 50..57
# computed in an earlier cell (457 for this corpus), instead of repeating the
# literal: the split then stays in sync if the corpus changes.
# Assumes the held-out episodes are the last rows of X — TODO confirm against get_corpus.
nb_train = X.shape[0] - nb_eps
X1 = X[:nb_train]
X2 = X[nb_train:]

In [8]:
# Reload the dictionaries, then remove the held-out shows (indices 50 to 57)
# from d_info so that only the training shows are left for labelling.
# d_name is left complete.
d_info, d_name = getDicts(path_test)

for held_out_id in range(50, 58):
    d_info.pop(held_out_id)

In [10]:
# Show the index -> folder-name mapping. Only d_info was pruned: d_name still
# holds the held-out entries 50..57, which are looked up when printing the recommendations.
d_name

{1: '108___Mad_Men',
 2: '10___Smallville',
 3: '1149___Downton_Abbey',
 4: '119___Robin_Hood',
 5: '121___Gossip_Girl',
 6: '122___Malcolm_In_The_Middle',
 7: '12___Doctor_Who',
 8: '132___NCIS',
 9: '152___Star_Trek__The_Animated_Series',
 10: '15___House',
 11: '16___Desperate_Housewives',
 12: '182___Friends',
 13: '186___American_Dad',
 14: '1___Lost',
 15: '2381___Scandal',
 16: '240___Breaking_Bad',
 17: '24___Scrubs',
 18: '262___Only_Fools_and_Horses',
 19: '2766___Major_Crimes',
 20: '28___Bones',
 21: '2964___Vikings',
 22: '2___Heroes',
 23: '30___Grey_s_Anatomy',
 24: '3103___House_of_Cards_(2013)',
 25: '32___Veronica_Mars',
 26: '334___Buffy_The_Vampire_Slayer',
 27: '364___Legend_of_the_Seeker',
 28: '366___True_Blood',
 29: '376___The_Mentalist',
 30: '384___H2O__Just_Add_Water',
 31: '3861___Orange_is_the_New_Black',
 32: '3990___Peaky_Blinders',
 33: '406___Agatha_Christie__Poirot',
 34: '4679___Outlander',
 35: '46___The_Tudors',
 36: '48___Everybody_Hates_Chris',
 

In [9]:
# Labelling of the training episodes: build Y with one label per episode, the
# label being the index of the show (1..nb_shows) repeated once per episode.
# d_info must already have had the held-out shows removed, so that its keys are
# exactly 1..nb_shows and Y lines up with the rows of X1 used for fitting.
# Assumes the rows of X1 follow the same show order as d_info — TODO confirm.
nb_shows = len(d_info)
nb_eps_per_show = [
    sum(d_info[show_id].values()) for show_id in range(1, nb_shows + 1)
]
Y = [
    show_id
    for show_id, nb_eps_show in enumerate(nb_eps_per_show, start=1)
    for _episode in range(nb_eps_show)
]

In [11]:
# Instantiate the naive Bayes classifier (scikit-learn MultinomialNB, default parameters)
nb_clf = naive_bayes.MultinomialNB()

# Train on the training rows X1 with the show labels Y; the fitted estimator is
# the cell's displayed value.
nb_clf.fit(X1, Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Predict a training-show label for every row of X2 (assumed to be the episodes
# of the held-out shows — see the split that defines X2).
predictions = nb_clf.predict(X2)

In [13]:
# Inspect the predicted labels
predictions

array([43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 23, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 23, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8, 43, 43, 43, 43, 43, 43, 23, 43, 23, 23, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
       43, 43, 43, 43, 43

Ne fonctionne pas (biais quelque part ? ou alors l'idée est mauvaise) : à revoir.

Dans les prédictions, on ne retrouve que quelques séries. Comme ces séries ont beaucoup de saisons et d'épisodes par saison, elles aspirent les recommandations du classifieur Naive Bayes. À voir avec un SVM (discriminant) si les résultats changent.

In [14]:
# Reload the dictionaries: entries 50..57 were deleted from d_info before
# labelling, and they are needed again to group the predictions per held-out show.
d_info, d_name = getDicts(path_test)

In [15]:
# Group the predictions per held-out show: `predictions` is consumed in order,
# each show (indices 50 to 57) taking as many entries as it has episodes.
cpt = 0
pred_series = []
for idSerie in range(50, 58):
    # Episode count of the show = sum of its per-season counts
    nb_eps_serie = sum(d_info[idSerie].values())
    pred_series.append(list(predictions[cpt:cpt + nb_eps_serie]))
    cpt += nb_eps_serie
        


In [16]:
import operator

# Majority vote: for each held-out show, keep the training show that was
# predicted most often among its episodes. `l` ends up with one id per show.
l = []
for liste_series_recommandees in pred_series:
    # Vote count per candidate show, candidates taken in set iteration order
    dico_series = {
        idCandidat: liste_series_recommandees.count(idCandidat)
        for idCandidat in set(liste_series_recommandees)
    }
    serie_majoritaire, nb_votes = max(dico_series.items(), key=operator.itemgetter(1))
    l.append(serie_majoritaire)

l

[43, 43, 43, 8, 43, 43, 43, 43]

In [17]:
# Print, for each held-out show (indices 50 to 57), the training show selected
# by the majority vote; l[k] is the recommendation for the k-th held-out show.
for position, idSerie in enumerate(range(50, 58)):
    print("-----------------------------------------------------------------------\n")
    print("Si vous avez aimé la série "+str(d_name[idSerie])+"\n")
    nomSerieReco = d_name[l[position]]
    print("\tvous aimerez sûrement la série "+str(nomSerieReco)+"\n")

-----------------------------------------------------------------------

Si vous avez aimé la série 68___Black_Books

	vous aimerez sûrement la série 57___CSI__Crime_Scene_Investigation

-----------------------------------------------------------------------

Si vous avez aimé la série 6___Dexter

	vous aimerez sûrement la série 57___CSI__Crime_Scene_Investigation

-----------------------------------------------------------------------

Si vous avez aimé la série 71___The_Wire

	vous aimerez sûrement la série 57___CSI__Crime_Scene_Investigation

-----------------------------------------------------------------------

Si vous avez aimé la série 73___Rome

	vous aimerez sûrement la série 132___NCIS

-----------------------------------------------------------------------

Si vous avez aimé la série 793___The_League_of_Gentlemen

	vous aimerez sûrement la série 57___CSI__Crime_Scene_Investigation

-----------------------------------------------------------------------

Si vous avez aimé la

Apparemment, la série CSI est une très bonne recommandation de série.