In [6]:
import numpy as np
from utils.preprocessing_cleaned_data import *
from utils.swSets import *
from sklearn import naive_bayes, metrics, svm
from sklearn.model_selection import train_test_split, cross_val_score

# Classifier des épisodes dans la bonne série avec SVM

## Avec 20 séries

Un premier corpus de 20 séries, pour un total de 3108 épisodes.

In [2]:
path = "/Vrac/PLDAC_reco/cleaned_data20"

d_info, d_name = getDicts(path)
d_name

{1: '10___Smallville',
 2: '122___Malcolm_In_The_Middle',
 3: '12___Doctor_Who',
 4: '132___NCIS',
 5: '15___House',
 6: '16___Desperate_Housewives',
 7: '186___American_Dad',
 8: '2381___Scandal',
 9: '24___Scrubs',
 10: '28___Bones',
 11: '30___Grey_s_Anatomy',
 12: '3103___House_of_Cards_(2013)',
 13: '32___Veronica_Mars',
 14: '366___True_Blood',
 15: '51___How_I_Met_Your_Mother',
 16: '57___CSI__Crime_Scene_Investigation',
 17: '615___The_Good_Wife',
 18: '66___Ugly_Betty',
 19: '71___The_Wire',
 20: '880___Pretty_Little_Liars'}

In [50]:
corpus = get_corpus(path, texts_as="episode")
X = getTfidfSparseMat(corpus, my_stopwords = stopwords_set)

In [51]:
#Labelisation
nb_eps_per_show = []
for i in range(1, len(d_info.keys())+1):
    nb_eps_per_show.append(sum(d_info[i].values()))
Y = []
nb_shows = len(nb_eps_per_show)
somme_cumul = np.cumsum(nb_eps_per_show)
indMin = 0
for i in range(0, nb_shows):
    indMax = somme_cumul[i]
    for j in range(indMin, indMax):
        Y.append(i+1)
    indMin = indMax

In [13]:
# SVM instanciation - Linear
lsvm_clf = svm.LinearSVC()

In [14]:
cv = 5
scores = cross_val_score(lsvm_clf, X, Y, cv=cv)
scores

array([0.99841017, 0.99680511, 0.9983871 , 1.        , 0.99675325])

In [15]:
np.mean(scores)

0.998071126045858

In [16]:
np.std(scores)

0.0012062291815240506

In [18]:
cv = 10
scores = cross_val_score(lsvm_clf, X, Y, cv=cv)
scores

array([0.99685535, 1.        , 1.        , 0.99681529, 1.        ,
       0.99678457, 1.        , 1.        , 1.        , 1.        ])

In [19]:
np.mean(scores)

0.9990455198452552

In [20]:
np.std(scores)

0.0014580789178722524

## Avec 50 séries

Un second corpus, de 50 séries, avec 5630 épisodes.

In [68]:
path = "/Vrac/PLDAC_reco/cleaned_data50"

d_info, d_name = getDicts(path)
d_name

{1: '10___Smallville',
 2: '1149___Downton_Abbey',
 3: '119___Robin_Hood',
 4: '121___Gossip_Girl',
 5: '122___Malcolm_In_The_Middle',
 6: '12___Doctor_Who',
 7: '132___NCIS',
 8: '152___Star_Trek__The_Animated_Series',
 9: '15___House',
 10: '16___Desperate_Housewives',
 11: '182___Friends',
 12: '186___American_Dad',
 13: '1___Lost',
 14: '2381___Scandal',
 15: '240___Breaking_Bad',
 16: '24___Scrubs',
 17: '28___Bones',
 18: '2964___Vikings',
 19: '2___Heroes',
 20: '30___Grey_s_Anatomy',
 21: '3103___House_of_Cards_(2013)',
 22: '32___Veronica_Mars',
 23: '334___Buffy_The_Vampire_Slayer',
 24: '364___Legend_of_the_Seeker',
 25: '366___True_Blood',
 26: '376___The_Mentalist',
 27: '384___H2O__Just_Add_Water',
 28: '3861___Orange_is_the_New_Black',
 29: '3990___Peaky_Blinders',
 30: '406___Agatha_Christie__Poirot',
 31: '4679___Outlander',
 32: '46___The_Tudors',
 33: '4___Prison_Break',
 34: '51___How_I_Met_Your_Mother',
 35: '5423___The_Last_Kingdom',
 36: '54___Seinfeld',
 37: '56

In [70]:
nb_eps = 0
for info in d_info.values():
    nb_eps += sum(list(info.values()))
        
nb_eps

5630

In [71]:
corpus = get_corpus(path, texts_as="episode")
X = getTfidfSparseMat(corpus, my_stopwords = stopwords_set)

In [72]:
#Labelisation
nb_eps_per_show = []
for i in range(1, len(d_info.keys())+1):
    nb_eps_per_show.append(sum(d_info[i].values()))
Y = []
nb_shows = len(nb_eps_per_show)
somme_cumul = np.cumsum(nb_eps_per_show)
indMin = 0
for i in range(0, nb_shows):
    indMax = somme_cumul[i]
    for j in range(indMin, indMax):
        Y.append(i+1)
    indMin = indMax

In [74]:
# SVM instanciation - Linear
lsvm_clf = svm.LinearSVC()

In [75]:
cv = 5
scores = cross_val_score(lsvm_clf, X, Y, cv=cv)
scores



array([0.9965035 , 0.99648506, 0.9991119 , 0.99731183, 0.99638336])

In [76]:
np.mean(scores)

0.9971591299953481

In [77]:
np.std(scores)

0.00103176225314876

In [79]:
cv = 10
scores = cross_val_score(lsvm_clf, X, Y, cv=cv)
scores



array([0.9948542 , 1.        , 1.        , 0.99649123, 1.        ,
       0.99821747, 0.99820144, 0.99637681, 0.9981685 , 0.99815157])

In [80]:
np.mean(scores)

0.9980461219053384

In [81]:
np.std(scores)

0.0016408881530731965

# Tentative de recommandation avec SVM

<b>Idée :</b> 
- Entrainer le classifieur d'épisodes sur un ensemble de séries.  
- Prendre une série hors de cet ensemble.
- Tenter de classifier ses épisodes dans les séries aprises par le classifieur.
- Renvoyer la série apparaissant le plus dans les classifications comme étant la plus proche.

In [82]:
path_test = "/Vrac/PLDAC_reco/cleaned_data_reco_bayes"

corpus = get_corpus(path_test, texts_as="episode")

In [83]:
d_info, d_name = getDicts(path_test)

In [84]:
nb_eps = 0
for i in range(50, 58):
    nb_eps += sum(list(d_info[i].values()))
        
nb_eps  

457

In [85]:
X = getTfidfSparseMat(corpus, my_stopwords = stopwords_set)

In [86]:
X1 = X[:-457]
X2 = X[-457:]

In [87]:
d_info, d_name = getDicts(path_test)

for i in range(50, 58):
    del d_info[i]

In [88]:
d_name

{1: '108___Mad_Men',
 2: '10___Smallville',
 3: '1149___Downton_Abbey',
 4: '119___Robin_Hood',
 5: '121___Gossip_Girl',
 6: '122___Malcolm_In_The_Middle',
 7: '12___Doctor_Who',
 8: '132___NCIS',
 9: '152___Star_Trek__The_Animated_Series',
 10: '15___House',
 11: '16___Desperate_Housewives',
 12: '182___Friends',
 13: '186___American_Dad',
 14: '1___Lost',
 15: '2381___Scandal',
 16: '240___Breaking_Bad',
 17: '24___Scrubs',
 18: '262___Only_Fools_and_Horses',
 19: '2766___Major_Crimes',
 20: '28___Bones',
 21: '2964___Vikings',
 22: '2___Heroes',
 23: '30___Grey_s_Anatomy',
 24: '3103___House_of_Cards_(2013)',
 25: '32___Veronica_Mars',
 26: '334___Buffy_The_Vampire_Slayer',
 27: '364___Legend_of_the_Seeker',
 28: '366___True_Blood',
 29: '376___The_Mentalist',
 30: '384___H2O__Just_Add_Water',
 31: '3861___Orange_is_the_New_Black',
 32: '3990___Peaky_Blinders',
 33: '406___Agatha_Christie__Poirot',
 34: '4679___Outlander',
 35: '46___The_Tudors',
 36: '48___Everybody_Hates_Chris',
 

In [89]:
#Labelisation
nb_eps_per_show = []
for i in range(1, len(d_info.keys())+1):
    nb_eps_per_show.append(sum(d_info[i].values()))
Y = []
nb_shows = len(nb_eps_per_show)
somme_cumul = np.cumsum(nb_eps_per_show)
indMin = 0
for i in range(0, nb_shows):
    indMax = somme_cumul[i]
    for j in range(indMin, indMax):
        Y.append(i+1)
    indMin = indMax

In [90]:
# Linear SVM
lsvm_clf = svm.LinearSVC()
lsvm_clf.fit(X1, Y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [92]:
predictions = lsvm_clf.predict(X2)

In [93]:
predictions

array([ 7, 48, 48, 48, 48, 48, 48,  7, 16, 48, 48,  7, 48, 48,  7, 48, 48,
       48, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 41, 41, 43, 29,
       43, 43, 43, 43, 41, 43, 43, 41, 43, 29, 29, 41, 41, 41, 43, 41, 43,
       43, 41, 41, 43, 43, 43, 43, 43, 43, 43, 43, 43, 45, 43, 43, 41, 43,
       43, 43, 31, 31, 43, 43, 17, 43, 43, 43, 43, 43, 43, 19, 41, 19, 43,
       45, 43, 43, 43, 43, 19, 38, 19, 41, 43, 41, 43, 43, 43, 41, 43, 43,
       43, 43, 41, 43, 41, 43, 41, 31, 41, 43, 43, 49, 43, 41, 31, 31, 31,
       41, 31, 31, 31, 31, 31, 31, 31, 31, 41, 41, 31, 31, 31, 31, 31, 31,
       41, 31, 31, 24, 41, 31, 31, 31, 31, 31, 31, 31, 31, 31, 41, 31, 41,
       31, 31, 31, 31, 36, 31, 31, 31, 31, 31, 31, 31, 31, 41, 31, 31, 31,
       41, 41, 31, 31, 31, 41, 40,  7, 40,  7, 40, 40,  7, 35, 40, 40, 40,
       40,  7, 21,  7, 29, 21,  7, 29,  7,  7,  7,  7, 20,  7,  7,  7, 12,
       40,  7,  7,  7,  7,  7,  7,  7,  7,  7, 41,  7,  7,  7,  7, 20,  7,
        7,  7,  7,  7, 43

In [100]:
d_info, d_name = getDicts(path_test)

In [103]:
d_info

{1: {1: 13, 2: 13, 3: 13, 4: 13, 5: 13, 6: 13, 7: 14},
 2: {1: 21, 2: 23, 3: 22, 4: 22, 5: 22, 6: 22, 7: 20, 8: 22, 9: 21, 10: 21},
 3: {1: 7, 2: 10, 3: 9, 4: 10, 5: 11, 6: 6},
 4: {1: 13, 2: 13, 3: 13},
 5: {1: 18, 2: 25, 3: 22, 4: 22, 5: 24, 6: 11},
 6: {1: 16, 2: 6, 3: 22, 4: 22, 5: 21},
 7: {1: 14, 2: 16, 3: 15, 4: 20, 5: 17, 6: 16, 7: 23, 8: 27, 9: 8},
 8: {1: 23,
  2: 23,
  3: 24,
  4: 24,
  5: 19,
  6: 25,
  7: 24,
  8: 24,
  9: 24,
  10: 24,
  11: 24,
  12: 24,
  13: 6},
 9: {1: 16, 2: 6},
 10: {1: 22, 2: 24, 3: 24, 4: 16, 5: 24, 6: 21, 7: 23, 8: 23},
 11: {1: 23, 2: 24, 3: 23, 4: 17, 5: 24, 6: 23, 7: 23, 8: 23},
 12: {1: 23, 2: 24, 3: 25, 4: 24, 5: 24, 6: 25, 7: 24, 8: 24, 9: 23, 10: 18},
 13: {1: 23,
  2: 19,
  3: 16,
  4: 20,
  5: 18,
  6: 18,
  7: 18,
  8: 19,
  9: 20,
  10: 3,
  11: 15},
 14: {1: 24, 2: 24, 3: 24, 4: 13, 5: 19, 6: 20},
 15: {1: 7, 2: 22, 3: 19, 4: 22, 5: 5},
 16: {1: 7, 2: 13, 3: 13, 4: 14, 5: 16},
 17: {1: 24, 2: 22, 3: 22, 4: 25, 5: 24, 6: 22, 7: 11, 8: 

In [110]:
cpt = 0
pred_series = []
for idSerie in range(50, 58) :
    dico_saisons = d_info[idSerie]
    pred_saisons = []
    for idSaison in dico_saisons :
        nb_ep = dico_saisons[idSaison]
        for idSerieReco in predictions[cpt:cpt+nb_ep]:
            pred_saisons.append(idSerieReco)
        
        cpt += nb_ep
    pred_series.append(pred_saisons)
        


In [113]:
import operator

l = []
for idSerie in range(0, len(pred_series)) :
    liste_series_recommandees = pred_series[idSerie]
    dico_series = dict.fromkeys(set(liste_series_recommandees), 0)
    for idSerieReco in liste_series_recommandees :
        dico_series[idSerieReco] += 1
    m = max(dico_series.items(), key=operator.itemgetter(1))[0]
    l.append(m)

l

[48, 43, 31, 7, 7, 7, 11, 12]

In [117]:
for idSerie in range(50, 58) :
    print("-----------------------------------------------------------------------\n")
    nomSerie = d_name[idSerie]
    print("Si vous avez aimé la série "+str(nomSerie)+"\n")
    idSerieReco = l[idSerie-50]
    nomSerieReco = d_name[idSerieReco]
    print("\tvous aimerez sûrement la série "+str(nomSerieReco)+"\n")

-----------------------------------------------------------------------

Si vous avez aimé la série 68___Black_Books

	vous aimerez sûrement la série 630___The_Nanny

-----------------------------------------------------------------------

Si vous avez aimé la série 6___Dexter

	vous aimerez sûrement la série 57___CSI__Crime_Scene_Investigation

-----------------------------------------------------------------------

Si vous avez aimé la série 71___The_Wire

	vous aimerez sûrement la série 3861___Orange_is_the_New_Black

-----------------------------------------------------------------------

Si vous avez aimé la série 73___Rome

	vous aimerez sûrement la série 12___Doctor_Who

-----------------------------------------------------------------------

Si vous avez aimé la série 793___The_League_of_Gentlemen

	vous aimerez sûrement la série 12___Doctor_Who

-----------------------------------------------------------------------

Si vous avez aimé la série 79___The_Office_(UK)

	vous aimer

La recommandation ne semble pas trop mauvaise. Fait amusant : on remarque que pour les séries anglaises, on recommande aussi des séries anglaises.