In [1]:
import numpy as np
import copy
from utils.preprocessing import *
from utils.swSets import *
from sklearn import naive_bayes, svm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from scipy.sparse import vstack
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import pickle

# Recommandation avec SVM

<b>Idée :</b> 
- Entrainer le classifieur d'épisodes sur un ensemble de séries.  
- Prendre une série hors de cet ensemble.
- Tenter de classifier ses épisodes dans les séries apprises par le classifieur.
- Renvoyer la série apparaissant le plus dans les classifications comme étant la plus proche.

# Apprentissage

In [21]:
# Directory of the training corpus, later passed to get_corpus() and getDicts().
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = "/Vrac/PLDAC_reco/data_train"

In [22]:
# Load the training texts found under `path`.
# texts_as="episodes" presumably yields one text per episode -- TODO confirm in utils.preprocessing.
corpus = get_corpus(path, texts_as="episodes")

In [23]:
# getDicts() returns two mappings keyed by show id:
#   d_info_train[id] -> mapping whose values are summed below to get the show's episode count
#   d_name_train[id] -> show name (displayed in the next cell)
d_info_train, d_name_train = getDicts(path)

In [24]:
# Display the show id -> show name mapping of the training set.
d_name_train

{0: '103___The_Dead_Zone',
 1: '104___Las_Vegas',
 2: '10___Smallville',
 3: '12___Doctor_Who',
 4: '14___Oz',
 5: '15___House',
 6: '16___Desperate_Housewives',
 7: '17___The_Shield',
 8: '1___Lost',
 9: '20___The_L_Word',
 10: '21___Criminal_Minds',
 11: '240___Breaking_Bad',
 12: '24___Scrubs',
 13: '28___Bones',
 14: '29___Gilmore_Girls',
 15: '30___Grey_s_Anatomy',
 16: '47___Psych',
 17: '51___How_I_Met_Your_Mother',
 18: '52___Entourage',
 19: '53___ER',
 20: '54___Seinfeld',
 21: '55___The_Sopranos',
 22: '57___CSI__Crime_Scene_Investigation',
 23: '5___Supernatural',
 24: '61___Robot_Chicken',
 25: '64___One_Tree_Hill',
 26: '65___The_Office_(US)',
 27: '6___Dexter',
 28: '74___Numb3rs',
 29: '77___Nip_Tuck',
 30: '78___30_Rock',
 31: '7___24',
 32: '81___Weeds',
 33: '82___Monk',
 34: '85___The_X-Files',
 35: '86___Two_and_a_Half_Men',
 36: '90___The_Closer',
 37: '91___Magnum__P.I.',
 38: '95___Sex___the_City',
 39: '96___Curb_Your_Enthusiasm'}

In [30]:
# Build the feature matrix X from the training corpus, together with the vectorizer `vec`
# that is reused later to transform the test corpus.
# Presumably TF-IDF features in a sparse matrix (per the helper's name) -- TODO confirm.
X, vec = getTfidfSparseMatVectorizer(corpus, my_stopwords=stopwords_set)

taille vocabulaire :  134627


In [31]:
# Episode count of every training show, listed in show-id order.
nb_eps_per_show = [
    sum(d_info_train[show_id].values())
    for show_id in range(0, len(d_info_train.keys()))
]
nb_shows = len(nb_eps_per_show)
# Running total of episodes: somme_cumul[k] is the number of episodes of shows 0..k.
somme_cumul = np.cumsum(nb_eps_per_show)
# labelling: repeat each show id once per episode of that show.
# assumes the rows of X are grouped by show, in show-id order -- TODO confirm against get_corpus
Y = [
    show_id
    for show_id, nb_eps in enumerate(nb_eps_per_show)
    for _ in range(nb_eps)
]

In [44]:
# Linear SVM trained on the episode features; each class is a training show id.
# random_state is pinned because LinearSVC's solver uses a pseudo-random number
# generator, so an unseeded fit can give slightly different models (and therefore
# different recommendations) from one run to the next.
lsvm_clf = svm.LinearSVC(random_state=0)
lsvm_clf.fit(X, Y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

# Recommandation

In [27]:
# Directory of the test corpus.
# Per the notebook's stated idea these shows are outside the training set -- verify the folders.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path_test = "/Vrac/PLDAC_reco/data_test"
c_test = get_corpus(path_test, texts_as="episodes")
# Transform only (no refit): the test texts go through the vectorizer built on the training corpus.
X2 = vec.transform(c_test)

In [45]:
# One predicted training-show id per row of X2.
predictions = lsvm_clf.predict(X2)

In [35]:
# Same two mappings as for the training set, built from the test directory:
#   d_info_test[id] -> mapping whose values are summed below to get the show's episode count
#   d_name_test[id] -> show name
d_info_test, d_name_test = getDicts(path_test)

In [36]:
# Episode count of every test show, listed in show-id order.
# This rebinds nb_eps_per_show, which the majority-vote cell reads afterwards.
nb_eps_per_show = [
    sum(d_info_test[show_id].values())
    for show_id in range(0, len(d_info_test.keys()))
]
nb_shows = len(nb_eps_per_show)
# Running total of episodes: somme_cumul[k] is the number of episodes of shows 0..k.
somme_cumul = np.cumsum(nb_eps_per_show)
# labelling: repeat each test-show id once per episode of that show.
# assumes the test rows are grouped by show, in show-id order -- TODO confirm against get_corpus
Y_test = [
    show_id
    for show_id, nb_eps in enumerate(nb_eps_per_show)
    for _ in range(nb_eps)
]

In [50]:
  def most_frequent(List):
      """Return the element that occurs most often in ``List``.

      When several elements share the highest count, the one whose first
      occurrence comes earliest in ``List`` is returned.

      Parameters
      ----------
      List : iterable
          Elements to vote over; they must be hashable.

      Returns
      -------
      object
          The most frequent element.

      Raises
      ------
      IndexError
          If ``List`` is empty.
      """
      # Local import keeps this cell runnable on its own.
      from collections import Counter

      # Single pass over the data instead of one List.count() scan per element.
      counts = Counter(List)
      if not counts:
          raise IndexError("most_frequent() needs at least one element")
      # Counter keeps first-seen order, so ties go to the earliest element.
      return counts.most_common(1)[0][0]

In [52]:
# Majority vote per test show: take the slice of `predictions` holding the show's
# episodes and keep the training-show id predicted most often.
# assumes d_name_test iterates its keys in the same order as the rows of `predictions` -- TODO confirm
pred = []
indmin = 0
for show_id in d_name_test.keys():
    indmax = indmin + nb_eps_per_show[show_id]
    episode_votes = list(predictions[indmin:indmax])
    pred.append(most_frequent(episode_votes))
    # The next show's episodes start where this one's stop.
    indmin = indmax

In [53]:
# Display the recommended training-show id for each test show.
pred

[25, 24, 22, 3, 14, 31, 3, 3, 24, 26, 16, 3, 3, 6, 10, 21, 35, 10, 3, 19]

In [54]:
# Print one recommendation per test show: position k in `pred` is looked up in
# d_name_test, and the voted training-show id in d_name_train.
separateur = "-----------------------------------------------------------------------\n"
for idSerie, idSerieReco in enumerate(pred):
    print(separateur)
    print("Si vous avez aimé la série "+str(d_name_test[idSerie])+"\n")
    print("\tvous aimerez sûrement la série "+str(d_name_train[idSerieReco])+"\n")

-----------------------------------------------------------------------

Si vous avez aimé la série 22___Friday_Night_Lights

	vous aimerez sûrement la série 64___One_Tree_Hill

-----------------------------------------------------------------------

Si vous avez aimé la série 31___My_Name_Is_Earl

	vous aimerez sûrement la série 61___Robot_Chicken

-----------------------------------------------------------------------

Si vous avez aimé la série 32___Veronica_Mars

	vous aimerez sûrement la série 57___CSI__Crime_Scene_Investigation

-----------------------------------------------------------------------

Si vous avez aimé la série 33___Stargate_Atlantis

	vous aimerez sûrement la série 12___Doctor_Who

-----------------------------------------------------------------------

Si vous avez aimé la série 37___The_O.C.

	vous aimerez sûrement la série 29___Gilmore_Girls

-----------------------------------------------------------------------

Si vous avez aimé la série 40___The_Unit

	vou