In [21]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.snowball import FrenchStemmer
from dataprep.eda import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
fichiertest = "../../datasources/films/allocine_bigfoot_avis.csv"
test = pd.read_csv(fichiertest)

In [4]:
fichiertrain = "../../datasources/films/allocine_inception_avis.csv"
train = pd.read_csv(fichiertrain)

In [5]:
train.head()

Unnamed: 0,Note,Description,key
0,50,après le chef doeuvre super héroïque the dark ...,0
1,50,souvent l’on peut manquer d’inspiration ou de ...,1
2,50,chef d’œuvre le film est absolument parfait ...,2
3,35,le meilleur blockbuster de 2010 a pour thème l...,3
4,50,un film aussi novateur que complexe dont la mi...,4


In [6]:
REMPLACE_SANS_ESPACE = re.compile("[;:!\'?,\"()\[\]]")
REMPLACE_AVEC_ESPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|[.]")
PUR_NOMBRE = re.compile("[0-9]")

def setClassBin(i):
    if (float(i.replace(',', '.')) > 3):
        return 1
    else:
        return 0
    
def preprocess(txt):
    txt = [PUR_NOMBRE.sub("", (str(line)).lower()) for line in txt] # retire les nombres (comme les années)
    txt = [line.replace('\n', ' ')  for line in txt] # Retire les \n (retours chariots)
    txt = [REMPLACE_SANS_ESPACE.sub("", line.lower()) for line in txt]
    txt = [REMPLACE_AVEC_ESPACE.sub(" ", line) for line in txt]
    return txt

def prepare_dataset(X):
    X['Description'] = pd.DataFrame(preprocess(X['Description']))
    french_stopwords = set(stopwords.words('french'))
    filtre_stopfr =  lambda text: [token for token in text if token.lower() not in french_stopwords]
    X['Description'] = [' '.join(filtre_stopfr(word_tokenize(item))) for item in X['Description']]
    stemmer = FrenchStemmer()
    X['Description'] = [stemmer.stem(w) for w in X['Description']]
    
    yList = [setClassBin(x) for x in X.Note]
    y = pd.DataFrame(yList)
    
    X = X.drop('Note', axis=1)
    return X, y

In [7]:
Xtrain, ytrain = prepare_dataset(train)

In [8]:
Xtest, ytest = prepare_dataset(test)

### on concatène les deux dataframe afin d'avoir a coup sur les mêmes caractéristiques (car corpus différent)

In [9]:
Xf = pd.concat([Xtrain, Xtest])
yf = pd.concat([ytrain, ytest])

## Vectorisation

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True)
cv.fit(Xf["Description"])

Xf_onehot = cv.transform(Xf["Description"])
Xtest_onehot = cv.transform(Xtest["Description"]) 

In [11]:
pd.DataFrame(Xf_onehot.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21063,21064,21065,21066,21067,21068,21069,21070,21071,21072
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
Xf_onehot.shape

(6007, 21073)

In [13]:
Xtest_onehot.shape

(7, 21073)

### Trouve le meilleur hyperparametre c (régularisation)

In [22]:
X_train, X_val, y_train, y_val = train_test_split(Xf_onehot, yf, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Précision pour C=%s: %s" % (c, accuracy_score(y_val, lr.predict(X_val))))

Précision pour C=0.01: 0.8468708388814914
Précision pour C=0.05: 0.8901464713715047
Précision pour C=0.25: 0.9027962716378163
Précision pour C=0.5: 0.9014647137150466
Précision pour C=1: 0.8981358189081226


# Entraînement du modèle

In [23]:
final_model = LogisticRegression(C=0.5)
final_model.fit(Xf_onehot, yf)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
print ("Précision: %s" % accuracy_score(ytest, final_model.predict(Xtest_onehot)))

Précision: 0.8571428571428571


# Essayons avec le film i-Robot maintenant

In [26]:
fichierirobot = "../../datasources/films/allocine_irobot_avis.csv"
irobot = pd.read_csv(fichierirobot)
irobot.head()

Unnamed: 0,Note,Description,key
0,40,un univers original dans ce film will smith jo...,0
1,40,un très bon film de science fiction lhistoir...,1
2,40,après avoir longtemps entendu parler de ce fil...,2
3,40,un film de science fiction qui emprunte beauco...,3
4,40,le dernier bébé de proyas en date inspiré dire...,4


In [27]:
Xirobot, yirobot = prepare_dataset(irobot)

In [31]:
Xf_irobot_onehot = cv.transform(Xirobot["Description"])
pd.DataFrame(Xf_irobot_onehot.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21063,21064,21065,21066,21067,21068,21069,21070,21071,21072
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
911,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
912,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
print ("Précision: %s" % accuracy_score(yirobot, final_model.predict(Xf_irobot_onehot)))

Précision: 0.7650273224043715
