In [1]:
from os import listdir 
import numpy as np
import pandas as pd
import scipy as sp
import regex as re 
from scipy.spatial import Voronoi, voronoi_plot_2d
import seaborn as sb
import nltk
import heapq
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 

## chargement des données 

In [2]:
def build_df(set='train', lang='en'):
    '''
    Fonction pour lire tous les fichiers dans un dataframe.
    input :
        set : Le set à construire - 'train' ou 'test'.
        lang : La langue du set à construire - 'en' ou 'fr'.
    '''
    res = pd.DataFrame(columns=['content', 'label'])
    
    if lang=='en':
        path = 'data/movie-reviews-en/'+set
    elif lang=='fr':
        path = 'data/movie-reviews-fr/'+set
    pos_path = path+'/pos/'
    neg_path = path+'/neg/'
        
    files = sorted(listdir(pos_path))
    for file in files:
        with open(pos_path+file, 'r') as f:
            df = pd.DataFrame({'content':[f.read()], 'label':['pos']})
            res = res.append(df, ignore_index=True)

    files = sorted(listdir(neg_path))
    for file in files:
        with open(neg_path+file, 'r') as f:
            df = pd.DataFrame({'content':[f.read()], 'label':['neg']})
            res = res.append(df, ignore_index=True)
    
    return res

In [3]:
a_train = build_df(set='train', lang='en')
a_test = build_df(set='test', lang='en')

In [4]:
a_train

Unnamed: 0,content,label
0,films adapted from comic books have had plenty...,pos
1,every now and then a movie comes along from a ...,pos
2,you've got mail works alot better than it dese...,pos
3,""" jaws "" is a rare film that grabs your atten...",pos
4,moviemaking is a lot like being the general ma...,pos
...,...,...
1795,2 days in the valley is more or less a pulp fi...,neg
1796,what would inspire someone who cannot write or...,neg
1797,synopsis : a novelist struggling with his late...,neg
1798,"okay , okay . \nmaybe i wasn't in the mood to ...",neg


In [5]:
a_test

Unnamed: 0,content,label
0,"in 1912 , a ship set sail on her maiden voyage...",pos
1,the start of this movie reminded me of parts f...,pos
2,note : some may consider portions of the follo...,pos
3,robert altman's cookie's fortune is that rare ...,pos
4,well i'll be damned . . . \nthe canadians can ...,pos
...,...,...
195,"if anything , "" stigmata "" should be taken as ...",neg
196,"john boorman's "" zardoz "" is a goofy cinematic...",neg
197,the kids in the hall are an acquired taste . \...,neg
198,there was a time when john carpenter was a gre...,neg


## 1. Test avec scikitlearn

In [6]:
vectorizer = CountVectorizer()
 
counts = vectorizer.fit_transform(a_train['content'].values)

In [7]:
classifier = MultinomialNB()
targets = a_train['label'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
test = vectorizer.transform(a_test['content'])
predictions = classifier.predict(test)

In [9]:
predictions

array(['pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg',
       'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos',
       'pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos',
       'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg',
       'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos',
       'pos', 'neg', 'pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg',
       'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'pos', 'neg',

In [10]:
a=0
for i in range(len(predictions)):
    if(a_test['label'][i]==predictions[i]):
        a+=1 

b = a/len(predictions)*100.0
print(b)

83.0


## 2. Implémentation du code

### A. Le compte des mots 

In [11]:
def freqMots(df):
    L = []
    dic = {}
    for e in df.itertuples():
        L.append(e[1])
    for i in range(len(df)): #traitement de textes 
        L[i] = L[i].lower() #lettres en micuscules
        L[i] = re.sub(r'\W',' ',L[i]) #suppression de la ponctuation 
        L[i] = re.sub(r'\s+',' ',L[i]) #suppresion de espaces blancs
    for phr in L: #création du dictionnaire
        mots = nltk.word_tokenize(phr)
        for mot in mots:
            if mot not in dic.keys():
                dic[mot] = 1
            else:
                dic[mot] += 1
    return dic

In [12]:
dic = freqMots(a_train) #dictionnaire qu'on pourra utiliser comme vocabulaire

In [13]:
a_train_pos = a_train[a_train['label']=='pos'] #documents de la classe pos 
a_train_neg = a_train[a_train['label']=='neg'] #documents de la classe neg 

In [14]:
dic_pos = freqMots(a_train_pos) #mots de la classe pos 
dic_neg = freqMots(a_train_neg) #mots de la classe neg

In [15]:
nbr_mots_pos=0 #nombre de mots total dans la classe pos
for valeur in dic_pos.values():
    nbr_mots_pos+=valeur
    
nbr_mots_neg=0 #nombre de mots total dans la classe neg
for valeur in dic_neg.values():
    nbr_mots_neg+=valeur

### B. Calcul des probabilités

In [16]:
def conditionnal_prob(dic):
    """
        fonction qui va calculer les probabilités conditionnelles de chaque mot selon la classe dont elle appartient
    """
    dpos_prob = {} #dictionnaire qui contiendra les probabilités du vocabulaire pour la classe pos
    dneg_prob = {} #dictionnaire qui contiendra les probabilités du vocabulaire pour la classe neg
    a = 0
    for cle, valeur in dic.items():
        if cle in dic_pos:
            a = (dic_pos[cle]+1.0)/(nbr_mots_pos+len(dic)) #utilisation de la formule 
            dpos_prob[cle]=a
        else: 
            a = 1.0/(nbr_mots_pos+len(dic))
            dpos_prob[cle]=a
        if cle in dic_neg:
            a = (dic_neg[cle]+1.0)/(nbr_mots_neg+len(dic))
            dneg_prob[cle]=a
        else: 
            a = 1.0/(nbr_mots_neg+len(dic))
            dneg_prob[cle]=a
    return dpos_prob, dneg_prob 

In [17]:
dpos_prob, dneg_prob = conditionnal_prob(dic)

In [18]:
print(dpos_prob)



In [19]:
print(dneg_prob)



### C. Choix de la classe

In [22]:
def classe(df): 
    """
        fonction qui va déterminer selon la base d'apprentissage si un document de la base de test
        est de la classe pos ou neg
    """
    L=[]
    Lp = []
    Ln = []
    l = []
    for e in df.itertuples():
        L.append(e[1])
    for i in range(len(df)): #traitement de textes 
        L[i] = L[i].lower() #lettres en micuscules
        L[i] = re.sub(r'\W',' ',L[i]) #suppression de la ponctuation 
        L[i] = re.sub(r'\s+',' ',L[i]) #suppresion de espaces blancs
        #compte des mots 
        mots = L[i].split()
        compte = {}.fromkeys(set(mots),0)
        for valeur in mots:
            compte[valeur] += 1
        #utilisation de la formule
        pp = np.log(0.5) #probabilité d'appartenir à la classe pos
        pn = 0.5 #probabilité d'appartenir à la classe pos
        for cle in compte.keys():
            if cle in dic: 
                pp+=np.log(dpos_prob[cle]**(compte[cle]))
            if cle in dic: 
                pn+=np.log(dneg_prob[cle]**(compte[cle]))
        Lp.append(pp)
        Ln.append(pn)
    for i in range(len(Lp)): 
        if(Lp[i]>Ln[i]):
            l.append('pos')
        else:
            l.append('neg')
    return l

In [23]:
pred = classe(a_test)

In [24]:
a=0
for i in range(len(pred)):
    if(a_test['label'][i]==pred[i]):
        a+=1 

b = a/len(predictions)*100.0
print('pourcentage de bonnes prédicitons : ', b,'%')

pourcentage de bonnes prédicitons :  82.5 %
