In [1]:
# Data handling, interpreter path tweaks and regular expressions.
import pandas as pd
import sys
import re

# Make modules stored in ../lib importable from this notebook.
sys.path.insert(0, "../lib")

import os
import nltk
from nltk.parse import CoreNLPParser

# NOTE(review): NLTK_DATA is assigned after `import nltk`; NLTK presumably reads
# this variable while it is being imported, in which case the assignment does
# not change the search path of this process — confirm. The explicit append
# below adds the local data directory to the already-imported package's path.
os.environ["NLTK_DATA"] = "../"
nltk.data.path.append("../nltk_data")

In [2]:
# Comments and labels are read from two tab-separated, header-less files.
# Column 0 of each file gets a readable name, then both frames are placed
# side by side (rows are assumed to line up one-to-one — TODO confirm).
df_set = pd.read_csv(
    '../data/dataset.csv', sep="\t", header=None, encoding="utf8"
).rename(index=int, columns={0: "comment"})

df_labels = pd.read_csv(
    '../data/labels.csv', sep="\t", header=None, encoding="utf8"
).rename(index=int, columns={0: "result"})

df_raw = pd.concat([df_set, df_labels], axis=1)

<h1>fonctions de pré-traitement</h1>

In [3]:
def clean_text(text):
    """Return ``text`` with noise removed and punctuation isolated.

    Two ordered passes are applied:

    1. Regex pass: every match of the URL, e-mail address and ``<...>``
       patterns is deleted.
    2. Literal pass: double quotes are dropped, the punctuation marks
       ``. , ; ? : !`` are surrounded by spaces, and a single quote that
       touches a space is removed (the space is kept).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Regex rules run first, in this exact order, each replacing matches with "".
    removal_patterns = (
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        r'[\w\.-]+@[\w\.-]+',
        r'<.*?>',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text, flags=re.MULTILINE)

    # Plain substring substitutions, applied afterwards in this exact order.
    literal_swaps = (
        ("\"", ""),
        (".", " . "),
        (",", " , "),
        (";", " ; "),
        ("?", " ? "),
        (":", " : "),
        ("!", " ! "),
        (" '", " "),
        ("' ", " "),
    )
    for needle, substitute in literal_swaps:
        text = text.replace(needle, substitute)

    return text

In [4]:
def text_to_tags(text):
    """Tokenize ``text`` with NLTK and return its part-of-speech tagging.

    The result is whatever ``nltk.pos_tag`` yields for the tokens produced by
    ``nltk.word_tokenize`` (a sequence of ``(token, tag)`` pairs).
    """
    return nltk.pos_tag(nltk.word_tokenize(text))

In [5]:
def filter_tags(tags, ls):
    """Keep the words whose tag appears in ``ls``.

    Parameters
    ----------
    tags : iterable of (word, tag) pairs
        Tagged tokens; only positions 0 (word) and 1 (tag) are read.
    ls : iterable of str
        Tags to keep.

    Returns
    -------
    list
        Words in their original order. A word is emitted once per entry of
        ``ls`` equal to its tag, so duplicate entries in ``ls`` repeat it.
    """
    return [
        pair[0]
        for pair in tags
        for wanted in ls
        if pair[1] == wanted
    ]

In [6]:
def pre_traitement(text):
    """Clean ``text``, POS-tag it and return only the words with selected tags.

    The kept tags are ``JJ``, ``JJS``, ``JJR``, ``RBR`` and ``RBS``
    (adjective and comparative/superlative adverb tags in the Penn Treebank
    scheme).

    Returns
    -------
    list
        The retained words, in order of appearance.
    """
    kept_tags = ["JJ", "JJS", "JJR", "RBR", "RBS"]
    return filter_tags(text_to_tags(clean_text(text)), kept_tags)

In [7]:
def parse_df(df):
    """Append the pre-processing columns to a frame of comments.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"comment"`` column holding the raw text.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by three new columns:

        * ``filtered_tags`` - list of words returned by ``pre_traitement``;
        * ``text_filtered_tags`` - those words joined into one string, each
          word preceded by a single space (so the string starts with a space
          when it is not empty);
        * ``clean_comment`` - the comment after ``clean_text``.
    """
    # Read the comments by position so the function does not depend on the
    # frame carrying a 0..n-1 integer index.
    comments = df["comment"].tolist()

    filtered = [pre_traitement(text) for text in comments]

    df_temp = pd.DataFrame(
        {
            "filtered_tags": filtered,
            "text_filtered_tags": [
                "".join(" " + word for word in words) for words in filtered
            ],
            "clean_comment": [clean_text(text) for text in comments],
        },
        # concat(axis=1) aligns on index: reuse the caller's index so every new
        # value lands on the row it was computed from.
        index=df.index,
    )

    return pd.concat([df, df_temp], axis=1)

<p>Application des différents traitements</p>

In [8]:
# Run the pre-processing over the raw comments/labels frame; the result keeps
# the original columns and gains the columns added by parse_df.
data_clean = parse_df(df_raw)


<h1>Test des classifiers</h1>

<p>Import des bibliothèques</p>

In [33]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from time import time

In [34]:
def classification(clf, x, y, min_df = 0.0015, max_df=0.3, ngram_range=(1,3), is_to_array=False,
                   n_splits=10, random_state=None):
    """Vectorize ``x`` with TF-IDF, fit ``clf`` on it and prepare a K-fold splitter.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(x, y)``; it is fitted in place.
    x : iterable of str
        Documents to vectorize.
    y : array-like
        Target labels, one per document.
    min_df, max_df, ngram_range
        Forwarded unchanged to ``TfidfVectorizer``.
    is_to_array : bool, default False
        When true, the sparse TF-IDF matrix is converted with ``toarray()``
        before fitting (used in this notebook for ``GaussianNB``).
    n_splits : int, default 10
        Number of folds of the returned ``KFold``.
    random_state : int or None, default None
        Seed of the shuffled ``KFold``. Pass an integer to obtain the same
        folds on every run; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    dict
        ``{"clf": fitted estimator, "k_fold": KFold, "vectorizer": fitted
        TfidfVectorizer, "x": vectorized matrix}``.

    Notes
    -----
    The vectorizer is fitted on all of ``x``. Callers that cross-validate on
    the returned matrix therefore evaluate with a vocabulary and IDF weights
    that were computed from every row, including the held-out ones.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    x = vectorizer.fit_transform(x)
    if is_to_array:
        x = x.toarray()

    clf.fit(x, y)

    return {"clf": clf, "k_fold": k_fold, "vectorizer": vectorizer, "x": x}

In [35]:
def scoring_classification(clf, k_fold, x, y):
    """Cross-validate ``clf`` on ``(x, y)`` and print the accuracy figures.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``cross_val_score``.
    k_fold : cross-validation splitter
        Passed as ``cv``.
    x, y
        Feature matrix and labels.

    Prints the per-fold accuracies, their mean and their standard deviation.
    Returns nothing.
    """
    score = cross_val_score(clf, x, y, cv=k_fold, scoring='accuracy')

    # Report the actual number of folds rather than assuming there are 10.
    print('Les différentes accuracy pour les {} évaluations sont :'.format(len(score)))
    print(score)
    print('Accuracy moyenne :', score.mean())
    print('standard deviation', score.std())

<h2>MultinomialNB</h2>

In [36]:
# Fit MultinomialNB on the cleaned comments, then cross-validate it on the
# vectorized matrix returned by classification().
r = classification(MultinomialNB(), data_clean["clean_comment"], data_clean["result"])

clf_multi_nb, k_fold, vectorizer_multi_nb = r["clf"], r["k_fold"], r["vectorizer"]

scoring_classification(clf_multi_nb, k_fold, r["x"], data_clean["result"])


Les différentes accuracy pour les 10 évaluations sont :
[0.906 0.899 0.908 0.911 0.914 0.905 0.918 0.919 0.913 0.916]
Accuracy moyenne : 0.9109
standard deviation 0.006040695324215587


<h2>GaussianNB</h2>

In [13]:
# Fit GaussianNB on the cleaned comments; the TF-IDF matrix is converted to a
# dense array (is_to_array=True) before fitting and scoring.
r = classification(
    GaussianNB(),
    data_clean["clean_comment"],
    data_clean["result"],
    ngram_range=(1,3),
    is_to_array=True,
)

clf_gaussian, k_fold, vectorizer_gaussian = r["clf"], r["k_fold"], r["vectorizer"]

scoring_classification(clf_gaussian, k_fold, r["x"], data_clean["result"])


Les différentes accuracy pour les 10 évaluations sont :
[0.871 0.866 0.875 0.888 0.862 0.886 0.873 0.867 0.874 0.88 ]
Accuracy moyenne : 0.8741999999999999
standard deviation 0.008022468448052638


<h2>DecisionTreeClassifier</h2>

In [37]:
# Fit a decision tree on the filtered-word strings (uni- and bi-grams only),
# then cross-validate it on the vectorized matrix.
r = classification(
    DecisionTreeClassifier(random_state=0),
    data_clean["text_filtered_tags"],
    data_clean["result"],
    ngram_range=(1,2),
)

clf_tree, k_fold, vectorizer_tree = r["clf"], r["k_fold"], r["vectorizer"]

scoring_classification(clf_tree, k_fold, r["x"], data_clean["result"])


Les différentes accuracy pour les 10 évaluations sont :
[0.732 0.745 0.755 0.736 0.745 0.749 0.729 0.748 0.737 0.735]
Accuracy moyenne : 0.7411000000000001
standard deviation 0.008018104514160447


<h1>Test des classifiers sur un jeu de test</h1>

In [15]:
def average_predict_test(df_test, predicted):
    """Return the fraction of rows whose prediction equals the expected label.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Must contain a ``"result"`` column with the expected labels.
    predicted : sequence
        Predictions indexable by position (list, NumPy array, ...); entry
        ``i`` is compared with the ``i``-th row of ``df_test``.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``; ``nan`` when ``df_test`` has no rows.

    Raises
    ------
    IndexError
        If ``predicted`` holds fewer entries than ``df_test`` has rows.
    """
    # Read the labels by position so a frame with a non 0..n-1 index
    # (filtered, shuffled, re-indexed) is handled correctly.
    expected = df_test["result"].tolist()
    size = len(expected)

    # Accuracy is undefined without rows; avoid dividing by zero.
    if size == 0:
        return float("nan")

    hits = sum(1 for i, result in enumerate(expected) if predicted[i] == result)

    return float(hits) / float(size)

In [31]:
# Read the held-out set, discard its first column, name the remaining two and
# run the same pre-processing as for the training data.
df_test = (
    pd.read_csv('../data/test.csv', sep="\t", header=None, encoding="utf8")
    .drop(columns=[0])
    .rename(index=int, columns={1: "comment", 2: "result"})
    .pipe(parse_df)
)


In [38]:
# 1) Vectorize the held-out texts with the vectorizers fitted during training.
x_test_multi_nb = vectorizer_multi_nb.transform(df_test["clean_comment"])
x_test_tree = vectorizer_tree.transform(df_test["text_filtered_tags"])
#x_test_gaussian = vectorizer_gaussian.transform(df_test["clean_comment"]).toarray()

# 2) Predict with the already-fitted classifiers.
predict_multi_nb = clf_multi_nb.predict(x_test_multi_nb)
predict_tree = clf_tree.predict(x_test_tree)
#predict_gaussian = clf_gaussian.predict(x_test_gaussian)

# 3) Report the share of correct predictions per model (GaussianNB is disabled).
print("MultinomialNB:", average_predict_test(df_test, predict_multi_nb))
#print("GaussianNB:", average_predict_test(df_test, predict_gaussian))
print("DecisionTreeClassifier", average_predict_test(df_test, predict_tree))

MultinomialNB: 0.86416
DecisionTreeClassifier 0.76006


In [17]:
# Spot-check row 4000: the filtered-word string stored for it, followed by the
# full POS tagging of its raw (uncleaned) comment.
display(
    data_clean["text_filtered_tags"][4000],
    text_to_tags(data_clean["comment"][4000]),
)

' other short few short only violent many horrible like whole other good enough least x good'

[('I', 'PRP'),
 ('went', 'VBD'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('this', 'DT'),
 ('movie', 'NN'),
 ('(', '('),
 ('actually', 'RB'),
 ('I', 'PRP'),
 ('went', 'VBD'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('Family', 'NNP'),
 ('Portraits', 'NNP'),
 (',', ','),
 ('which', 'WDT'),
 ('contains', 'VBZ'),
 ('Cutting', 'VBG'),
 ('Moments', 'NNS'),
 ('+', '$'),
 ('2', 'CD'),
 ('other', 'JJ'),
 ('short', 'JJ'),
 ('films', 'NNS'),
 ('by', 'IN'),
 ('Douglas', 'NNP'),
 ('Buck', 'NNP'),
 (')', ')'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('Mar', 'NNP'),
 ('del', 'FW'),
 ('Plata', 'NNP'),
 ('festival', 'NN'),
 ('(', '('),
 ('Argentina', 'NNP'),
 (')', ')'),
 ('...', ':'),
 ('I', 'PRP'),
 ('just', 'RB'),
 ('could', 'MD'),
 ("n't", 'RB'),
 ('watch', 'VB'),
 ('it', 'PRP'),
 ('!', '.'),
 ('I', 'PRP'),
 ('had', 'VBD'),
 ('to', 'TO'),
 ('cover', 'VB'),
 ('my', 'PRP$'),
 ('eyes', 'NNS'),
 ('after', 'IN'),
 ('the', 'DT'),
 ('1st', 'CD'),
 ('half', 'NN'),
 ('of', 'IN'),
 ('Cutting', 'VBG'),
 ('Moments', 'NNS'),
 ('and', '