# Presidents

## Ouverture des données

In [153]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import re
import numpy as np 
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import os
import spacy
from collections import Counter

In [60]:
def load_data(train_file, test_file):
    train = pd.DataFrame(columns=["text", "label"])
    train_texts, train_labels = [], []
    print("Loading : train data")
    with open(train_file, "r") as f:
        for line in f:
            line = line.strip()
            m = re.match("<\d+:\d+:(\w)> (.+)", line)
            train_labels.append(1 if m.group(1)=="M" else -1 )
            train_texts.append(m.group(2))
    test = pd.DataFrame(columns=["text"])
    test_texts = []
    print("Loading : test data")
    with open(test_file, "r") as f:
        for line in f:
            line = line.strip()
            m = re.match("<.+> (.+)", line)
            test_texts.append(m.group(1))
    train["text"] = train_texts
    train["label"] = train_labels
    test["text"] = test_texts
    encoder = preprocessing.LabelEncoder()
    train["label"] = encoder.fit_transform(train["label"])
    return train, test


In [5]:
class DataSet:
    def __init__(self, train, test):
        self.train = train
        self.test = test
        self.nlp = spacy.load("fr_core_news_sm")
        pass
    def stemming(self):
        if os.path.exists("../cache/stemmed.csv"):
            stemmed = pd.read_csv("../cache/stemmed.csv")
            return stemmed
        else:
            stemmer = FrenchStemmer()
            stemmed = self.train["text"].apply(lambda x : " ".join([i.lemma_ for i in stemmer.stem(x)]))
            stemmed.to_csv (r'../cache/stemmed.csv', index = False, header=True)
            return stemmed
    def lemmatisation(self):
        if os.path.exists("../cache/lemmatized.csv"):
            lemmatized = pd.read_csv("../cache/lemmatized.csv")
            return lemmatized
        else:
            
            lemmatized = self.train["text"].apply(lambda x : " ".join([i.lemma_ for i in nlp(x)]))
            lemmatized.to_csv (r'../cache/lemmatized.csv', index = False, header=True)
            return lemmatized
    def CountVectoriser(self, NGram= (1,1) ):
        count_vect = CountVectorizer(analyzer='word', ngram_range= NGram )
        count_vect.fit(self.train['text'])
        train_count =  count_vect.transform(self.train["text"])
        return train_count
    def tf_idf(self, NGram= (1,1)):
        tfidf_vect = TfidfVectorizer(analyzer='word',ngram_range= NGram)
        tfidf_vect.fit(self.train['text'])
        train_tfidf =  tfidf_vect.transform(self.train["text"])
        test_tfidf =  tfidf_vect.transform(self.test["text"])
        return train_tfidf, test_tfidf
    def clean(sentence):
        #remplacer la ponctuation par des vides
        filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        translate_dict = {c: "" for c in filters}
        translate_map = str.maketrans(translate_dict)
        return sentence.translate(translate_map)
    def stop_words(self,mode = "lemmatized"):
        if mode = "lemmatized":
            if os.path.exists("../cache/lemmatized_free_stopwords.csv"):
                stop_words_free = pd.read_csv("../cache/lemmatized_free_stopwords.csv")
                return stop_words_free
            else:
                df = self.lemmatisation()
                stop_words_free = df.apply(lambda x : " ".join([for i in self.nlp(x) if not i.is_stop]))
                stop_words_free.to_csv (r'../cache/lemmatized_free_stopwords.csv', index = False, header=True)
                return stop_words_free
        else:
            if os.path.exists("../cache/stemmed_free_stopwords.csv"):
                stop_words_free = pd.read_csv("../cache/stemmed_free_stopwords.csv")
                return stop_words_free
            else:
                df = self.stemming()
                stop_words_free = df.apply(lambda x : " ".join([for i in self.nlp(x) if not i.is_stop]))
                stop_words_free.to_csv (r'../cache/stemmed_free_stopwords.csv', index = False, header=True)
                return stop_words_free
    def pos_tag(self):
        df = pd.DataFrame()
        

In [62]:
trainFile, testFile = "../data/corpus.tache1.learn.utf8", "../data/corpus.tache1.test.utf8"
train, test = load_data(trainFile, testFile)

Loading : train data
Loading : test data


# Vectorizer


### Lemmatisation vs Stemmization

In [11]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
linear = LinearSVC()
naiveBayes = MultinomialNB()
vector = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words=stopwords.words("french"), strip_accents='unicode')
trainFeatures = vector.fit_transform(train["text"])
ScoresLinear = cross_val_score( linear, trainFeatures, train["label"], cv=5, scoring=make_scorer(f1_score,pos_label=1))
ScoresNaive = cross_val_score( naiveBayes, trainFeatures, train["label"], cv=5, scoring=make_scorer(f1_score,pos_label=1))

  'stop_words.' % sorted(inconsistent))


In [160]:
POS_LIST = ["ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"]


In [161]:
pos_tag = pd.DataFrame()
pos_tag["label"] = train["label"]
for i in POS_LIST:
    pos_tag[i] = 0
    


In [162]:
pos_tag["length"] = train["text"].apply(lambda x : len(x.split()))

In [163]:
nlp = spacy.load("fr_core_news_sm")

In [164]:
for i, text in train.iterrows():
    tags = dict(Counter(map(lambda x : x.pos_, list(nlp(text["text"])))))
    for k, v in tags.items():
        pos_tag.at[i, k] = v
    if i % 5000 == 0:
        print(i)

0
5000


KeyboardInterrupt: 

Unnamed: 0,label,ADJ,ADP,ADV,AUX,CONJ,CCONJ,DET,INTJ,NOUN,...,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X,SPACE,length
0,0,2,3,3,1,0,1,2,0,3,...,0,6,0,3,1,0,2,0,0,21
1,0,0,2,2,1,0,0,4,0,5,...,0,1,0,3,1,0,1,0,0,16
2,0,1,7,3,3,0,3,9,0,8,...,0,13,1,4,0,0,7,0,0,49
3,0,1,2,1,1,0,0,2,0,2,...,0,0,1,1,0,0,2,0,0,12
4,0,4,8,3,1,0,0,5,0,6,...,0,2,2,11,1,0,3,0,0,33
5,0,0,0,3,0,0,0,1,0,1,...,0,1,0,1,0,0,1,0,0,8
6,0,1,0,2,1,0,0,1,0,1,...,0,0,1,1,0,0,0,0,0,6
7,0,1,3,1,1,0,1,5,0,6,...,0,2,1,3,0,0,2,0,0,23
8,0,0,1,3,2,0,0,1,0,1,...,0,1,2,3,1,0,1,0,0,13
9,0,3,0,0,1,0,0,2,0,3,...,0,0,2,2,1,0,1,0,0,13


In [154]:
dict(Counter(map(lambda x : x.pos_, list(nlp("je suis la pour l'argent")))))

{'PRON': 1, 'AUX': 1, 'DET': 2, 'ADP': 1, 'NOUN': 1}