In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix, classification_report

In [7]:
df = pd.read_csv("./../preprocessed/reponses_annotees_multilabel.csv", sep="\t")

In [153]:
df.head()

Unnamed: 0.1,Unnamed: 0,reference,corpus,categories
0,0,1-100,Démocratie = citoyen qui commande Le peuple Ou...,"Société civile, citoyens"
1,1,1-10002,Mettre en place le RIC moi même Non Une bonn...,A Moi-même ou personne
2,2,1-10004,Tout vient de l éducation Représentation des f...,Q Proportionnelle
3,3,1-10016,Transparence Maire Non Plus d'échanges Une bo...,Maires et communes
4,4,1-10017,REVOIR LA CONSTITUTION DE 1958 PERSONNE Non No...,A Aucun élu; A Moi-même ou personne


In [38]:
ohe = OneHotEncoder()
le = LabelEncoder()

In [47]:
def tokenize(s):
    return [elt.strip() for elt in s.split(";")]

In [49]:
all_labels = list(set(itertools.chain.from_iterable(df.categories.apply(lambda x: tokenize(x)))))                  

['A Aucun élu',
 'Exemplarité, honnêteté des élus',
 'W Proximité',
 'Société civile, citoyens',
 'Plus proche des gens',
 'Démocratie directe (referendum, consultation)',
 'Application loi 1905',
 'Elus locaux',
 'Compter comme exprimé',
 "Annuler l'élection si fort",
 'Plus de transparence',
 'Associations, ONG',
 'Q Dose de proportionnelle',
 'Maires et communes',
 'A Moi-même ou personne',
 'Dialogue, débat, concertation',
 'Q Proportionnelle',
 'Tous les élus',
 'Députés, assemblée',
 'Président de la République']

In [101]:
target = []

In [102]:
from tqdm import tqdm 

In [103]:
for r, row in tqdm(df.iterrows()):
    cat_ = np.sum([ohe.transform(le.transform([elt]).reshape(-1,1)).todense()
                                      for elt in tokenize(row.categories)], axis=0)
    target.append(cat_[0])

9228it [00:07, 1235.87it/s]


In [104]:
y = np.asarray(target)

In [107]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier

In [108]:
classifier = DecisionTreeClassifier()

In [123]:
moc = MultiOutputClassifier(classifier, n_jobs=-1)

In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [117]:
tfidf = TfidfVectorizer(max_features=1000)

In [121]:
X = tfidf.fit_transform(df.corpus.tolist()).todense()

In [119]:
from sklearn.model_selection import train_test_split

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [124]:
moc.fit(X_train, y_train)

MultiOutputClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
           n_jobs=-1)

In [125]:
prediction = moc.predict(X_test)

In [129]:
prediction.sum(axis=1).shape

(2307,)

In [131]:
y_test.sum(axis=1).shape

(2307,)

In [132]:
moc.score(X_test, y_test)

0.1187689640225401

In [151]:
for i in range(y_test.shape[1]):
    y_dim = y_test[:,i]
    pred_dim = prediction[:,i]
    
    dim = np.zeros(y.shape[1])
    dim[i]=1
    print(le.inverse_transform(ohe.inverse_transform([dim])))
    
    print(confusion_matrix(y_dim, pred_dim))
    print(classification_report(y_dim, pred_dim))

['A Aucun élu']
[[1947  126]
 [ 134  100]]
              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94      2073
         1.0       0.44      0.43      0.43       234

   micro avg       0.89      0.89      0.89      2307
   macro avg       0.69      0.68      0.69      2307
weighted avg       0.89      0.89      0.89      2307

['A Moi-même ou personne']
[[1708  200]
 [ 176  223]]
              precision    recall  f1-score   support

         0.0       0.91      0.90      0.90      1908
         1.0       0.53      0.56      0.54       399

   micro avg       0.84      0.84      0.84      2307
   macro avg       0.72      0.73      0.72      2307
weighted avg       0.84      0.84      0.84      2307

["Annuler l'élection si fort"]
[[2132   84]
 [  87    4]]
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      2216
         1.0       0.05      0.04      0.04        91

   micro avg       0.93    

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [134]:
y_test.shape

(2307, 20)