In [81]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, metrics
from sklearn.metrics import accuracy_score

np.random.seed(500)
nlp = spacy.load("fr_core_news_md")

In [82]:
TRAIN_PATH = "datasets/articles/train_text_dataset.csv"
TEST_PATH = "datasets/articles/test_text_dataset.csv"

fields = ["label", "article"]

train_df = pd.read_csv(TRAIN_PATH, usecols=fields)
test_df = pd.read_csv(TEST_PATH, usecols=fields)

In [96]:
# Basic cleansing
def cleansing(doc):
    # Remove stop words
    doc = [token for token in doc if not token.is_stop]
    return doc

def keep_specific_pos(doc, pos=["ADV", "ADJ", "VERB", "NOUN"]):
    doc = [token for token in doc if token.pos_ in pos]
    return doc

def preprocess(data):
    docs = list(nlp.pipe(data))
    preprocess_docs = [keep_specific_pos(cleansing(doc)) for doc in docs]
    # Doc -> Text (+ lemmatization)
    output_texts = [" ".join([token.lemma_ for token in doc]) for doc in preprocess_docs]
    return output_texts

In [219]:
y_train, y_test = train_df["label"].values - 1, test_df["label"].values - 1


# x_train = preprocess([str(text) for text in train_df["article"].values])
# np.save("datasets/articles/x_train.npy", x_train)
x_train = np.load("datasets/articles/x_train.npy")

train = np.asarray((x_train, y_train)).T
np.random.shuffle(train)
x_train, y_train = np.array(train[:,0], dtype=str), np.array(train[:,1], dtype=int)

STOP_LEMMA = ["pourcent", "greenpeace", "réaliste", "fig", "vidéo", "climato", "régression", "climat", "réchauffement", "température", "scientifique"]
for i in range(len(x_train)):
    for stop_lemma in STOP_LEMMA:
        x_train[i] = x_train[i].replace(stop_lemma, '')



# x_test = preprocess([str(text) for text in test_df["article"].values])
# np.save("datasets/articles/x_test.npy", x_test)
x_test = np.load("datasets/articles/x_test.npy")
for i in range(len(x_test)):
    for stop_lemma in STOP_LEMMA:
        x_test[i] = x_test[i].replace(stop_lemma, '')

x_test, y_test = np.array(x_test, dtype=str), np.array(y_test, dtype=int)

In [220]:
e_x_train = []
e_x_train += list(x_train[y_train == 0][:300])
e_x_train += list(x_train[y_train == 1][:300])
e_x_train += list(x_train[y_train == 2][:300])

x_train = e_x_train

e_y_train = []
e_y_train += list(y_train[y_train == 0][:300])
e_y_train += list(y_train[y_train == 1][:300])
e_y_train += list(y_train[y_train == 2][:300])

y_train = e_y_train

In [221]:
e_x_test = []
e_x_test += list(x_test[y_test == 0][:37])
e_x_test += list(x_test[y_test == 1][:37])
e_x_test += list(x_test[y_test == 2][:37])
x_test = e_x_test

e_y_test = []
e_y_test += list(y_test[y_test == 0][:37])
e_y_test += list(y_test[y_test == 1][:37])
e_y_test += list(y_test[y_test == 2][:37])

y_test = e_y_test

In [222]:
print(len(x_train), len(y_train))

824 824


In [223]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(x_train + x_test)
Train_X_Tfidf = Tfidf_vect.transform(x_train)
Test_X_Tfidf = Tfidf_vect.transform(x_test)

In [224]:
print(Tfidf_vect.vocabulary_)

{'français': 1966, 'effondrement': 1565, 'civilisation': 802, 'principalement': 3453, 'cause': 700, 'ique': 2478, 'montée': 2867, 'inégalité': 2472, 'révéler': 4079, 'sondage': 4234, 'âge': 4813, 'compte': 915, 'réponse': 4033, 'an': 256, 'peur': 3289, 'lier': 2590, 'exemple': 1738, 'dernier': 1233, 'ailleurs': 176, 'guerre': 2085, 'nucléaire': 3011, 'moyenne': 2889, 'ensemble': 1641, 'fermer': 1848, 'porte': 3401, 'conférence': 972, 'connaître': 980, 'record': 3760, 'succès': 4338, 'cop': 1084, 'mesure': 2778, 'nombre': 2982, 'participant': 3213, 'déclaration': 1413, 'produire': 3486, 'juger': 2529, 'capacité': 658, 'accélérer': 96, 'action': 110, 'terrain': 4473, 'matière': 2744, 'retenir': 3896, '26': 31, 'édition': 4861, 'accroissement': 81, 'objectif': 3043, 'réduction': 3988, 'émission': 4907, 'rendez': 3826, 'vous': 4777, 'constituer': 1015, 'point': 3371, 'étape': 4950, 'important': 2244, 'calendrier': 633, 'accord': 77, 'signer': 4177, 'cop21': 1085, 'pays': 3244, 'participer'

In [225]:
id_to_word = {v: k for k, v in Tfidf_vect.vocabulary_.items()}

In [226]:
print(Train_X_Tfidf)

  (0, 4813)	0.18613894131713277
  (0, 4234)	0.255699354254425
  (0, 4079)	0.1635375369397605
  (0, 4033)	0.16214509099427538
  (0, 3453)	0.18400747686124924
  (0, 3289)	0.3574309907598974
  (0, 3011)	0.14444595712467434
  (0, 2889)	0.14851661208706549
  (0, 2867)	0.187609845021159
  (0, 2590)	0.12527082081284818
  (0, 2478)	0.13313880515631643
  (0, 2472)	0.22531511887309266
  (0, 2085)	0.16643904050290578
  (0, 1966)	0.31665464799910953
  (0, 1738)	0.11375548257523634
  (0, 1641)	0.12901869848212238
  (0, 1565)	0.436302700973417
  (0, 1233)	0.08874113797414206
  (0, 915)	0.12010972112587466
  (0, 802)	0.2393221550049617
  (0, 700)	0.22714899855940301
  (0, 256)	0.1694870417282378
  (0, 176)	0.12327515260642113
  (1, 4990)	0.027977808164152816
  (1, 4989)	0.0360692641905338
  :	:
  (823, 525)	0.012947301691725612
  (823, 479)	0.04582541926890493
  (823, 478)	0.038582505045923136
  (823, 442)	0.045036900584632784
  (823, 425)	0.01960202625083032
  (823, 402)	0.01939604421736162
  (823, 

In [227]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,y_train)# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)
confusion_matrix = metrics.classification_report(y_test, predictions_SVM, zero_division=0)
report = metrics.confusion_matrix(y_test, predictions_SVM)
print(confusion_matrix)
print(report)

SVM Accuracy Score ->  81.08108108108108
              precision    recall  f1-score   support

           0       0.83      0.92      0.87        37
           1       0.80      0.65      0.72        37
           2       0.80      0.86      0.83        37

    accuracy                           0.81       111
   macro avg       0.81      0.81      0.81       111
weighted avg       0.81      0.81      0.81       111

[[34  2  1]
 [ 6 24  7]
 [ 1  4 32]]


In [228]:
for i in range(SVM.support_.shape[0]):
    label = SVM.predict(SVM.support_vectors_.getrow(i))
    word = id_to_word[SVM.support_vectors_.getrow(i).argmax()]
    print(f'{label}: {word}')

[0]: effondrement
[0]: méthane
[0]: satellite
[0]: installation
[0]: algue
[0]: constructif
[0]: météo
[0]: eau
[1]: malus
[0]: diesel
[0]: financement
[0]: extinction
[0]: récif
[0]: efficacité
[0]: vote
[0]: cop
[0]: progrès
[0]: abeille
[0]: hfc
[0]: marin
[0]: poumon
[0]: feu
[0]: élu
[0]: produit
[0]: fioul
[0]: plainte
[0]: vis
[2]: conflit
[0]: signature
[0]: chaleur
[0]: médical
[0]: requin
[1]: objectif
[0]: co2
[0]: météorologique
[0]: cirque
[0]: glace
[0]: bouteille
[0]: état
[0]: état
[0]: nappe
[0]: sexe
[0]: météo
[0]: invendu
[0]: bulle
[0]: indonésien
[0]: insecte
[0]: spécifiquement
[0]: chaud
[0]: campagne
[0]: chancelière
[0]: orage
[1]: ratification
[0]: serre
[0]: vérifier
[0]: basculement
[0]: animal
[1]: bois
[2]: variation
[0]: manifestant
[0]: 19
[0]: bâtiment
[0]: pêche
[0]: prince
[0]: roche
[0]: côtier
[0]: charbon
[0]: rhinocéros
[0]: barrage
[0]: gel
[0]: betterave
[0]: tourbière
[0]: forêt
[0]: action
[0]: plaine
[0]: émission
[0]: plaider
[0]: déchet
[0

In [229]:
arg_max = SVM.support_vectors_.sum(0).argsort().transpose()

for i in range(1, len(arg_max) + 1):
    id = int(arg_max[-i])
    print(id_to_word[id])

ique
émission
pays
énergie
co2
degré
changement
année
bien
an
nucléaire
gaz
grand
carbone
politique
mondial
européen
rapport
faire
monde
pouvoir
français
objectif
forêt
dernier
état
énergétique
niveau
devoir
accord
terre
serre
eau
gouvernement
falloir
mettre
ici
prendre
planète
non
milliard
entreprise
électricité
action
nouveau
jour
humain
fois
étude
projet
million
réduire
environnement
président
développement
voir
international
hui
aujourd
fin
économique
écologique
vouloir
transition
public
national
mesure
question
bon
permettre
passer
renouvelable
charbon
agir
euro
venir
augmentation
loi
crise
système
fossile
production
mois
vie
total
modèle
temps
moyen
atteindre
point
article
naturel
problème
cause
période
hausse
risque
environnemental
petit
centrale
américain
réduction
activité
citoyen
produire
publier
science
lutte
déchet
pétrole
prix
glace
ministre
population
important
face
homme
exemple
engagement
donnée
déforestation
urgence
fort
personne
raison
part
produit
siècle
espèce
océan