In [28]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, metrics
from sklearn.metrics import accuracy_score

np.random.seed(500)
nlp = spacy.load("fr_core_news_md")

In [29]:
TRAIN_PATH =  "train_disjoint.csv" # "datasets/articles/train_text_dataset.csv" 
TEST_PATH = "test_disjoint.csv" # "datasets/articles/test_text_dataset.csv"  

fields = ["label", "article"]

train_df = pd.read_csv(TRAIN_PATH, usecols=fields)
test_df = pd.read_csv(TEST_PATH, usecols=fields)

In [30]:
# Basic cleansing
def cleansing(doc):
    # Remove stop words
    doc = [token for token in doc if not token.is_stop]
    return doc

def keep_specific_pos(doc, pos=["ADV", "ADJ", "VERB", "NOUN"]):
    doc = [token for token in doc if token.pos_ in pos]
    return doc

def preprocess(data):
    docs = list(nlp.pipe(data))
    preprocess_docs = [keep_specific_pos(cleansing(doc)) for doc in docs]
    # Doc -> Text (+ lemmatization)
    output_texts = [" ".join([token.lemma_ for token in doc]) for doc in preprocess_docs]
    return output_texts

In [57]:
y_train, y_test = train_df["label"].values - 1, test_df["label"].values - 1


x_train = preprocess([str(text) for text in train_df["article"].values])
# np.save("datasets/articles/x_train.npy", x_train)
# x_train = np.load("datasets/articles/x_train.npy")

train = np.asarray((x_train, y_train)).T
np.random.shuffle(train)
x_train, y_train = np.array(train[:,0], dtype=str), np.array(train[:,1], dtype=int)

STOP_LEMMA = ["pourcent", "greenpeace", "réaliste", "fig", "vidéo", "climato", "régression", "climat", "réchauffement", "température", "scientifique"]
for i in range(len(x_train)):
    for stop_lemma in STOP_LEMMA:
        x_train[i] = x_train[i].replace(stop_lemma, '')



x_test = preprocess([str(text) for text in test_df["article"].values])
# np.save("datasets/articles/x_test.npy", x_test)
# x_test = np.load("datasets/articles/x_test.npy")
for i in range(len(x_test)):
    for stop_lemma in STOP_LEMMA:
        x_test[i] = x_test[i].replace(stop_lemma, '')

x_test, y_test = np.array(x_test, dtype=str), np.array(y_test, dtype=int)

In [58]:
e_x_train = []
e_x_train += list(x_train[y_train == 0][:1000])
e_x_train += list(x_train[y_train == 1][:1000])
e_x_train += list(x_train[y_train == 2][:1000])

x_train = e_x_train

e_y_train = []
e_y_train += list(y_train[y_train == 0][:1000])
e_y_train += list(y_train[y_train == 1][:1000])
e_y_train += list(y_train[y_train == 2][:1000])

y_train = e_y_train

In [59]:
e_x_test = []
e_x_test += list(x_test[y_test == 0][:100])
e_x_test += list(x_test[y_test == 1][:100])
e_x_test += list(x_test[y_test == 2][:100])
x_test = e_x_test

e_y_test = []
e_y_test += list(y_test[y_test == 0][:100])
e_y_test += list(y_test[y_test == 1][:100])
e_y_test += list(y_test[y_test == 2][:100])

y_test = e_y_test

In [60]:
x_train, x_test = x_train_bis, x_test_bis
y_train, y_test = y_train_bis, y_test_bis

In [61]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(x_train + x_test)
Train_X_Tfidf = Tfidf_vect.transform(x_train)
Test_X_Tfidf = Tfidf_vect.transform(x_test)

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U50700'), dtype('<U50700')) -> dtype('<U50700')

In [None]:
print(Tfidf_vect.vocabulary_)

In [None]:
id_to_word = {v: k for k, v in Tfidf_vect.vocabulary_.items()}

In [None]:
print(Train_X_Tfidf)

In [None]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,y_train)# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)
confusion_matrix = metrics.classification_report(y_test, predictions_SVM, zero_division=0)
report = metrics.confusion_matrix(y_test, predictions_SVM)
print(confusion_matrix)
print(report)

In [None]:
for i in range(SVM.support_.shape[0]):
    label = SVM.predict(SVM.support_vectors_.getrow(i))
    word = id_to_word[SVM.support_vectors_.getrow(i).argmax()]
    print(f'{label}: {word}')

In [None]:
arg_max = SVM.support_vectors_.sum(0).argsort().transpose()

for i in range(1, len(arg_max) + 1):
    id = int(arg_max[-i])
    print(id_to_word[id])