# Bag of Words - Count occurences
- Preprocessing : Top-k, SpaCy 
- Classifier : SVM

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import spacy
import time
import os
import json

import spacy_transformers
import torch
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, KFold

In [3]:
nlp = spacy.load("fr_dep_news_trf")

## Dataset

### Initial preprocessing functions

In [4]:
def remove_useless(paragraph_ann):
    content = paragraph_ann["content"]
    paragraphs_to_delete = []
    entities_to_delete = []

    for index, paragraph in enumerate(content) :
        if 'label' in paragraph.keys() and paragraph["label"]==0:
            paragraphs_to_delete.append(index)

        else :
            if type(paragraph["content"])==list:
              for index_entity, entity_content in enumerate(paragraph["content"]):
                if type(entity_content) == dict and 'label' in entity_content.keys() and entity_content["label"]==0:
                      entities_to_delete.append((index, index_entity))

    for i in range(-1, -len(entities_to_delete)-1, -1) :
      index, index_entity = entities_to_delete[i]
      del paragraph_ann['content'][index]['content'][index_entity]

    for index in reversed(paragraphs_to_delete):
      del paragraph_ann['content'][index]

    return paragraph_ann


def fusion(paragraph_ann):
  label = paragraph_ann['label']
  author = paragraph_ann['author']
  title = paragraph_ann['title']
  date = paragraph_ann['date']
  content = ""
  for paragraph in paragraph_ann['content']:
    if type(paragraph['content']) == str :
      content += paragraph['content']
    elif type(paragraph['content']) == list :
      for entity in paragraph['content']:
        content += entity['content']
  json = {
    'label' : label, 
    'date': date,
    'title': title,
    'author': author,
    'content' : content,
  }
  return json

### Creation of the dataset

In [5]:
# assign directory
ANNOTATIONS_FOLDER = 'C:\\Users\\louis\\Desktop\\NLP\\fake_news\\annotations'
dataset = []
 
# iterate over files in
# that directory
for filename in os.listdir(ANNOTATIONS_FOLDER):
    f = os.path.join(ANNOTATIONS_FOLDER, filename)
    # checking if it is a file
    if os.path.isfile(f):
        with open(f, 'r') as file:
            data = json.load(file)
            if data["label"] > 0:
                data = fusion(remove_useless(data))
                dataset += [(data["label"], data["content"])]

dataset = np.array(dataset, dtype=str)
print(dataset.shape)

(411, 2)


In [40]:
train, test = train_test_split(dataset)
y_train, x_train = np.array(train[:,0], dtype=int), np.array(train[:,1], dtype=str)
y_test, x_test = np.array(test[:, 0], dtype=int), np.array(test[:,1], dtype=str)

## Count features

## Text preprocessing

In [37]:
# Basic cleansing
def cleansing(doc):
    # Remove stop words
    doc = [token for token in doc if not token.is_stop]
    return doc

def keep_specific_pos(doc, pos=["ADV", "ADJ", "VERB", "NOUN"]):
    doc = [token for token in doc if token.pos_ in pos]
    return doc

def preprocess(data):
    docs = list(nlp.pipe(data))
    preprocess_docs = [keep_specific_pos(cleansing(doc)) for doc in docs]
    # Doc -> Text (+ lemmatization)
    output_texts = [" ".join([token.lemma_ for token in doc]) for doc in preprocess_docs]
    return output_texts
        

In [38]:
x = preprocess(x_train)

TypeError: Argument 'string' has incorrect type (expected str, got numpy.str_)

In [9]:
print(f'Nb doc: {len(x)}')

count_vec = CountVectorizer()
count = count_vec.fit_transform(x)
count = count.toarray()

Nb doc: 308


In [13]:
print('Number of Vocabulary: %d'% (len(count_vec.get_feature_names_out())))

Number of Vocabulary: 13912


Too much vocabulary : we need to remove the non-relevant ones

In the end, we have to remove the less important features (words) in order to restrain the number of vocabulary

In [19]:
# Select K best

LIMIT_VOCABULARY = 10000 # K
mask = np.argsort(np.sum(count, axis=0))[-LIMIT_VOCABULARY:]
words_as_features = count_vec.get_feature_names_out()[mask]
count_features = count[:, mask]

print(f'Features: {len(words_as_features)}')

Features: 10000


In [31]:
cl = SVC(verbose=True)
cl.fit(count, y_train)
print(y_train)

count_test = count_vec.transform(x_test).toarray()
print(count_test)
y_predicted = cl.predict(count_test)

[LibSVM][1 1 1 1 3 3 3 1 2 2 1 2 1 1 1 2 1 1 1 3 1 1 1 3 1 1 3 2 1 3 3 3 3 2 1 3 1
 2 1 1 3 3 1 1 3 1 3 1 3 1 1 3 1 1 1 1 3 1 3 1 2 3 3 1 2 1 1 3 1 1 1 3 1 1
 2 1 3 1 1 1 1 1 1 1 3 1 1 2 3 1 1 1 1 1 1 1 3 3 1 2 1 3 3 3 3 1 1 1 1 1 1
 1 2 1 3 3 1 1 2 3 2 1 1 1 2 1 1 1 1 3 1 1 1 3 2 1 2 3 2 1 1 1 3 1 1 1 1 1
 1 2 3 1 1 1 1 3 1 2 3 1 2 2 3 3 3 1 3 1 1 3 3 2 3 2 1 1 1 3 1 1 3 1 1 1 1
 1 1 1 3 1 1 1 2 1 1 1 1 1 3 1 2 1 3 1 2 1 1 2 3 3 3 1 1 1 1 1 1 1 1 1 1 1
 3 1 1 1 1 1 3 1 1 1 2 1 1 1 2 1 1 1 1 3 1 3 1 1 1 3 1 1 1 1 3 1 1 1 3 1 1
 1 1 1 1 1 1 1 2 1 1 1 1 3 1 1 1 1 3 1 1 1 2 1 3 1 2 1 3 1 1 3 1 1 3 1 3 1
 3 1 1 3 1 3 3 3 2 3 1 1]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Scores

In [32]:
def true_positives(predicted_labels, labels):
    return np.count_nonzero(labels[predicted_labels == 1])

def false_positives(predicted_labels, labels):
    return np.count_nonzero(1 - labels[predicted_labels == 1])

def true_negatives(predicted_labels, labels):
    return np.count_nonzero(1 - labels[predicted_labels == 0])

def false_negatives(predicted_labels, labels):
    return np.count_nonzero(labels[predicted_labels == 0])

class Scores:
    def __init__(self, predicted_labels, labels):
        self.predicted_labels = predicted_labels
        self.labels = labels

        self.tp = true_positives(self.predicted_labels, self.labels)
        self.fp = false_positives(self.predicted_labels, self.labels)
        self.tn = true_negatives(self.predicted_labels, self.labels)
        self.fn = false_negatives(self.predicted_labels, self.labels)

    def __repr__(self):
        print("===================\n")

        print(f"TP : { self.tp }")
        print(f"FP : { self.fp }")
        print(f"TN : { self.tn }")
        print(f"FN : { self.fn }\n")

        print("===================\n")

        print(f"P : { self.positives() }")
        print(f"N : { self.negatives() }")
        print(f"TPR : { self.tp_rate() }")
        print(f"TNR : { self.tn_rate() }")
        print(f"FPR : { self.fp_rate() }")
        print(f"ACC : { self.acc() }")
        print(f"Precision : { self.precision() }")
        print(f"NPV : { self.npv() }")
        print(f"MCC : { self.mcc() }")
        print(f"F1-score : { self.f_score(1) }")
        print(f"Kappa : { self.kappa() } \n")

        print("===================")

        return ""

    def positives(self):
        return self.tp + self.fn

    def negatives(self):
        return self.fp + self.tn

    def tp_rate(self):
        positives = self.positives()
        if positives > 0:
            return self.tp / positives
        else:
            return 0
    
    def tn_rate(self):
        negatives = self.negatives()
        if negatives > 0:
            return self.tn / negatives
        else:
            return 0

    def fp_rate(self):
        negatives = self.negatives()
        if negatives > 0:
            return self.fp / negatives
        else:
            return None

    def acc(self):
        return (self.tp + self.tn) / (self.negatives() + self.positives())

    def precision(self):
        den = self.tp + self.fp
        if den > 0:
            return self.tp / den
        else:
            return None

    def npv(self):
        den = self.tn + self.fn
        if den > 0:
            return self.tn / den
        else:
            return None

    def mcc(self):
        den = ((self.tp + self.fp)*(self.tp + self.fn)*(self.tn + self.fp)*(self.tn + self.fn)) ** 0.5
        if den != 0:
            return (self.tp * self.tn - self.fp * self.fn) / den
        else:
            return None

    def f_score(self, beta):
        precision = self.precision()
        tpr = self.tp_rate()

        if precision is None or tpr is None:
            return None
        return (1 + beta ** 2) * (self.precision() * self.tp_rate()) / ((beta ** 2) * self.precision() + self.tp_rate())

    def kappa(self):
        den = (self.tp + self.fp) * (self.fp + self.tn) + (self.tp + self.fn)*(self.fn + self.tn)
        if den != 0:
            return 2 * (self.tp * self.tn - self.fn * self.fp) / den
        else:
            return None

In [33]:
y_predicted = np.array(y_predicted, dtype=int)

In [35]:
print(y_predicted)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [34]:
scores = Scores(y_predicted, y_test)
print(scores)


TP : 103
FP : 49
TN : 0
FN : 0


P : 103
N : 49
TPR : 1.0
TNR : 0.0
FPR : 1.0
ACC : 0.6776315789473685
Precision : 0.6776315789473685
NPV : None
MCC : None
F1-score : 0.807843137254902
Kappa : 0.0 


