# TP2 - Feature Engineering


In [15]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import time

import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier



## Étape 1 : On considère les fichiers du répertoire bbcsport
* Indiquer les points marquants l'exploration.
* Pour chaque observation, indiquer l’opération à effectuer qui serait la plus appropriée.

In [17]:

# Définir le chemin du répertoire principal
base_dir = "./bbcsport"

# Initialiser une liste pour stocker les données
data = []





# Parcourir chaque sous-répertoire
for category in os.listdir(base_dir):
    category_path = os.path.join(base_dir, category)
    
    # Vérifier si c'est bien un répertoire
    if os.path.isdir(category_path):
        # Lire tous les fichiers dans le sous-répertoire
        for file_name in os.listdir(category_path):
            file_path = os.path.join(category_path, file_name)
            
            # Lire le contenu du fichier texte
            with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                text = file.read()
                
            # Ajouter aux données
            data.append((text, category))
    


# Créer le DataFrame
df = pd.DataFrame(data, columns=["texte", "categorie"])

from sklearn.utils import resample

# Taille cible = taille de la catégorie majoritaire
target_size = df['categorie'].value_counts().max()

# Rééchantillonnage par catégorie
df = pd.concat([ resample(df[df['categorie'] == cat], replace=True, n_samples=target_size, random_state=42) for cat in df['categorie'].unique() ])


df = df.reset_index()
df.rename(columns={"index": "ID"}, inplace=True)

print('Dimensions:',df.shape, '\n')

# pd.set_option("display.max_colwidth", None)  # Afficher toute la largeur du texte
maxCharacters = 200
pd.set_option("display.max_colwidth", maxCharacters)  # Affiche jusqu'à maxCharacters caractères

# Remplace les '\n' littéraux dans les chaînes qui auraient été échappés ('\\n') par de vrais retours à la ligne.
#df['texte'] = df['texte'].apply(lambda x: x.replace('\\n', '\n'))

# Afficher les 20 premières lignes du DataFrame avec retours à la ligne
print('Dataframe original:')
print(df.head(20))

# Mélanger les lignes du DataFrame de façon aléatoire
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Remplace les '\n' littéraux dans les chaînes qui auraient été échappés ('\\n') par de vrais retours à la ligne.
#df['texte'] = df['texte'].apply(lambda x: x.replace('\\n', '\n'))

# Afficher les 20 premières lignes du DataFrame avec retours à la ligne
#df.index.name = "ID"


Dimensions: (1325, 3) 

Dataframe original:
     ID  \
0   102   
1   106   
2    71   
3   188   
4    20   
5   102   
6   121   
7   214   
8    87   
9    99   
10  151   
11  130   
12  149   
13  257   
14  191   
15  160   
16   21   
17  252   
18  235   
19   48   

                                                                                                                                                                                                      texte  \
0   Barwick calls for Highbury calm\n\nNew Football Association chief Brian Barwick pleaded with Arsenal and Manchester United to show calm ahead of their Highbury showdown.\n\n"When these two great t...   
1   Beckham defends form for England\n\nEngland captain David Beckham insists he is not concerned by criticism of his displays for club and country.\n\n"As long as I'm happy with my form for Real Madr...   
2   Gerrard future not decided by Cardiff loss\n\nSteven Gerrard's own goal in Liverpool's Carling Cup 

## Étape 2: Appliquer des préditeurs simples

In [19]:
# 2.	Calculer le nombre de mots sur chaque ligne de texte 
df["word_count"] = df["texte"].apply(lambda x: len(str(x).split()))

#Calculer le nombre de caractères avec espaces pour chaque ligne
df["char_count"] =df["texte"].map(lambda x : len(str(x)))

#Calculer la longueur moyenne de mots (non incluant les espaces) pour chaque ligne
def calcul_moyenne_mots(x):
    mots = x.split()
    return sum(len(mot)  for mot in mots ) / len(mots)
df["longueur_moyenne_mot"] = df["texte"].map(lambda x : calcul_moyenne_mots(x))

#Calculer le nombre de stopwords sur chaque ligne
nltk.download('stopwords')

# Charger la liste des stopwords en anglais
stop_words = set(stopwords.words("english"))
print('Stop word:', stop_words)

# Fonction pour compter les stopwords dans un texte
def count_stopwords(text):
    words = text.split()  # Découper en mots
    return sum(1 for word in words if word.lower() in stop_words)

# Ajouter une colonne 'stopword_count' avec le nombre de stopwords par ligne
df["stopword_count"] = df["texte"].apply(count_stopwords)
print('Dataframe avec lignes aléatoires:')
df.head(20)

Stop word: {"he'll", "won't", 'before', "haven't", 'or', 'my', 'who', 'same', 'd', 'isn', 'than', 'his', 'very', 's', 'now', 'weren', "you're", "that'll", 'having', 'to', 'which', "you've", 'itself', 'again', 'nor', 'needn', 'where', 'when', 'shouldn', 'will', 'are', 'am', 'ourselves', "i'm", 'on', 'yours', 've', 'their', 'being', 'if', "wouldn't", 'yourself', 'in', 'some', 'you', "he'd", 'won', 'through', 'but', 'were', 'out', 'while', 'a', 'have', 'doesn', "don't", 'did', 'couldn', "i'll", 'once', 'our', "they're", 'mightn', 'above', 'haven', "she's", 'the', "it'd", 'me', "mustn't", 'between', 'and', 'doing', 'yourselves', 'an', 'been', 'over', "i'd", 'shan', 'hasn', "she'll", 'that', 'y', 'they', 'up', 'o', "they'll", 'what', "they'd", 'those', 'myself', 'until', 'wasn', 'just', 'any', 'does', 'into', 'why', 'after', 'as', "he's", 'them', 'him', 'don', 'there', 'down', 'herself', 'theirs', 'from', 'further', "doesn't", "i've", 'ours', 're', 'should', "should've", 'under', 'all', 'of

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/follyayeboua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ID,texte,categorie,word_count,char_count,longueur_moyenne_mot,stopword_count
0,537,Vickery out of Six Nations\n\nEngland tight-head prop Phil Vickery has been ruled out of the rest of the 2005 RBS Six Nations after breaking a bone in his right forearm.\n\nVickery was injured as ...,rugby,387,2259,4.826873,155
1,365,Ganguly plays down fears\n\nIndia captain Sourav Ganguly has attempted to play down safety fears over their tour to Bangladesh.\n\nThe Indian squad arrived in Dhaka on Wednesday for a 19-day tour ...,cricket,264,1590,5.011364,100
2,501,Harinordoquy suffers France axe\n\nNumber eight Imanol Harinordoquy has been dropped from France's squad for the Six Nations match with Ireland in Dublin on 12 March.\n\nHarinordoquy was a second-...,rugby,341,2240,5.55132,110
3,186,Newcastle line up Babayaro\n\nNewcastle manager Graeme Souness is closing in on signing Chelsea defender Celestine Babayaro when the transfer window reopens.\n\nSouness is bidding to bolster his d...,football,130,758,4.807692,63
4,721,"Faultless Federer has no equal\n\nRoger Federer - nice bloke, fantastic tennis player - the ultimate sportsman.\n\nWhen Lleyton Hewitt shook his hand after getting another thrashing, a third in as...",tennis,420,2357,4.604762,176
5,379,"Australia build imposing lead\n\nThird Test, Nagpur, day three (stumps): \nAustralia 202-3 & 398 v India 185\n\nIndia were bowled out for just 185 in the morning session, with paceman Gillespie re...",cricket,506,2905,4.731225,188
6,347,Murali to miss one-dayers in NZ\n\nSri Lanka have put back plans to take spinner Muttiah Muralitharan on the one-day leg of their New Zealand tour.\n\nMuralitharan was to have travelled with the t...,cricket,162,925,4.697531,64
7,680,Melzer shocks Agassi\n\nSecond seed Andre Agassi suffered a comprehensive defeat by Jurgen Melzer in the quarter-finals of the SAP Open.\n\nAgassi was often bamboozled by the Austrian's drop shots...,tennis,203,1137,4.586207,72
8,327,Pakistan question warm-up venue\n\nPakistan have voiced concern about the choice of Dharamshala to stage their only warm-up game in India.\n\nTuesday's practice session had to be cancelled because...,cricket,447,2605,4.818792,191
9,372,Aussies unhappy with pitch\n\nAustralian skipper Ricky Ponting was still able to raise a smile despite his side's 13-run defeat by India in the fourth Test at Mumbai.\n\nThey had already done enou...,cricket,458,2473,4.39083,208


## Etape 3: Nettyage des données

In [21]:
# Prétraitement du texte
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)  # Suppression des ponctuations
    text = re.sub("\d+", "", text)  # Suppression des chiffres
    return text

df["cleaned_text"] = df["texte"].apply(clean_text)

# Séparation des données X et y
X = df["cleaned_text"]
y = df["categorie"]


# Séparation train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 


df_metrics = pd.DataFrame(columns=["Technique", "Modele", "Score F1", "Accuracy"])

  text = re.sub("\d+", "", text)  # Suppression des chiffres


## Étape 4: Utilisation d'une technique d’extraction (par ex: TF–IDF Vectors) et choisir au moins 3 algorithmes de classification. De ce fait, on obtient au moins 3 modèles. On fera la comparaison en se basant la technique d’extraction.

In [23]:
# Modèles de classification
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, solver="lbfgs"),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Entraînement et évaluation
def appliquer_model_classification(technique, train_x, test_x):
    results = {}
    for name, model in models.items():
        model.fit(train_x, y_train)
        y_pred = model.predict(test_x)
        f1 = f1_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)
        results[name] = {"F1-score": f1, "Accuracy": accuracy}
        df_metrics.loc[len(df_metrics)] = [technique, name, f1, accuracy]

    # Affichage des résultats
    print("\nComparaison des scores F1 et Accuracy:")
    for model, scores in results.items():
        print(f"{model}: F1-score = {scores['F1-score']:.4f}, Accuracy = {scores['Accuracy']:.4f}")

In [24]:
from collections import Counter
from scipy.sparse import hstack


all_words = " ".join(df["cleaned_text"]).split()
word_freq = Counter(all_words)
most_common_words = [word for word, _ in word_freq.most_common(2000)]  # Top 2000 mots

techniques = {
    "TfidfVectorizer": TfidfVectorizer(),
    "Bag of Words": CountVectorizer(),
    "Word Embeddings": None,
    "NLP based features": None,
    "Prédicteurs de base": CountVectorizer(vocabulary=most_common_words)
}

In [25]:
!pip install gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.preprocessing import StandardScaler

# Fonction pour convertir les textes en vecteurs de mots
def document_vector(model, doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv] or [np.zeros(embedding_dim)], axis=0)


for technique, model in techniques.items():
    if technique == "Word Embeddings":
        X_train_tokens = [simple_preprocess(text) for text in X_train]
        X_test_tokens = [simple_preprocess(text) for text in X_test]

        # Entraînement du modèle Word2Vec
        embedding_dim = 100
        word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=embedding_dim, window=5, min_count=2, workers=4)

        train_x = np.array([document_vector(word2vec_model, doc) for doc in X_train_tokens])
        test_x = np.array([document_vector(word2vec_model, doc) for doc in X_test_tokens])


    elif technique == "NLP based features":
        continue

    elif technique == "Prédicteurs de base":
        technique == "Prédicteurs de base"
        print(50*"=")
        print(f"Technique: {technique}")

        X_bow = model.fit_transform(df["cleaned_text"])

        # Création de la matrice de nouvelles features
        X_base = df[["char_count", "longueur_moyenne_mot", "stopword_count", "word_count"]].values  

        print(f"Shape:",X_base.shape)

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_base)
        #X_base = df[["stopword_count"]].values

        # Combinaison avec le Bag-of-Words
        X = hstack([X_bow, X_scaled])


        # Séparation des données X et y
        #X = X_bow
        y = df["categorie"]
        train_x, test_x, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
    elif model is None:
        continue

    else:
        print(50*"=")
        print(f"Technique: {technique}")
        train_x = model.fit_transform(X_train)
        test_x = model.transform(X_test)

    appliquer_model_classification(technique, train_x, test_x)
    print("\n")

    

Technique: TfidfVectorizer

Comparaison des scores F1 et Accuracy:
Logistic Regression: F1-score = 0.9962, Accuracy = 0.9962
Random Forest: F1-score = 0.9887, Accuracy = 0.9887
SVM: F1-score = 0.9962, Accuracy = 0.9962
KNN: F1-score = 0.9735, Accuracy = 0.9736


Technique: Bag of Words

Comparaison des scores F1 et Accuracy:
Logistic Regression: F1-score = 0.9962, Accuracy = 0.9962
Random Forest: F1-score = 0.9925, Accuracy = 0.9925
SVM: F1-score = 0.9666, Accuracy = 0.9660
KNN: F1-score = 0.8255, Accuracy = 0.8264



Comparaison des scores F1 et Accuracy:
Logistic Regression: F1-score = 0.6859, Accuracy = 0.6868
Random Forest: F1-score = 0.9425, Accuracy = 0.9434
SVM: F1-score = 0.5690, Accuracy = 0.5736
KNN: F1-score = 0.7395, Accuracy = 0.7396


Technique: Prédicteurs de base
Shape: (1325, 4)

Comparaison des scores F1 et Accuracy:
Logistic Regression: F1-score = 0.9962, Accuracy = 0.9962
Random Forest: F1-score = 0.9887, Accuracy = 0.9887
SVM: F1-score = 0.9666, Accuracy = 0.9660
K

In [26]:
df_pivot = df_metrics.pivot_table(index='Technique', columns='Modele', values=['Score F1', 'Accuracy'])
df_pivot

Unnamed: 0_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Score F1,Score F1,Score F1,Score F1
Modele,KNN,Logistic Regression,Random Forest,SVM,KNN,Logistic Regression,Random Forest,SVM
Technique,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bag of Words,0.826415,0.996226,0.992453,0.966038,0.825523,0.996227,0.992451,0.966604
Prédicteurs de base,0.864151,0.996226,0.988679,0.966038,0.864013,0.996227,0.988678,0.966604
TfidfVectorizer,0.973585,0.996226,0.988679,0.996226,0.973549,0.996224,0.988678,0.996224
Word Embeddings,0.739623,0.686792,0.943396,0.573585,0.739526,0.68586,0.942546,0.568974
