# TP2 - Feature Engineering


In [25]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import time

import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score

## Étape 1 : On considère les fichiers du répertoire bbcsport
* Indiquer les points marquants l'exploration.
* Pour chaque observation, indiquer l’opération à effectuer qui serait la plus appropriée.

In [26]:

# Définir le chemin du répertoire principal
base_dir = "./bbcsport"

# Initialiser une liste pour stocker les données
data = []

# Parcourir chaque sous-répertoire
for category in os.listdir(base_dir):
    category_path = os.path.join(base_dir, category)
    
    # Vérifier si c'est bien un répertoire
    if os.path.isdir(category_path):
        # Lire tous les fichiers dans le sous-répertoire
        for file_name in os.listdir(category_path):
            file_path = os.path.join(category_path, file_name)
            
            # Lire le contenu du fichier texte
            with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                text = file.read()
                
            # Ajouter aux données
            data.append((text, category))


# Créer le DataFrame
df = pd.DataFrame(data, columns=["texte", "categorie"])

df = df.reset_index()
df.rename(columns={"index": "ID"}, inplace=True)

print('Dimensions:',df.shape, '\n')

# pd.set_option("display.max_colwidth", None)  # Afficher toute la largeur du texte
maxCharacters = 200
pd.set_option("display.max_colwidth", maxCharacters)  # Affiche jusqu'à maxCharacters caractères

# Remplace les '\n' littéraux dans les chaînes qui auraient été échappés ('\\n') par de vrais retours à la ligne.
#df['texte'] = df['texte'].apply(lambda x: x.replace('\\n', '\n'))

# Afficher les 20 premières lignes du DataFrame avec retours à la ligne
print('Dataframe original:')
print(df.head(20))

# Mélanger les lignes du DataFrame de façon aléatoire
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Remplace les '\n' littéraux dans les chaînes qui auraient été échappés ('\\n') par de vrais retours à la ligne.
#df['texte'] = df['texte'].apply(lambda x: x.replace('\\n', '\n'))

# Afficher les 20 premières lignes du DataFrame avec retours à la ligne
#df.index.name = "ID"


Dimensions: (737, 3) 

Dataframe original:
    ID  \
0    0   
1    1   
2    2   
3    3   
4    4   
5    5   
6    6   
7    7   
8    8   
9    9   
10  10   
11  11   
12  12   
13  13   
14  14   
15  15   
16  16   
17  17   
18  18   
19  19   

                                                                                                                                                                                                      texte  \
0   Van Nistelrooy hungry for return\n\nManchester United striker Ruud van Nistelrooy said he was "hungry to play" as he returned to training on Tuesday.\n\nThe Dutch striker, 28, is closing in on a c...   
1   Reyes tricked into Real admission\n\nJose Antonio Reyes has added to speculation linking him with a move from Arsenal to Real Madrid after falling victim to a radio prank.\n\nThe Spaniard believed...   
2   Adriano's Chelsea link rejected\n\nAdriano's agent Gilmar Rinaldi has insisted that he has had no contact with Chelsea ov

## Étape 2: Appliquer des préditeurs simples

In [27]:
# 2.	Calculer le nombre de mots sur chaque ligne de texte 
df["word_count"] = df["texte"].apply(lambda x: len(str(x).split()))

#Calculer le nombre de caractères avec espaces pour chaque ligne
df["char_count"] =df["texte"].map(lambda x : len(str(x)))

#Calculer la longueur moyenne de mots (non incluant les espaces) pour chaque ligne
def calcul_moyenne_mots(x):
    mots = x.split()
    return sum(len(mot)  for mot in mots ) / len(mots)
df["longueur_moyenne_mot"] = df["texte"].map(lambda x : calcul_moyenne_mots(x))

#Calculer le nombre de stopwords sur chaque ligne
nltk.download('stopwords')

# Charger la liste des stopwords en anglais
stop_words = set(stopwords.words("english"))
print('Stop word:', stop_words)

# Fonction pour compter les stopwords dans un texte
def count_stopwords(text):
    words = text.split()  # Découper en mots
    return sum(1 for word in words if word.lower() in stop_words)

# Ajouter une colonne 'stopword_count' avec le nombre de stopwords par ligne
df["stopword_count"] = df["texte"].apply(count_stopwords)
print('Dataframe avec lignes aléatoires:')
df.head(20)

Stop word: {"won't", "he'd", 'themselves', 'further', 'between', "isn't", 'once', 'those', 'wouldn', 'am', 'other', "hadn't", "shouldn't", 'ours', 'itself', 'after', 'for', 'on', "they'd", "we'll", "you've", 'down', 'of', 'is', 'hasn', 'shan', 'i', "she'd", 'whom', 'isn', "it'll", 'shouldn', 'will', "he's", 'or', "i'd", 'and', 'be', 'd', 'ma', "doesn't", 'herself', 'off', 'was', "wouldn't", "that'll", 'while', 'to', 'a', 'below', 'been', 'won', 'y', 'when', 'through', 'mightn', "you're", 'they', 'ourselves', "they've", "she's", 'under', 'does', 'in', 'with', 'own', 'because', 'not', 'yourself', 'about', 'did', 'then', 'you', 'm', "it's", 'me', 'against', 'doesn', 's', "he'll", 'any', 'aren', "i've", 'll', 'are', 'has', 're', 'than', "you'll", 'all', 'hadn', 'ain', 'more', 'over', 'haven', 'hers', 'she', 'him', 'only', 'which', "didn't", 'some', 'here', 'were', 't', 'such', 've', 'how', 'don', 'few', 'their', "we're", 'there', "weren't", "you'd", "i'm", 'now', 'above', 'myself', 'at', "

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


Unnamed: 0,ID,texte,categorie,word_count,char_count,longueur_moyenne_mot,stopword_count
0,669,Henman overcomes rival Rusedski\n\nTim Henman saved a match point before fighting back to defeat British rival Greg Rusedski 4-6 7-6 (8-6) 6-4 at the Dubai Tennis Championships on Tuesday.\n\nWorl...,tennis,340,1886,4.538235,135
1,33,Ferguson urges Henry punishment\n\nSir Alex Ferguson has called on the Football Association to punish Arsenal's Thierry Henry for an incident involving Gabriel Heinze.\n\nFerguson believes Henry d...,football,273,1622,4.930403,114
2,549,"Fuming Robinson blasts officials\n\nEngland coach Andy Robinson insisted he was ""livid"" after his side were denied two tries in Sunday's 19-13 Six Nations loss to Ireland in Dublin.\n\nMark Cueto'...",rugby,386,2141,4.536269,169
3,199,Arsenal through on penalties\n\nArsenal win 4-2 on penalties\n\nThe Spanish goalkeeper saved from Alan Quinn and Jon Harley as Arsenal sealed a quarter-final trip to Bolton with a 4-2 victory on p...,football,508,3138,5.155512,178
4,264,Man City 1-1 Newcastle\n\nAlan Shearer hit his 250th Premiership goal to help Newcastle earn a battling draw against Manchester City.\n\nShearer put Newcastle ahead when he raced on to a raking cr...,football,666,3850,4.771772,258
5,583,"Leeds v Saracens (Fri)\n\nHeadingley\n\nFriday, 25 February\n\n2000 GMT\n\nThe Tykes have brought in Newcastle prop Ed Kalman and Tom McGee from the Borders on loan while fly-half Craig McMullen h...",rugby,133,938,6.015038,27
6,39,Redknapp's Saints face Pompey tie\n\nNew Southampton manager Harry Redknapp faces an immediate reunion with his old club Portsmouth after they were drawn together in the FA Cup fourth round.\n\nEx...,football,407,2325,4.665848,152
7,554,"Pountney handed ban and fine\n\nNorthampton coach Budge Pountney has been fined £2,000 and banned from match-day coaching for six weeks for calling a referee ""a disgrace"".\n\nPountney was found gu...",rugby,123,787,5.382114,39
8,585,Wood - Ireland can win Grand Slam\n\nFormer captain Keith Wood believes Ireland can win only their second Grand Slam - and first since 1948 - in this year's RBS Six Nations Championship.\n\nAfter ...,rugby,317,1727,4.435331,135
9,609,Wilkinson to lead England\n\nFly-half Jonny Wilkinson has been named as England's new rugby union captain for the three November Tests.\n\nThe 25-year-old Newcastle star takes over from Lawrence D...,rugby,430,2514,4.834884,182


## Utilisation d'une technique d’extraction (par ex: TF–IDF Vectors) et choisir au moins 3 algorithmes de classification. De ce fait, on obtient au moins 3 modèles. On fera la comparaison en se basant la technique d’extraction.

In [28]:
# Prétraitement du texte
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)  # Suppression des ponctuations
    text = re.sub("\d+", "", text)  # Suppression des chiffres
    return text

df["cleaned_text"] = df["texte"].apply(clean_text)

# Séparation des données X et y
X = df["cleaned_text"]
y = df["categorie"]

# Vectorisation TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Séparation train/test
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Modèles de classification
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB()
}

# Entraînement et évaluation
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = f1
    print(f"{name}: F1-score = {f1:.4f}")
    print(f"{name}: Accuracy = {accuracy:.4f}")

# Affichage des résultats
print("\nComparaison des scores F1:")
for model, score in results.items():
    print(f"{model}: {score:.4f}")


  text = re.sub("\d+", "", text)  # Suppression des chiffres


Logistic Regression: F1-score = 0.9599
Logistic Regression: Accuracy = 0.9595
Random Forest: F1-score = 0.9597
Random Forest: Accuracy = 0.9595
SVM: F1-score = 0.9532
SVM: Accuracy = 0.9527
Naive Bayes: F1-score = 0.6032
Naive Bayes: Accuracy = 0.6216

Comparaison des scores F1:
Logistic Regression: 0.9599
Random Forest: 0.9597
SVM: 0.9532
Naive Bayes: 0.6032


## Utilisation de Prédicteurs de base (par ex: stop words, mots fréquents)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from nltk.corpus import stopwords
import nltk
from scipy.sparse import hstack

# Extraction des mots les plus fréquents
all_words = " ".join(df["cleaned_text"]).split()
word_freq = Counter(all_words)
most_common_words = [word for word, _ in word_freq.most_common(2000)]  # Top 2000 mots

# Création d'un vecteur bag-of-words basé sur les mots fréquents
vectorizer = CountVectorizer(vocabulary=most_common_words)
X_bow = vectorizer.fit_transform(df["cleaned_text"])

# Création de la matrice de nouvelles features
X_base = df[["char_count", "longueur_moyenne_mot", "stopword_count", "word_count"]].values
#X_base = df[["stopword_count"]].values

# Combinaison avec le Bag-of-Words
X = hstack([X_bow, X_base])

# Séparation des données X et y
#X = X_bow
y = df["categorie"]

# Séparation train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entraînement et évaluation
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {"F1-score": f1, "Accuracy": accuracy}
    #print(f"{name}: F1-score = {f1:.4f}, Accuracy = {accuracy:.4f}")

# Affichage des résultats
print("\nComparaison des scores F1 et Accuracy:")
for model, scores in results.items():
    print(f"{model}: F1-score = {scores['F1-score']:.4f}, Accuracy = {scores['Accuracy']:.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Comparaison des scores F1 et Accuracy:
Logistic Regression: F1-score = 0.8184, Accuracy = 0.8176
Random Forest: F1-score = 0.9664, Accuracy = 0.9662
SVM: F1-score = 0.1827, Accuracy = 0.3514
Naive Bayes: F1-score = 0.9729, Accuracy = 0.9730
