# TP2 - Feature Engineering


In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import time

import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score

## Étape 1 : On considère les fichiers du répertoire bbcsport
* Indiquer les points marquants l'exploration.
* Pour chaque observation, indiquer l’opération à effectuer qui serait la plus appropriée.

In [4]:

# Définir le chemin du répertoire principal
base_dir = "c:/tmp/bbcsport"

# Initialiser une liste pour stocker les données
data = []

# Parcourir chaque sous-répertoire
for category in os.listdir(base_dir):
    category_path = os.path.join(base_dir, category)
    
    # Vérifier si c'est bien un répertoire
    if os.path.isdir(category_path):
        # Lire tous les fichiers dans le sous-répertoire
        for file_name in os.listdir(category_path):
            file_path = os.path.join(category_path, file_name)
            
            # Lire le contenu du fichier texte
            with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                text = file.read()
                
            # Ajouter aux données
            data.append((text, category))


# Créer le DataFrame
df = pd.DataFrame(data, columns=["texte", "categorie"])

df = df.reset_index()
df.rename(columns={"index": "ID"}, inplace=True)

print('Dimensions:',df.shape, '\n')

maxCharacters = 200
pd.set_option("display.max_colwidth", maxCharacters)  # Affiche jusqu'à maxCharacters caractères

# Afficher les 20 premières lignes du DataFrame avec retours à la ligne
print('Dataframe original:')
print(df.head(20))

# Mélanger les lignes du DataFrame de façon aléatoire
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


Dimensions: (737, 3) 

Dataframe original:
    ID  \
0    0   
1    1   
2    2   
3    3   
4    4   
5    5   
6    6   
7    7   
8    8   
9    9   
10  10   
11  11   
12  12   
13  13   
14  14   
15  15   
16  16   
17  17   
18  18   
19  19   

                                                                                                                                                                                                      texte  \
0   Claxton hunting first major medal\n\nBritish hurdler Sarah Claxton is confident she can win her first major medal at next month's European Indoor Championships in Madrid.\n\nThe 25-year-old has al...   
1   O'Sullivan could run in Worlds\n\nSonia O'Sullivan has indicated that she would like to participate in next month's World Cross Country Championships in St Etienne.\n\nAthletics Ireland have hinte...   
2   Greene sets sights on world title\n\nMaurice Greene aims to wipe out the pain of losing his Olympic 100m title in Athens 

## Étape 2: Appliquer des préditeurs simples

In [6]:
# 2.	Calculer le nombre de mots sur chaque ligne de texte 
df["word_count"] = df["texte"].apply(lambda x: len(str(x).split()))

#Calculer le nombre de caractères avec espaces pour chaque ligne
df["char_count"] =df["texte"].map(lambda x : len(str(x)))

#Calculer la longueur moyenne de mots (non incluant les espaces) pour chaque ligne
def calcul_moyenne_mots(x):
    mots = x.split()
    return sum(len(mot)  for mot in mots ) / len(mots)
df["longueur_moyenne_mot"] = df["texte"].map(lambda x : calcul_moyenne_mots(x))

#Calculer le nombre de stopwords sur chaque ligne
nltk.download('stopwords')

# Charger la liste des stopwords en anglais
stop_words = set(stopwords.words("english"))
print('Stop word:', stop_words)

# Fonction pour compter les stopwords dans un texte
def count_stopwords(text):
    words = text.split()  # Découper en mots
    return sum(1 for word in words if word.lower() in stop_words)

# Ajouter une colonne 'stopword_count' avec le nombre de stopwords par ligne
df["stopword_count"] = df["texte"].apply(count_stopwords)
print('Dataframe avec lignes aléatoires:')
df.head(20)

Stop word: {'be', "i've", "wasn't", 'with', 'off', 'our', "we'd", 'why', 'other', "they've", 'll', "she's", 'who', "you've", 'being', 'ma', 'these', 'before', 'its', 'on', 'don', 'where', "she'll", 'than', "we'll", "they'll", 'i', "needn't", "weren't", 'itself', 'needn', 'his', 'your', "you're", "it's", 'no', 'they', 't', "it'd", 'shan', 'then', 'those', 'theirs', 'wouldn', 'or', "we're", 'here', 'under', "that'll", 'myself', "shan't", "it'll", 'further', 'themselves', 'have', 'very', 'hadn', 'has', 'if', 'an', 'are', 'him', 'do', 'will', 'yourselves', 'by', 'between', "couldn't", "we've", 'weren', 'during', 'which', 'ours', 'each', 'their', 'couldn', 'been', 'doing', 'again', 'most', 'so', 'shouldn', 'as', 'over', "he'll", 'up', 'into', 'from', "mustn't", 'there', 'just', 'haven', 'too', 'does', "i'm", "shouldn't", 'now', 'below', 'hasn', "you'll", 'that', 'to', 'm', "they'd", 'at', 'can', 'only', 'y', 'ain', 'how', 'out', "you'd", 'about', 'doesn', 'more', 'a', 'he', "he's", 'through

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nyeck\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ID,texte,categorie,word_count,char_count,longueur_moyenne_mot,stopword_count
0,669,Johansson takes Adelaide victory\n\nSecond seed Joachim Johansson won his second career title with a 7-5 6-3 win over Taylor Dent at the Australian hardcourt championships in Adelaide.\n\nThe Swed...,tennis,163,925,4.662577,68
1,33,"Athens memories soar above lows\n\nWell, it's goodbye to another Olympic year and as usual there were plenty of highs and lows in Athens.\n\nObviously, there's no getting away from the differing f...",athletics,944,5104,4.356992,477
2,549,England coach faces rap after row\n\nEngland coach Andy Robinson is facing disciplinary action after criticising referee Jonathan Kaplan in his side's Six Nations defeat to Ireland.\n\nThe Rugby F...,rugby,249,1538,5.15261,97
3,199,New Zealand to step up security\n\nNew Zealand cricket officials are to review security after Australia players were hit by missiles during Saturday's opening one-day international.\n\nThe match w...,cricket,261,1595,5.099617,96
4,264,Irish finish with home game\n\nRepublic of Ireland manager Brian Kerr has been granted his wish for a home game as the final World Cup qualifier.\n\nIreland will close their bid to reach the 2006 ...,football,301,1798,4.923588,81
5,583,Ireland 17-12 South Africa\n\nRonan O'Gara scored all Ireland's points as the home side claimed only their second ever win over South Africa on an emotional day at Lansdowne Road.\n\nO'Gara's firs...,rugby,637,3760,4.88697,217
6,39,Radcliffe tackles marathon tasks\n\nPaula Radcliffe faces arguably the biggest test of her career in the New York City Marathon on Sunday.\n\nBack under the spotlight of public scrutiny she will a...,athletics,738,4281,4.776423,327
7,554,"Owen set for skipper role\n\nWales number eight Michael Owen says replacing Gareth Thomas as Wales' captain will be straightforward because of the leadership quality in the squad.\n\n""You dream ab...",rugby,299,1691,4.645485,131
8,585,Ireland call up uncapped Campbell\n\nUlster scrum-half Kieran Campbell is one of five uncapped players included in Ireland's RBS Six Nations squad.\n\nCampbell is joined by Ulster colleagues Roger...,rugby,379,2431,5.398417,108
9,609,Bath faced with Tindall ultimatum\n\nMike Tindall's agent has warned Bath they have until next week to improve their contract offer to the England man or risk losing him to a rival club.\n\nDipo A...,rugby,185,1017,4.486486,86


## Utilisation d'une technique d’extraction (par ex: TF–IDF Vectors) et choisir au moins 3 algorithmes de classification. De ce fait, on obtient au moins 3 modèles. On fera la comparaison en se basant sur la technique d’extraction.

In [8]:
# Prétraitement du texte
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)  # Suppression des ponctuations
    text = re.sub("\d+", "", text)  # Suppression des chiffres
    return text

df["cleaned_text"] = df["texte"].apply(clean_text)

# Séparation des données X et y
X = df["cleaned_text"]
y = df["categorie"]

# Vectorisation TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Séparation train/test
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Modèles de classification
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, solver="lbfgs"),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Entraînement et évaluation
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {"F1-score": f1, "Accuracy": accuracy}
    #print(f"{name}: F1-score = {f1:.4f}")
    #print(f"{name}: Accuracy = {accuracy:.4f}")

# Affichage des résultats
print("\nComparaison des scores F1 et Accuracy:")
for model, scores in results.items():
    print(f"{model}: F1-score = {scores['F1-score']:.4f}, Accuracy = {scores['Accuracy']:.4f}")



Comparaison des scores F1 et Accuracy:
Logistic Regression: F1-score = 0.9319, Accuracy = 0.9324
Random Forest: F1-score = 0.9525, Accuracy = 0.9527
SVM: F1-score = 0.9455, Accuracy = 0.9459
KNN: F1-score = 0.9587, Accuracy = 0.9595


## Utilisation de Prédicteurs de base (par ex: stop words, mots fréquents)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from nltk.corpus import stopwords
import nltk
from scipy.sparse import hstack

# Extraction des mots les plus fréquents
all_words = " ".join(df["cleaned_text"]).split()
word_freq = Counter(all_words)
most_common_words = [word for word, _ in word_freq.most_common(2000)]  # Top 2000 mots

# Création d'un vecteur bag-of-words basé sur les mots fréquents
vectorizer = CountVectorizer(vocabulary=most_common_words)
X_bow = vectorizer.fit_transform(df["cleaned_text"])

# Création de la matrice de nouvelles features
X_base = df[["char_count", "longueur_moyenne_mot", "stopword_count", "word_count"]].values
#X_base = df[["stopword_count"]].values

# Combinaison avec le Bag-of-Words
X = hstack([X_bow, X_base])

# Séparation des données X et y
#X = X_bow
y = df["categorie"]

# Séparation train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entraînement et évaluation
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {"F1-score": f1, "Accuracy": accuracy}
    #print(f"{name}: F1-score = {f1:.4f}, Accuracy = {accuracy:.4f}")

# Affichage des résultats
print("\nComparaison des scores F1 et Accuracy:")
for model, scores in results.items():
    print(f"{model}: F1-score = {scores['F1-score']:.4f}, Accuracy = {scores['Accuracy']:.4f}")



Comparaison des scores F1 et Accuracy:
Logistic Regression: F1-score = 0.9727, Accuracy = 0.9730
Random Forest: F1-score = 0.9932, Accuracy = 0.9932
SVM: F1-score = 0.2024, Accuracy = 0.3716
KNN: F1-score = 0.3270, Accuracy = 0.3378
