# Preprocessing

In [None]:
import nltk
# Fetch the NLTK resources the preprocessing code relies on.
# 'punkt' backs word_tokenize on older NLTK releases.
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) look up 'punkt_tab' instead of the
# pickled 'punkt' models; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')
# WordNet data is required by WordNetLemmatizer.
nltk.download('wordnet')
# Stop word lists are read through nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Single lemmatizer instance, created once at module level and reused by
# preprocess_text for every token.
lemmatizer = WordNetLemmatizer()
# English stop words stored as a set for fast per-token membership tests.
STOP_WORDS = set(stopwords.words('english'))

def preprocess_text(text):
    """Return *text* reduced to a space-separated string of lemmatized tokens.

    The text is lower-cased, then URLs, e-mail-like tokens and digits are
    removed and every remaining punctuation character is replaced by a
    space. The result is tokenized; tokens that are stop words, contain
    non-alphabetic characters, or have two characters or fewer are dropped,
    and the survivors are lemmatized.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned tokens joined by single spaces, or '' when *text* is not
        a string or contains only whitespace.
    """
    # Guard: anything that is not a non-blank string yields an empty result.
    if not (isinstance(text, str) and text.strip()):
        return ''

    # Ordered (pattern, replacement) pairs; order matters because URLs and
    # e-mail addresses must be removed before their punctuation is split up.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'\S+@\S+', ''),                # e-mail-like tokens
        (r'\d+', ''),                    # digit runs
        (r'[^\w\s]', ' '),               # punctuation -> space
    )
    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    kept = []
    for word in word_tokenize(normalized):
        # Skip stop words, non-alphabetic tokens and very short tokens.
        if word in STOP_WORDS or not word.isalpha() or len(word) <= 2:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

In [None]:
import pandas as pd

from pathlib import Path

# Load the merged dataset, clean the 'title' and 'abstract' columns with
# preprocess_text, and write the enriched table to the output CSV.
INPUT_PATH = "data/merged_generative_ai_data_20250516.csv"
OUTPUT_PATH = Path("preprocessing/processed_generative_ai_data.csv")

df = pd.read_csv(INPUT_PATH)

print("Prétraitement des titres...")
df['cleaned_title'] = df['title'].apply(preprocess_text)

print("Prétraitement des résumés...")
df['cleaned_abstract'] = df['abstract'].apply(preprocess_text)

# preprocess_text returns '' for missing or blank values, so a plain
# concatenation would leave a stray leading/trailing space (or a lone " ")
# whenever one side is empty; strip so empty results are really empty.
df['cleaned_text'] = (
    df['cleaned_title'] + " " + df['cleaned_abstract']
).str.strip()

# Make sure the target directory exists; to_csv does not create it.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

print("Prétraitement terminé et sauvegardé dans 'processed_generative_ai_data.csv'")

In [None]:
import pandas as pd
# Reload the saved file and count missing values per column.
# Note: read_csv parses empty fields as NaN by default, so rows whose
# cleaned columns were written as empty strings are counted here.
df = pd.read_csv(
    "preprocessing/processed_generative_ai_data.csv",
)
df.isna().sum()