In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from collections import Counter
from transformers import pipeline
import re
import torch

# NLTK resources used by the tokenizer, the stop-word list and the lemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load the dialogues and keep only the lines spoken by the main characters
df = pd.read_csv("friends_dialogues.csv")
main_characters = ['Chandler', 'Joey', 'Monica', 'Phoebe', 'Rachel', 'Ross']
# Rows without dialogue text are dropped because both the emotion model and
# the regex clean-up require strings. The explicit copy makes the filtered
# frame independent of the one read from disk, so adding columns to it later
# does not operate on a slice (no SettingWithCopyWarning).
df = (
    df[df["character"].isin(main_characters)]
    .dropna(subset=["line"])
    .copy()
)

# DistilBERT model for emotion classification, on GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
emotion_pipeline = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion", device=device)

# Shared text-processing tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Functions
def preprocess_text(text):
    """Return the lemmatized content words of *text*.

    Every character that is neither a word character nor whitespace is
    removed and the text is lower-cased before tokenizing, so contractions
    collapse into a single token (e.g. "don't" becomes "dont"). Tokens that
    are not purely alphabetic or that appear in ``stop_words`` are discarded;
    the remaining ones are lemmatized with the module-level ``lemmatizer``.
    """
    cleaned = re.sub(r'[^\w\s]', '', text).lower()
    kept = []
    for word in word_tokenize(cleaned):
        # Skip numbers, mixed tokens and stop words
        if not word.isalpha() or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return kept

def analyze_emotion_batch(texts, batch_size=32):
    """Classify *texts* with the module-level ``emotion_pipeline``.

    The texts are sent to the pipeline in consecutive slices of at most
    ``batch_size`` items and the per-text predictions are concatenated in
    input order.

    Args:
        texts: Sequence of strings to classify.
        batch_size: Maximum number of texts passed to the pipeline per call.

    Returns:
        A list with one pipeline prediction per input text (the caller reads
        the ``"label"`` and ``"score"`` keys of each prediction). An empty
        input yields an empty list without invoking the pipeline.
    """
    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # truncation=True is forwarded to the tokenizer so that a line longer
        # than the model's maximum sequence length is cut instead of making
        # the forward pass fail.
        results.extend(emotion_pipeline(batch, truncation=True))
    return results

# Emotion analysis: classify every dialogue line kept in the DataFrame
emotion_results = analyze_emotion_batch(df["line"].tolist())
emotions = [
    dict(emotion=prediction["label"], score=prediction["score"])
    for prediction in emotion_results
]

# Attach the predicted label and its score to the DataFrame
df["emotion"] = [entry["emotion"] for entry in emotions]
df["score_emotion"] = [entry["score"] for entry in emotions]

# Verbal-tic extraction
def extract_tics(dialogues):
    """Return the most frequent token trigram across *dialogues*.

    Each dialogue is reduced to tokens with ``preprocess_text`` and its
    trigrams are tallied. The result has the shape of
    ``Counter.most_common(1)``: a list holding one ``(trigram, count)`` pair,
    or an empty list when no trigram could be formed.
    """
    trigram_counts = Counter()
    for dialogue in dialogues:
        trigram_counts.update(ngrams(preprocess_text(dialogue), 3))  # trigrams
    return trigram_counts.most_common(1)

# Per character: most frequent trigram ("verbal tic") and a line that uses it
phrases_preferees = {}
tics_par_personnage = {}

for personnage in main_characters:
    dialogues_personnage = df.loc[df["character"] == personnage, "line"].tolist()
    tics = extract_tics(dialogues_personnage)
    tics_par_personnage[personnage] = tics

    if not tics:
        phrases_preferees[personnage] = "N/A"
        continue

    # The trigram is made of lemmatized, stop-word-free tokens, so it has to
    # be looked up in the same token stream. Searching the raw text would miss
    # any line where stop words, punctuation or inflected forms sit between
    # the three words.
    top_trigram = tics[0][0]
    phrases_preferees[personnage] = next(
        (
            dialogue
            for dialogue in dialogues_personnage
            if top_trigram in ngrams(preprocess_text(dialogue), 3)
        ),
        "",
    )

# Summary table: one row per character.
# tics[0][0] is already the tuple of three words; joining it directly gives
# the trigram text (joining each word separately would space out its letters).
phrases_tics_df = pd.DataFrame({
    "Personnage": main_characters,
    "Phrase préférée": [phrases_preferees[personnage] for personnage in main_characters],
    "Tics de langage": [
        " ".join(tics_par_personnage[personnage][0][0]) if tics_par_personnage[personnage] else "N/A"
        for personnage in main_characters
    ],
})

# Write the results to CSV files. The per-line DataFrame only carries the
# emotion columns; the per-character phrase and tic live in the summary table.
df.to_csv("friends_dialogues_emotions.csv", index=False)
phrases_tics_df.to_csv("friends_dialogues_phrases_tics.csv", index=False)

print("Résultats écrits dans friends_dialogues_emotions.csv et friends_dialogues_phrases_tics.csv")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cpu


Résultats écrits dans friends_dialogues_emotions.csv et friends_dialogues_phrases_tics.csv
