# Sentiment analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline
from scipy.special import softmax

import ast
from sentence_transformers import SentenceTransformer, util


  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Scratch check (presumably for the `category` parsing further down): verify
# that assigning the result of `.dropna().apply(...)` back to a column leaves
# the row that held NaN as NaN instead of shifting values.
DEMO_SEED = 0  # fixed seed so the cell shows the same frame on every run
df = pd.DataFrame(
    np.random.default_rng(DEMO_SEED).integers(1, 100, size=(10, 10)),
    columns=[f'Colonne_{i+1}' for i in range(10)],
    index=[f'Ligne_{i+1}' for i in range(10)],
)
# Cast the target column to float before inserting NaN: writing NaN into an
# int64 column relies on silent upcasting, which pandas has deprecated.
df['Colonne_6'] = df['Colonne_6'].astype(float)
df.iloc[5, 5] = np.nan
print(df)  # state before squaring; the trailing `df` shows the state after
df['Colonne_6'] = df['Colonne_6'].dropna().apply(lambda x: x**2)
df

          Colonne_1  Colonne_2  Colonne_3  Colonne_4  Colonne_5  Colonne_6  \
Ligne_1          26         27         99         57         73       83.0   
Ligne_2          78         41         81          4         24       10.0   
Ligne_3          49         45         46         35         70        7.0   
Ligne_4          69         78         24         64         63       40.0   
Ligne_5          55         54         26         29         95       53.0   
Ligne_6          11         88         56          5         51        NaN   
Ligne_7          33         77         74         18         22       44.0   
Ligne_8          70         68         54         87         97       63.0   
Ligne_9          67         11         44         58         58       53.0   
Ligne_10         52         66         14         42         43       92.0   

          Colonne_7  Colonne_8  Colonne_9  Colonne_10  
Ligne_1          75         58         16          77  
Ligne_2          83         7

Unnamed: 0,Colonne_1,Colonne_2,Colonne_3,Colonne_4,Colonne_5,Colonne_6,Colonne_7,Colonne_8,Colonne_9,Colonne_10
Ligne_1,26,27,99,57,73,6889.0,75,58,16,77
Ligne_2,78,41,81,4,24,100.0,83,74,45,77
Ligne_3,49,45,46,35,70,49.0,44,7,73,74
Ligne_4,69,78,24,64,63,1600.0,96,93,49,36
Ligne_5,55,54,26,29,95,2809.0,59,8,88,7
Ligne_6,11,88,56,5,51,,90,1,49,57
Ligne_7,33,77,74,18,22,1936.0,13,74,45,25
Ligne_8,70,68,54,87,97,3969.0,9,50,57,90
Ligne_9,67,11,44,58,58,2809.0,49,53,54,25
Ligne_10,52,66,14,42,43,8464.0,82,66,65,58


In [2]:
# Location of the extended movie table, relative to this notebook.
DATA_PATH = "../../data/our_movie_data_extended.csv"

# Read the table indexed by Wikipedia movie ID, then literal-evaluate every
# non-missing `category` cell. Missing cells are dropped before parsing and the
# result is re-aligned on the index when it is assigned back.
df_extended = (
    pd.read_csv(DATA_PATH, index_col="Wikipedia_movie_ID")
    .assign(category=lambda frame: frame['category'].dropna().apply(ast.literal_eval))
)

In [9]:
# Peek at the plot summary stored at positional index 1.
df_extended['summary'].iat[1]

'After being pulled through a time portal, Ash Williams lands in 1300 AD, where he is almost immediately captured by Lord Arthur\'s men, who suspect him to be an agent for Duke Henry, with whom Arthur is at war. He is enslaved along with the captured Henry, his gun and chainsaw confiscated, and is taken to a castle. Ash is thrown in a pit where he fights off a Deadite and regains his weapons from Arthur\'s Wise Man. After demanding Henry and his men be set free  and killing a deadite in full view of everyone, Ash is celebrated as a hero. He also grows attracted to Sheila, the sister of one of Arthur\'s fallen knights. According to the Wise Man, the only way Ash can return to his time is to retrieve the Necronomicon Ex-Mortis. After bidding goodbye to Sheila, Ash starts his search for the Necronomicon. As he enters a haunted forest, an unseen force pursues Ash through the woods. Fleeing, he ducks into a windmill where he crashes into a mirror. The small reflections of Ash climb out from

In [4]:
# Loaded SentenceTransformer instances keyed by model name, so the weights are
# loaded once per kernel instead of once per summary.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for `model_name`, loading it on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def summary_to_segment(summary, threshold=0.5, model_name='paraphrase-MiniLM-L6-v2'):
    """
    Split a plot summary into segments of consecutive, similar sentences.

    The summary is cut on ". ", every sentence is embedded, and a new segment
    starts whenever the cosine similarity between two consecutive sentences is
    below `threshold`.

    Parameters
    ----------
    summary : str or missing value
        Text to segment. Anything `pd.isna` reports as missing yields [].
    threshold : float, default 0.5
        Similarity below which two neighbouring sentences are put in
        different segments.
    model_name : str, default 'paraphrase-MiniLM-L6-v2'
        Name handed to `SentenceTransformer`.

    Returns
    -------
    list of str
        One string per segment, its sentences joined with a single space (the
        ". " separators are not restored). Empty list when the summary is
        missing or contains no non-blank sentence.
    """
    if pd.isna(summary):
        return []

    # Strip first and filter afterwards so whitespace-only pieces are removed
    # too; bail out before touching the model when nothing is left, otherwise
    # `sentences[0]` below would raise on an empty summary.
    sentences = [piece.strip() for piece in summary.split(". ")]
    sentences = [sentence for sentence in sentences if sentence]
    if not sentences:
        return []

    embeddings = _get_sentence_model(model_name).encode(sentences)

    # similarities[i] compares sentence i with sentence i + 1.
    similarities = [
        util.pytorch_cos_sim(embeddings[i], embeddings[i + 1]).item()
        for i in range(len(embeddings) - 1)
    ]

    # Walk the sentence pairs and close the running segment at each transition
    # point, i.e. wherever the similarity drops below the threshold.
    segments = []
    current_segment = [sentences[0]]
    for i, similarity in enumerate(similarities):
        if similarity < threshold:
            segments.append(" ".join(current_segment))
            current_segment = []
        current_segment.append(sentences[i + 1])
    segments.append(" ".join(current_segment))

    return segments

# Emotion columns in the order used for the placeholder row and, as far as the
# labels are present, for the classifier output.
_EMOTION_LABELS = ['neutral', 'joy', 'surprise', 'disgust', 'fear', 'anger', 'sadness']
_EMOTION_MODEL = "j-hartmann/emotion-english-distilroberta-base"
# Holds the text-classification pipeline once it has been built.
_EMOTION_CLASSIFIER_CACHE = {}


def _get_emotion_classifier():
    """Return the emotion pipeline, building it on first use only."""
    if _EMOTION_MODEL not in _EMOTION_CLASSIFIER_CACHE:
        _EMOTION_CLASSIFIER_CACHE[_EMOTION_MODEL] = pipeline(
            "text-classification", model=_EMOTION_MODEL, top_k=None
        )
    return _EMOTION_CLASSIFIER_CACHE[_EMOTION_MODEL]


def segments_to_emotions(segments, classifier=None):
    """
    Score every segment of one film against each emotion label.

    Parameters
    ----------
    segments : list of str
        Text segments of a single summary.
    classifier : callable, optional
        Called as `classifier(segments)`; must return, for each segment, a
        list of `{'label': ..., 'score': ...}` dicts. Defaults to the
        text-classification pipeline for `_EMOTION_MODEL`, which is created
        once and reused across calls.

    Returns
    -------
    pandas.DataFrame
        One row per segment and one column per label. Labels listed in
        `_EMOTION_LABELS` come first in that fixed order, followed by any
        other label the classifier returned. When `segments` is empty, a
        single all-NaN row with the `_EMOTION_LABELS` columns is returned.
    """
    # No segment to score: return a one-row NaN placeholder so the film still
    # shows up in the combined table.
    if len(segments) == 0:
        return pd.DataFrame([[np.nan] * len(_EMOTION_LABELS)], columns=_EMOTION_LABELS)

    if classifier is None:
        classifier = _get_emotion_classifier()

    emotions = classifier(segments)
    emotions_flattened = [{item['label']: item['score'] for item in entry} for entry in emotions]
    df_emotions = pd.DataFrame(emotions_flattened)

    # The frame's column order follows the key order of the returned dicts,
    # which may differ from call to call; pin it to a stable order instead.
    known = [label for label in _EMOTION_LABELS if label in df_emotions.columns]
    extra = [label for label in df_emotions.columns if label not in _EMOTION_LABELS]
    return df_emotions[known + extra]

def create_df_emotions_with_every_film(df):
    """
    Build one table with the emotion scores of every segmented film in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a `summary_segmented` column (list of segments per film, or
        NaN when the film has not been segmented) and a `category` column.
        Its index is used as the film identifier.

    Returns
    -------
    pandas.DataFrame
        The frames returned by `segments_to_emotions` for each film, stacked.
        Every row is indexed by the identifier of its film and carries that
        film's `category` value. Films whose `summary_segmented` is NaN are
        skipped; an empty DataFrame is returned when no film qualifies.
    """
    # Keep identifier, segments and category together while filtering. Using
    # the position inside the filtered series against `df.index` would attach
    # the wrong identifier as soon as a NaN row precedes a valid one.
    has_segments = df['summary_segmented'].notna()
    films = df.loc[has_segments, ['summary_segmented', 'category']]

    per_film = []
    rows = films.itertuples(index=True, name=None)
    for film_id, segments, category in tqdm(rows, total=len(films)):
        emotions = segments_to_emotions(segments)
        n_rows = len(emotions)
        # Set the index to the film identifier, repeated for each segment row.
        emotions.index = [film_id] * n_rows
        # Same category value on every row of this film; going through an
        # object Series keeps a list-valued category as one cell per row.
        emotions['category'] = pd.Series([category] * n_rows, dtype=object).to_numpy()
        per_film.append(emotions)

    # Concatenate once at the end rather than growing a frame inside the loop.
    if not per_film:
        return pd.DataFrame()
    return pd.concat(per_film)

In [5]:
# Segmentation plus classification is slow (the recorded run of this cell took
# about 53 minutes for 1000 films), so only the first N_FILMS_TO_PROCESS rows
# are processed; the remaining rows get NaN in `summary_segmented`.
# Set N_FILMS_TO_PROCESS to None to process every row.
N_FILMS_TO_PROCESS = 1000
df_extended['summary_segmented'] = (
    df_extended['summary'].iloc[:N_FILMS_TO_PROCESS].apply(summary_to_segment)
)
df_emotions_extended = create_df_emotions_with_every_film(df_extended)
df_emotions_extended.sample(5)

1000it [52:58,  3.18s/it]


Unnamed: 0,neutral,joy,surprise,disgust,fear,anger,sadness,category
330,0.446200,0.147301,0.138631,0.118294,0.077507,0.038214,0.033855,"[Comedy, Drama]"
3217,0.065840,0.003035,0.096655,0.005681,0.782216,0.042827,0.003746,"[Others, Family/Animation, Horror, Drama, Fant..."
3217,0.144797,0.003058,0.014325,0.214261,0.085287,0.397198,0.141074,"[Others, Family/Animation, Horror, Drama, Fant..."
3217,0.042516,0.018551,0.013053,0.073122,0.021625,0.737254,0.093879,"[Others, Family/Animation, Horror, Drama, Fant..."
3217,0.779588,0.079820,0.020288,0.086737,0.003069,0.011677,0.018821,"[Others, Family/Animation, Horror, Drama, Fant..."
...,...,...,...,...,...,...,...,...
168570,,,,,,,,"[Drama, Others]"
168571,0.190531,0.587667,0.010018,0.033155,0.001434,0.009787,0.167410,"[Thriller, Romance, Drama, Others]"
168571,0.662094,0.230744,0.004565,0.050244,0.003976,0.031725,0.016653,"[Thriller, Romance, Drama, Others]"
168571,0.825387,0.029670,0.059088,0.023844,0.006891,0.008173,0.046947,"[Thriller, Romance, Drama, Others]"


In [24]:
# Distinct film identifiers in the emotions table. The raw index repeats
# because each film contributes one row per segment.
unique_idx = df_emotions_extended.index.unique()
unique_idx

Index([   330,   3217,   3333,   3746,   3837,   3947,   4227,   4231,   4560,
         4726,
       ...
       167857, 167924, 168483, 168491, 168498, 168551, 168554, 168561, 168570,
       168571],
      dtype='int64', length=1000)