In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm
import os
import pickle as pkl
import time
import nltk
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer


# Make sure the VADER lexicon is available locally; it is needed by the
# SentimentIntensityAnalyzer imported above.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Folder containing the movie-summaries files loaded below. The path is
# relative and keeps its trailing slash because file paths are built by plain
# string concatenation (DATA_FOLDER + filename).
DATA_FOLDER = 'data/MovieSummaries/'
# Folder for additional data files (not referenced in the cells shown here).
ADDITIONAL_FOLDER = 'data/AdditionalData/'

In [3]:
def read_txt(path):
    """Read a tab-separated text file without a header row into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pd.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table; because no header row is read, its columns are
        labelled with integers starting at 0.
    """
    return pd.read_csv(path, header=None, sep='\t')

In [4]:
# Load every raw file through the shared read_txt helper (tab-separated, no
# header row) so the parsing options live in one place instead of being
# repeated on each line.
plots = read_txt(DATA_FOLDER + 'plot_summaries.txt')
movies = read_txt(DATA_FOLDER + 'movie.metadata.tsv')
characters = read_txt(DATA_FOLDER + 'character.metadata.tsv')
names = read_txt(DATA_FOLDER + 'name.clusters.txt')
tvtropes = read_txt(DATA_FOLDER + 'tvtropes.clusters.txt')

In [5]:
# Rename the two columns of the plots dataframe (the file is read without a
# header row, so they are otherwise labelled 0 and 1).
plots.columns = ['wikipedia_movie_id', 'plot']

In [6]:
# Example plot text: the 'plot' entry of the row labelled 0
synopsis = plots.loc[0, 'plot']

In [7]:
def find_ending(plot, max_sentences=3):
    """Return the last sentences of a plot summary as a single string.

    Parameters
    ----------
    plot : str
        Full plot text. Non-string values (e.g. ``None`` or the ``NaN`` pandas
        uses for missing text) yield an empty string instead of raising.
    max_sentences : int, default 3
        Maximum number of trailing sentences to keep. Values <= 0 yield an
        empty string.

    Returns
    -------
    str
        The selected sentences, in their original order, joined by one space.
        If the plot has fewer than ``max_sentences`` sentences, all of them
        are returned.
    """
    # Missing text cannot be tokenised; treat it as "no ending".
    if not isinstance(plot, str):
        return ""
    # Guard the slice below: `[-0:]` would return *every* sentence, and a
    # negative count would drop sentences from the start instead of the end.
    if max_sentences <= 0:
        return ""
    last_sentences = TextBlob(plot).sentences[-max_sentences:]
    return " ".join(str(sentence) for sentence in last_sentences)

In [8]:
# Extract the ending of every plot summary (find_ending with its default
# sentence count) and store it in a new 'endings' column.
plots['endings'] = plots['plot'].apply(find_ending)
plots['endings']

0        Shlykov, a hard-working taxi driver and Lyosha...
1        However, before they can commit suicide, they ...
2        At Menon's funeral, Manapally Pavithran arrive...
3        Moran and Charley are arrested while the judge...
4        In October 1982, Lindy is found guilty and sen...
                               ...                        
42298    The story is about Reema , a young Muslim scho...
42299    Moved, Leo tells him they have. The assistant ...
42300    Now Parsons is creating instruments that are h...
42301    Mini does not recognize Rehman, who realises t...
42302    Mainwaring and his men become the pride of the...
Name: endings, Length: 42303, dtype: object

In [9]:
# Single shared analyzer instance, used by attribute_sentiments below.
sia = SentimentIntensityAnalyzer()

In [10]:
def attribute_sentiments(text):
    """Score a text with the module-level analyzer ``sia``.

    Parameters
    ----------
    text : str
        Text passed to ``sia.polarity_scores``.

    Returns
    -------
    tuple
        The ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'`` entries of the
        score dictionary, in that order.
    """
    scores = sia.polarity_scores(text)
    return tuple(scores[key] for key in ('neg', 'neu', 'pos', 'compound'))


In [11]:
# Names of the four score columns, in the order attribute_sentiments returns
# them (neg, neu, pos, compound).
SENTIMENT_COLUMNS = ['ending_negativity', 'ending_neutrality', 'ending_positivity', 'ending_compound_sentiment']

# Score every ending; each element of `sentiments` is a 4-tuple.
sentiments = plots['endings'].apply(attribute_sentiments)

# Reuse plots' own index so the join below aligns row-for-row even if plots
# no longer has a default 0..n-1 index.
sentiments_df = pd.DataFrame(sentiments.tolist(), columns=SENTIMENT_COLUMNS, index=plots.index)

# Drop score columns left over from a previous execution first: joining
# frames with overlapping column names raises, which made this cell fail when
# it was run a second time.
plots = plots.drop(columns=SENTIMENT_COLUMNS, errors='ignore').join(sentiments_df)
plots

Unnamed: 0,wikipedia_movie_id,plot,endings,ending_negativity,ending_neutrality,ending_positivity,ending_compound_sentiment
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...","Shlykov, a hard-working taxi driver and Lyosha...",0.093,0.812,0.095,0.0083
1,31186339,The nation of Panem consists of a wealthy Capi...,"However, before they can commit suicide, they ...",0.171,0.753,0.077,-0.7269
2,20663735,Poovalli Induchoodan is sentenced for six yea...,"At Menon's funeral, Manapally Pavithran arrive...",0.187,0.692,0.122,-0.5719
3,2231378,"The Lemon Drop Kid , a New York City swindler,...",Moran and Charley are arrested while the judge...,0.078,0.791,0.132,0.7096
4,595909,Seventh-day Adventist Church pastor Michael Ch...,"In October 1982, Lindy is found guilty and sen...",0.144,0.843,0.013,-0.9042
...,...,...,...,...,...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho...","The story is about Reema , a young Muslim scho...",0.066,0.843,0.091,0.2732
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look...","Moved, Leo tells him they have. The assistant ...",0.000,0.652,0.348,0.9231
42300,35102018,American Luthier focuses on Randy Parsons’ tra...,Now Parsons is creating instruments that are h...,0.000,0.746,0.254,0.9686
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se...","Mini does not recognize Rehman, who realises t...",0.034,0.883,0.083,0.4215
