In [None]:
""" https://www.kaggle.com/datasets/Cornell-University/movie-dialog-corpus """

In [None]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# conversations = pd.read_csv(
#     "./kaggledialogue/movie_conversations.tsv", 
#     sep='\t', 
#     encoding='ISO-8859-2',
#     names = ['charID_1', 'charID_2', 'movieID', 'conversation']
# )


# Load every line of dialogue, indexed by lineID.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; malformed rows are still skipped and each
# skipped row is reported as a warning.
lines = pd.read_csv(
    "./kaggledialogue/movie_lines.tsv",
    encoding='utf-8-sig',
    sep='\t',
    on_bad_lines='warn',
    header=None,
    names=['lineID', 'charID', 'movieID', 'charName', 'text'],
    index_col=['lineID']
)

# Rows without any text cannot be joined into a script later, so drop them here.
lines = lines.dropna(subset=['text'])

# Load per-character metadata, indexed by charID.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; malformed rows are skipped with a warning.
characters = pd.read_csv(
    "./kaggledialogue/movie_characters_metadata.tsv",
    sep='\t',
    header=None,
    on_bad_lines='warn',
    names=['charID', 'charName', 'movieID', 'movieName', 'gender', 'score'],
    index_col=['charID']
)

# Load per-movie metadata (title, year, IMDB rating, votes, genres), indexed by movieID.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; malformed rows are skipped with a warning.
titles = pd.read_csv(
    "./kaggledialogue/movie_titles_metadata.tsv",
    sep='\t',
    header=None,
    on_bad_lines='warn',
    names=['movieID', 'title', 'year', 'ratingIMDB', 'votes', 'genresIMDB'],
    index_col=['movieID']
)

In [None]:
# Build one space-joined script per movie from that movie's distinct dialogue lines.
scripts = (
    lines.groupby('movieID')['text']
    .unique()
    .str.join(' ')
    .to_frame()
)
# Attach each script to its movie metadata and keep only movies that have dialogue.
title_corpus = titles.join(scripts, on='movieID').dropna(subset=['text'])

In [None]:
# Score each movie's complete dialogue with VADER, keeping only the
# 'compound' entry of the returned polarity scores.
s = SentimentIntensityAnalyzer()
compound_scores = [
    s.polarity_scores(movie)['compound']
    for movie in title_corpus['text']
]

title_corpus['sentiment_score'] = compound_scores

In [None]:
# Store the VADER compound scores on the frame (same statement as the last line of the previous cell).
title_corpus['sentiment_score'] = compound_scores

In [None]:
# Strip the "/I" marker that appears in some year values.
# The mask is built once, matches the literal text (no regex), and treats
# missing years as "no marker" so the boolean indexing cannot fail on NaN.
has_marker = title_corpus['year'].str.contains('/I', regex=False, na=False)
title_corpus.loc[has_marker, 'year'] = (
    title_corpus.loc[has_marker, 'year'].str.replace('/I', '', regex=False)
)

# Collapse the compound sentiment to its sign: +1 for scores >= 0, otherwise -1.
title_corpus['project_score'] = title_corpus['sentiment_score'].apply(
    lambda score: 1 if score >= 0 else -1
)

In [175]:
# Turn the bracketed, quoted genre text into a plain space-separated string:
# trim brackets/quotes/spaces from both ends, then remove the remaining quotes.
genres_unwrapped = title_corpus['genresIMDB'].str.strip('[]\' \'').str.replace('\'', '')
title_corpus['genresIMDB'] = genres_unwrapped
# Discard movies with no genre value and renumber the rows from 0.
title_corpus = title_corpus.dropna(subset=['genresIMDB'], axis=0).reset_index(drop=True)

In [177]:
# get unique genres list
genres = title_corpus['genresIMDB'].str.split(' ')
# Iterate over the values instead of positional labels so this does not depend
# on the frame having a 0..n-1 index, and skip empty tokens: an empty genre
# string would otherwise add '' to the list, which matches every row in a
# later `str.contains` lookup.
res = pd.Series([
    genre
    for movie_genres in genres
    for genre in movie_genres
    if genre
])
genre_list = res.unique()

In [179]:
# save (create the output directory first so the cell also works on a fresh checkout)
output_path = Path("./output/title_corpus.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
title_corpus.to_csv(output_path, index=False)
# load
#title_corpus = pd.read_csv("./output/title_corpus.csv")

In [None]:
# Mean IMDB rating per genre, split by sentiment sign (project_score -1 / +1).
# Split the genre strings once so membership is tested on whole genre names;
# a substring test would count 'musical' movies under 'music'.
genre_tokens = title_corpus['genresIMDB'].fillna('').str.split(' ')
neg_scores = []
pos_score = []
for genre in genre_list:
    in_genre = genre_tokens.apply(lambda tokens: genre in tokens)
    # Select the rating column before aggregating so text columns are never averaged.
    val = title_corpus[in_genre].groupby('project_score')['ratingIMDB'].mean()
    # Look the groups up by label: when a genre has only one sentiment group,
    # positional access would file that value under the wrong list.
    neg_scores.append(val.get(-1, 0))
    pos_score.append(val.get(1, 0))


In [181]:
# Number of movies per genre, split by sentiment sign (project_score -1 / +1).
# Split the genre strings once so membership is tested on whole genre names;
# a substring test would count 'musical' movies under 'music'.
genre_tokens = title_corpus['genresIMDB'].fillna('').str.split(' ')
neg_scores = []
pos_score = []
for genre in genre_list:
    in_genre = genre_tokens.apply(lambda tokens: genre in tokens)
    val = title_corpus[in_genre].groupby('project_score')['title'].count()
    # Look the groups up by label: when a genre has only one sentiment group,
    # positional access would file that count under the wrong list.
    neg_scores.append(val.get(-1, 0))
    pos_score.append(val.get(1, 0))


In [187]:
# Show the assembled per-movie table (title metadata, joined dialogue text, sentiment_score and project_score).
title_corpus

Unnamed: 0,title,year,ratingIMDB,votes,genresIMDB,text,sentiment_score,project_score
0,10 things i hate about you,1999,6.9,62847.0,comedy romance,They do not! They do to! I hope so. She okay? ...,0.9995,1
1,1492: conquest of paradise,1992,6.2,10421.0,adventure biography drama history,Can't be that far I say. Also I don't like th...,0.9988,1
2,15 minutes,2001,6.1,25854.0,action crime drama thriller,Officers there's your killer do your duty arre...,-0.9922,-1
3,2001: a space odyssey,1968,8.4,163227.0,adventure mystery sci-fi,We're trying to get there. I hope we can. CONT...,0.9999,1
4,48 hrs.,1982,6.9,22289.0,action comedy crime drama thriller,Great just great. That we do. And we put air i...,-0.9995,-1
...,...,...,...,...,...,...,...,...
600,watchmen,2009,7.8,135229.0,action crime fantasy mystery sci-fi thriller,Please. What?! The access code's been changed....,-0.9975,-1
601,xxx,2002,5.6,53505.0,action adventure crime,You're a jerk-off you know that? All for show....,-0.9917,-1
602,x-men,2000,7.4,122149.0,action sci-fi,Beast. Storm. Try to look for the highest van...,0.9660,1
603,young frankenstein,1974,8.0,57618.0,comedy sci-fi,Herr Falkstein -- you must go at once and pres...,1.0000,1
