## Import all libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
from itertools import chain

import spacy

data_folder = 'data/'

# Table of contents
1. [Load CMU Movie Summary Corpus data](#loadCMU)
2. [Data exploration of the movie metadata](#paragraph1)
    1. [Sub paragraph](#subparagraph1)
3. [Data exploration of the character metadata](#paragraph2)
4. [Data exploration of summaries](#paragraph2)
5. [Summaries analysis pipeline](#paragraph2)

Blabla introduce what we are doing here 

# Load CMU Movie Summary Corpus <a name="loadCMU"></a>

## Movies Metadata

In [24]:
#Load data and set indexes
movie_metadata = pd.read_csv(data_folder + "movie.metadata.tsv", sep='\t', header = None)
movie_metadata.columns = ['movie_ID','FB_ID','movie_name','release_date','box_office','length','languages','country','genres']

#Force float type to the movie ID column, set as index
movie_metadata.movie_ID = movie_metadata.movie_ID.astype(int)
movie_metadata = movie_metadata.set_index('movie_ID')

#Drop freebase database ID, we don't need it
movie_metadata = movie_metadata.drop(columns=['FB_ID'])

#Remove movies with non-defined release date
#Convert release date to datetime type, only keep year 
movie_metadata = movie_metadata[movie_metadata.release_date.notna()]
movie_metadata['release_date'] = movie_metadata.release_date.apply(lambda d:datetime.strptime(str(d[0:4]), "%Y"))
movie_metadata['release_date'] = [date.year for date in movie_metadata.release_date]

#Change release date 1010, it's a mistake movie actually release in 2010
movie_metadata = movie_metadata.replace(1010,2010)

#Clean languages, country, genres columns
movie_metadata['languages'] = [lang.split('"')[3::4] for lang in movie_metadata.languages]
movie_metadata['country'] = [lang.split('"')[3::4] for lang in movie_metadata.country]
movie_metadata['genres'] = [lang.split('"')[3::4] for lang in movie_metadata.genres]

movie_metadata.head(10)

Unnamed: 0_level_0,movie_name,release_date,box_office,length,languages,country,genres
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
975900,Ghosts of Mars,2001,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
28463795,Brun bitter,1988,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]"
9363483,White Of The Eye,1987,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri..."
261236,A Woman in Flames,1983,,106.0,[German Language],[Germany],[Drama]
13696889,The Gangsters,1913,,35.0,"[Silent film, English Language]",[United States of America],"[Short Film, Silent film, Indie, Black-and-whi..."
18998739,The Sorcerer's Apprentice,2002,,86.0,[English Language],[South Africa],"[Family Film, Fantasy, Adventure, World cinema]"
10408933,Alexander's Ragtime Band,1938,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]"
9997961,Contigo y aquí,1974,,,[Spanish Language],[Argentina],"[Musical, Drama, Comedy]"
2345652,City of the Dead,1960,,76.0,[English Language],[United Kingdom],"[Horror, Supernatural]"


## Characters metadata

In [25]:
#Load character metadata
character_metadata = pd.read_csv(data_folder + "character.metadata.tsv", sep='\t', header=None)

#only keep movie_ID , name, gender, and age
character_metadata = character_metadata.iloc[:,[0,3,5,9]] 
character_metadata.columns = ['movie_ID','name','gender','age']
character_metadata.name = character_metadata['name'].astype(str)
character_metadata.movie_ID = character_metadata['movie_ID'].astype(int)
character_metadata = character_metadata.set_index('movie_ID')

#Drop the characters without gender defined
character_metadata = character_metadata[character_metadata.gender.notna()]

character_metadata.head(10)

Unnamed: 0_level_0,name,gender,age
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
975900,Akooshay,F,42.0
975900,Lieutenant Melanie Ballard,F,27.0
975900,Desolation Williams,M,32.0
975900,Sgt Jericho Butler,M,33.0
975900,Bashira Kincaid,F,23.0
975900,Commander Helena Braddock,F,52.0
975900,Whitlock,F,56.0
975900,Big Daddy Mars,M,
975900,Michael Descanso,M,30.0
975900,Uno,M,


## Plot summaries

In [26]:
summary_file = open(data_folder + 'plot_summaries.txt', 'r', encoding="utf8")
summaries = summary_file.readlines()

#Separate summary and movie ID
summaries = pd.DataFrame([summ.split("\t") for summ in summaries], columns=['movie_ID','summary'])
summaries['summary'] = summaries.summary.replace(r'\n',' ', regex=True) #Remove /n at end of summaries
summaries['movie_ID'] = summaries['movie_ID'].astype(int)
summaries = summaries.set_index('movie_ID')

#Count number of words in each summary
def count_words_simple(x):
    return len(x.split(" "))

summaries['length'] = summaries['summary'].apply(count_words_simple)

summaries.head()

Unnamed: 0_level_0,summary,length
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
23890098,"Shlykov, a hard-working taxi driver and Lyosha...",26
31186339,The nation of Panem consists of a wealthy Capi...,781
20663735,Poovalli Induchoodan is sentenced for six yea...,505
2231378,"The Lemon Drop Kid , a New York City swindler,...",854
595909,Seventh-day Adventist Church pastor Michael Ch...,398
