# Applied Data Analysis Project
**Team**: ToeStewBrr - Alexander Sternfeld, Marguerite Thery, Antoine Bonnet, Hugo Bordereaux

**Dataset**: CMU Movie Summary Corpus


In [None]:
import requests
import tarfile
import urllib
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as plt
import re
import gzip

## 1. Loading data

We first extract all files from the [MoviesSummaries dataset](http://www.cs.cmu.edu/~ark/personas/). 

In [None]:
# Download and unpack the MovieSummaries archive, but only when the target
# folder is missing, so re-running the notebook does not repeat the download.
import urllib.request  # `import urllib` alone does not guarantee that the `request` submodule is loaded

if not os.path.exists('Data/MovieSummaries'):
    filename = 'http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz'
    # The context managers close both the HTTP response and the tar handle
    # even when extraction fails part-way through.
    with urllib.request.urlopen(filename) as response:
        with tarfile.open(fileobj=response, mode="r:gz") as my_tar:
            my_tar.extractall('./Data')  # specify which folder to extract to

Note: Extraction of CoreNLP files takes 15min42s, while conversion takes 33s. 

In [None]:
import shutil
import urllib.request  # `import urllib` alone does not guarantee that the `request` submodule is loaded

# Extract all coreNLP files to Data/CoreNLP (skipped when the folder already exists)
if not os.path.exists('Data/CoreNLP'):
    coreNLPfilename = 'http://www.cs.cmu.edu/~ark/personas/data/corenlp_plot_summaries.tar'
    # The context managers close the HTTP response and the tar handle even if
    # extraction fails. Mode "r|" reads the tar as a forward-only stream.
    with urllib.request.urlopen(coreNLPfilename) as response:
        with tarfile.open(fileobj=response, mode="r|") as my_tar:
            my_tar.extractall(path='./Data/CoreNLP')  # specify which folder to extract to

# Convert every gzip file in directory Data/CoreNLP to xml format
raw_dir = 'Data/CoreNLP/corenlp_plot_summaries'
extracted_dir = 'Data/CoreNLP/corenlp_plot_summaries_xml'
if not os.path.exists(extracted_dir):
    # Decompress into a scratch folder and rename it only once every file is
    # done: an interrupted run then never leaves a half-filled
    # `extracted_dir` that the existence check above would treat as complete.
    partial_dir = extracted_dir + '.partial'
    os.makedirs(partial_dir, exist_ok=True)
    for filename in os.listdir(raw_dir):
        f = os.path.join(raw_dir, filename)
        # Skip sub-directories and anything that is not a .gz file, so the
        # suffix strip below never truncates an unrelated name.
        if not os.path.isfile(f) or not filename.endswith('.gz'):
            continue
        xml_file = os.path.join(partial_dir, filename[:-3])  # drop the '.gz' suffix
        with gzip.open(f, 'rb') as f_in, open(xml_file, 'wb') as f_out:
            # Stream in chunks instead of loading each file fully into memory.
            shutil.copyfileobj(f_in, f_out)
    os.rename(partial_dir, extracted_dir)



## 2. Pre-processing data

### 2.1. Plot summaries

`plot_summaries.txt [29 M]`: Plot summaries of 42,306 movies extracted from the November 2, 2012 dump of English-language Wikipedia.  Each line contains the Wikipedia movie ID (which indexes into movie.metadata.tsv) followed by the summary.

In [None]:
# Load the plot summaries: a header-less, tab-separated file whose first
# column (the Wikipedia ID) becomes the index.
plot_cols = ['Wikipedia ID', 'Summary']
plot_path = 'Data/MovieSummaries/plot_summaries.txt'
plot_df = pd.read_csv(
    plot_path,
    sep='\t',
    header=None,
    names=plot_cols,
    index_col=0,
)
plot_df

In [None]:
# For Hugo: this cell reduces every word to its lexical root (stemming),
# using the out-of-the-box Porter algorithm from nltk.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Only the first five summaries are processed. Each one is split on single
# spaces, stemmed word by word and joined back into one string.
plot_stemmed = [
    " ".join(stemmer.stem(word) for word in summary.split(" "))
    for summary in plot_df['Summary'].iloc[:5]
]


In [None]:
# Note: the word-count conversion and the tf-idf weighting both produce sparse
# matrices, which are destined to be used by NNs. We need something different.
# Word count conversion
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_options = {
    'strip_accents': 'ascii',
    'stop_words': 'english',
}
count_vect = CountVectorizer(**vectorizer_options)
plot_counts = count_vect.fit_transform(plot_stemmed)


In [None]:
# TF-IDF weighting: fit the transformer on the word counts, then re-weight
# those same counts.
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(plot_counts)
plot_data = tfidf_transformer.transform(plot_counts)

In [None]:
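# Normalize (not applied yet: the call below is commented out and still refers to `newsgroups_trainData`, which is not defined in the cells above)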
# from sklearn.preprocessing import normalize
# normalize(newsgroups_trainData, norm='l1', axis=0, copy=False)

### 2.2. Movie metadata

`movie.metadata.tsv.gz [3.4 M]`: Metadata for 81,741 movies, extracted from the November 4, 2012 dump of Freebase.  Tab-separated; columns:

1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)


In [None]:
def strip_encoding(x):
    """Extract the names from a ``{"id": "name", ...}`` encoded string.

    Every double-quoted substring of *x* is collected and every second one
    (the value of each id/name pair) is kept. The suffixes ' Language' and
    ' language' are removed from the kept names.

    Parameters
    ----------
    x : str or missing value
        Raw cell content, e.g. ``'{"/m/02h40lc": "English Language"}'``.

    Returns
    -------
    list of str, or numpy.nan
        The extracted names. NaN is returned when *x* is the empty mapping
        ``'{}'`` or is not a string at all (e.g. a NaN cell), so that missing
        values pass through instead of raising a TypeError in the regex.
    """
    if not isinstance(x, str) or x == '{}':
        return np.nan
    quoted = re.findall(r'"(.*?)"', x)
    # Quoted substrings alternate id, name, id, name, ...: keep the names.
    return [name.replace(' Language', '').replace(' language', '') for name in quoted[1::2]]

In [None]:
# Load the movie metadata (header-less, tab-separated, Wikipedia ID as index).
# The Freebase ID column is read as a string.
movie_cols = [
    'Wikipedia ID', 'Freebase ID', 'Name', 'Release date',
    'Box office revenue', 'Runtime', 'Languages', 'Countries', 'Genres',
]
movie_path = 'Data/MovieSummaries/movie.metadata.tsv'
movie_df = pd.read_csv(
    movie_path,
    sep='\t',
    header=None,
    names=movie_cols,
    index_col=0,
    dtype={'Freebase ID': str},
)
# These three columns are passed through strip_encoding, one after another.
for encoded_col in ('Languages', 'Countries', 'Genres'):
    movie_df[encoded_col] = movie_df[encoded_col].apply(strip_encoding)
movie_df

### 2.3. Character metadata

`character.metadata.tsv.gz [14 M]`: Metadata for 450,669 characters aligned to the movies above, extracted from the November 4, 2012 dump of Freebase.  Tab-separated; columns:

1. Wikipedia movie ID
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID


In [None]:
# Load the character metadata (header-less, tab-separated, Wikipedia ID as index).
char_cols = [
    'Wikipedia ID', 'Freebase ID', 'Release date', 'Character name',
    'Date of birth', 'Gender', 'Height', 'Ethnicity', 'Actor name',
    'Actor age at release', 'Freebase character/map ID',
    'Freebase character ID', 'Freebase actor ID',
]
char_path = 'Data/MovieSummaries/character.metadata.tsv'
char_df = pd.read_csv(
    char_path,
    sep='\t',
    header=None,
    names=char_cols,
    index_col=0,
)
char_df

## 3. Exploratory Data Analysis


### 3.1. Analysing romantic genres

In [None]:
## One notices that there are several types of romantic movies: romantic comedy, romance film, romantic drama, ...
romance_genres = ['Romantic comedy', 'Romance Film', 'Romantic drama', 'Romantic fantasy', 'Romantic thriller']


def _lists_romance_genre(genres):
    """Return True when *genres* is a list holding at least one entry of romance_genres."""
    if type(genres) is not list:
        # Non-list cells (e.g. NaN for movies without genres) never match.
        return False
    return any(genre in romance_genres for genre in genres)


is_romance = movie_df['Genres'].apply(_lists_romance_genre)
romance_movies = movie_df[is_romance]

In [None]:
# Organize by category
def _genre_subset(genre):
    """Return an independent copy of the romance_movies rows whose genre list contains *genre*.

    The membership test is done against each movie's list of genres
    (`genre in genres`). The previous form tested whether a movie's genre was
    a *substring* of the category name. The mask is also built from
    romance_movies itself rather than from movie_df, so it lines up with the
    frame being indexed without relying on index re-alignment.
    """
    has_genre = romance_movies['Genres'].apply(
        lambda genres: isinstance(genres, list) and genre in genres
    ).astype(bool)
    # .copy() so that later cells can add columns to the subset without
    # writing into a slice of romance_movies.
    return romance_movies[has_genre].copy()


romantic_comedy = _genre_subset(romance_genres[0])
romantic_film = _genre_subset(romance_genres[1])
romantic_drama = _genre_subset(romance_genres[2])
romantic_fantasy = _genre_subset(romance_genres[3])
romantic_thriller = _genre_subset(romance_genres[4])

# NOTE(review): the label 'Roman' looks truncated — confirm the intended wording.
print('Roman' , romance_movies.shape[0])
print(
    'Romantic comedies: ', romantic_comedy.shape[0],
    '\nRomantic films: ', romantic_film.shape[0],
    '\nRomantic drama: ', romantic_drama.shape[0],
    '\nRomantic fantasy: ', romantic_fantasy.shape[0],
    '\nRomantic thriller: ', romantic_thriller.shape[0],
)
print('Total number of films: ', movie_df.shape[0])

### 3.2. Romantic movies runtime

In [None]:
## TODO: correct the runtime outliers before enabling the boxplot below
#combined_runtime = pd.DataFrame({'Romantic comedy': romantic_comedy['Runtime'], 'Romance Film': romantic_film['Runtime'], 'Romantic drama': romantic_drama['Runtime'], 'Romantic fantasy': romantic_fantasy['Runtime']})
#sns.boxplot(combined_runtime)

In [None]:
# Overlay one runtime density curve per romantic sub-genre on the same axes.
# The drawing order below matches the order of the legend labels.
runtime_curves = [
    (romantic_comedy, 'blue'),
    (romantic_drama, 'green'),
    (romantic_film, 'red'),
    (romantic_fantasy, 'orange'),
]
for genre_subset, curve_color in runtime_curves:
    ax = sns.kdeplot(genre_subset['Runtime'], color=curve_color)
ax.set_xlim(0, 250)
ax.legend(['Romantic comedy', 'Romantic drama', 'Romance Film', 'Romantic fantasy'])


### 3.3. Romantic movies box office revenue

In [None]:
# Disabled: this boxplot does not give a good view of the box office revenue
#combined_box_office = pd.DataFrame({'Romantic comedy': romantic_comedy['Box office revenue'], 'Romance Film': romantic_film['Box office revenue'], 'Romantic drama': romantic_drama['Box office revenue'], 'Romantic fantasy': romantic_fantasy['Box office revenue']})
#sns.boxplot(combined_box_office)

In [None]:
# Overlay one box-office-revenue density curve per romantic sub-genre, on a
# logarithmic scale. The drawing order matches the order of the legend labels.
revenue_curves = [
    (romantic_comedy, 'blue'),
    (romantic_drama, 'green'),
    (romantic_film, 'red'),
    (romantic_fantasy, 'orange'),
]
for genre_subset, curve_color in revenue_curves:
    ax = sns.kdeplot(genre_subset['Box office revenue'], log_scale=True, color=curve_color)
ax.legend(['Romantic comedy', 'Romantic drama', 'Romance Film', 'Romantic fantasy'])


### 3.4. Romantic movies countries

In [None]:
# Show the romantic-comedy subset as this cell's output.
romantic_comedy

In [None]:
def get_countries(x):
    """Return the number of countries in *x*, or NaN when *x* is not a list."""
    return len(x) if isinstance(x, list) else np.nan


# assign() builds new DataFrames instead of writing a column into frames that
# may be slices of another DataFrame (which triggers pandas'
# SettingWithCopyWarning).
romantic_comedy = romantic_comedy.assign(number_countries=romantic_comedy['Countries'].apply(get_countries))
romantic_fantasy = romantic_fantasy.assign(number_countries=romantic_fantasy['Countries'].apply(get_countries))
romantic_film = romantic_film.assign(number_countries=romantic_film['Countries'].apply(get_countries))
romantic_drama = romantic_drama.assign(number_countries=romantic_drama['Countries'].apply(get_countries))

combined_numb_countries = pd.DataFrame({
    'Romantic comedy': romantic_comedy['number_countries'],
    'Romance Film': romantic_film['number_countries'],
    'Romantic drama': romantic_drama['number_countries'],
    'Romantic fantasy': romantic_fantasy['number_countries']})

# Share of romantic comedies with more than one country, as a real percentage.
# The previous version printed the 0-1 fraction followed by a '%' sign.
n_romcom = romantic_comedy.shape[0]
n_multi_country = romantic_comedy[romantic_comedy['number_countries'] > 1].shape[0]
# Guard against an empty subset instead of dividing by zero.
pct_multi_country = round(100 * n_multi_country / n_romcom, 2) if n_romcom else np.nan
print('Percentage romantic comedy movie countries > 1: ', pct_multi_country, '%')
print('Other countries can be added in code...')

### 3.5. Movie languages

In [None]:
from itertools import chain


def _language_counts(frame):
    """Count how often each language occurs in *frame*.

    Returns a 4-tuple ``(known, flat, values, counts)``:
    the rows of *frame* whose 'Languages' cell is not null, the flat list of
    all their languages, and the unique languages with their counts as
    returned by ``np.unique(..., return_counts=True)``.
    """
    known = frame[frame['Languages'].notnull()]
    # chain.from_iterable flattens the per-movie lists in linear time.
    # Summing the column concatenates the lists pairwise, which is quadratic.
    flat = list(chain.from_iterable(known['Languages']))
    values, counts = np.unique(flat, return_counts=True)
    return known, flat, values, counts


#Get languages whole movie set
movies_language, languages, values, counts = _language_counts(movie_df)
print('5 most common languages in movies are: ')
# argsort is ascending: take the last five indices and reverse them.
print(values[counts.argsort()[-5:][::-1]])

#Get languages romantic movies overall
romance_movies_lang, languages_romance, values, counts = _language_counts(romance_movies)
print('5 most common languages in romantic movies: ')
print(values[counts.argsort()[-5:][::-1]])


rom_com_known, languages_romcom, values, counts = _language_counts(romantic_comedy)
print('\n5 most common languages in romantic comedies: ')
print(values[counts.argsort()[-5:][::-1]])

### 3.6. CoreNLP Plot Summaries

`corenlp_plot_summaries.tar.gz [628 M, separate download]`: The plot summaries from above, run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).

In [None]:
# Use a file I already extracted on my computer to run some tests
import xml.etree.ElementTree as ET
tree = ET.parse('Data/CoreNLP/corenlp_plot_summaries_xml/3217.xml')
root = tree.getroot()

# An NER tag equal to "person" can give us the characters mentioned in the plot summary.

# Number of <governor> elements anywhere below the root
print(len(root.findall('.//governor'))) #use parse or basic-dependencies to have more info
#print(root.findall('.//*governor').text())
for ner in root.findall('.//NER'):
    # Skip empty elements (text is None) and one-character tags.
    if ner.text and len(ner.text) > 1:
        print(ner.text)


In [None]:
# To print xml files as a pretty tree
import xml.dom.minidom

def print_tree_xml(xml_name):
    """Parse the XML file *xml_name* with minidom and print it as an indented tree."""
    document = xml.dom.minidom.parse(xml_name)
    print(document.toprettyxml())

#print_tree_xml('Data/CoreNLP/corenlp_plot_summaries_xml/3217.xml')

### 3.7. Name clusters

From the file `name.clusters.txt`, we extract movie characters and their related character cluster.

In [None]:
# Read the (character name, cluster) pairs, then collect the clusters of each
# character name into a list.
path = 'Data/MovieSummaries/'
names_cols = ['Character name', 'Cluster']
names_path = f'{path}name.clusters.txt'
# NOTE(review): 'Freebase ID' is not one of names_cols — confirm whether this dtype entry is needed.
names_df = (
    pd.read_csv(
        names_path,
        sep='\t',
        header=None,
        names=names_cols,
        dtype={'Freebase ID': str},
    )
    .groupby('Character name')
    .agg(list)
)
names_df

### 3.8. TV Tropes Clusters

We reformat the file `tvtropes.clusters.txt` so it is easier to use.

In [None]:
# Read the reformatted, comma-separated TV tropes cluster file (no header row).
cluster_cols = ['Cluster', 'Character name', 'Movie', 'Freebase character/map ID', 'Actor']
cluster_path = f'{path}tvtropes.clusters.format.txt'
# NOTE(review): 'Freebase ID' is not one of cluster_cols — confirm whether this dtype entry is needed.
cluster_df = pd.read_csv(
    cluster_path,
    sep=',',
    header=None,
    names=cluster_cols,
    dtype={'Freebase ID': str},
)
cluster_df

We now join the TV tropes clusters with the character metadata, and then with movie.metadata, so that we can access each movie's genres and filter on romance.

In [None]:
# Attach the character metadata to every trope cluster entry, then the movie
# metadata of the film the character appears in.
cluster_char = cluster_df.merge(char_df, on='Freebase character/map ID')
cluster_char_movie = cluster_char.merge(movie_df, on='Freebase ID')


def _has_roman_genre(genres):
    """Return True when *genres* is a list with at least one genre name containing 'Roman'.

    `'Roman' in genres` on the list itself would only match a genre named
    exactly 'Roman', and it raises a TypeError on non-list cells such as NaN.
    """
    return isinstance(genres, list) and any('Roman' in genre for genre in genres)


romance_cluster = cluster_char_movie[cluster_char_movie['Genres'].apply(_has_roman_genre).astype(bool)]
# Number of romance characters per trope cluster, largest first.
romance_cluster.groupby('Cluster').size().sort_values(ascending=False)