#### Milestone 2 #####

In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import json_normalize


Load dataframes into proper format

In [None]:
# Every MovieSummaries file is read as tab-separated and headerless,
# so the column names are supplied explicitly for each table.

# Plot summaries, one row per movie.
df_plot = pd.read_csv(
    "data/MovieSummaries/plot_summaries.txt",
    sep='\t',
    header=None,
    names=["wikiID", "plot"],
)

# Movie-level metadata.
df_meta = pd.read_csv(
    "data/MovieSummaries/movie.metadata.tsv",
    sep='\t',
    header=None,
    names=[
        "wikiID", "freeID", "name", "release_date", "revenue",
        "runtime", "languages", "countries", "genres",
    ],
)

# Character/actor metadata.
# NOTE(review): this table names its first column "WikiID" (capital W) while
# the other tables use "wikiID" — confirm before joining on that key.
df_char = pd.read_csv(
    "data/MovieSummaries/character.metadata.tsv",
    sep='\t',
    header=None,
    names=[
        "WikiID", "freeID", "release_date", "char_name", "actor_DOB",
        "actor_gender", "actor_height", "actor_ethnicity", "actor_name",
        "Actor_age", "freeID_char_map", "FreeID_char", "FreeID_actor",
    ],
)

# Character name paired with its character/actor map ID.
df_char_names = pd.read_csv(
    'data/MovieSummaries/name.clusters.txt',
    sep="\t",
    header=None,
    names=["char_name", "freeID_char_map"],
)

In [None]:
# Each row of tvtropes.clusters.txt is read as a trope name plus a JSON string
# ("details") describing one character tagged with that trope.
df_tropes = pd.read_csv(
    'data/MovieSummaries/tvtropes.clusters.txt',
    sep='\t',
    header=None,
    names=["trope", "details"],
)

# Flatten the JSON column into regular DataFrame columns.
# json_normalize comes from the notebook's top import cell (the public pandas
# API); the old `pandas.io.json` import path is deprecated.
tropes_details = json_normalize(df_tropes["details"].map(json.loads).tolist())

# The join aligns on the default positional index shared by both frames.
# The JSON "id" field is renamed so it can be used as the key for joining
# with the character table.
df_tropes = (
    df_tropes
    .join(tropes_details)
    .drop(columns=["details"])
    .rename(columns={"id": "freeID_char_map"})
)

In [None]:
# Summarize df_tropes: column names, dtypes and non-null counts.
df_tropes.info()

In [None]:
# Summarize df_meta: column names, dtypes and non-null counts.
df_meta.info()

In [None]:
# Summarize df_plot: column names, dtypes and non-null counts.
df_plot.info()

In [None]:
# Preview the first five rows of the character metadata table.
df_char.head(5)

In [None]:
# Summarize df_tropes again (same call as an earlier cell of this notebook).
df_tropes.info()

In [None]:
# Summarize df_char_names: column names, dtypes and non-null counts.
df_char_names.info()

The key connecting the dataframes is the Wikipedia ID. There are also more movies with metadata (81741 movies) than movies with a plot summary (42303 movies). We will only keep the metadata of the movies whose plot we know.

In [None]:
# Inner join on wikiID: only movies present in both the metadata table and
# the plot table are kept.
df_full = pd.merge(df_meta, df_plot, on="wikiID", how='inner')
df_full.info()

Only a small fraction of the characters have been labelled with a trope (500), compared to the number of unlabelled characters (450669). Here is the dataframe containing the characters that have a trope.

In [None]:
# Inner join on the character/actor map ID: only character rows that also
# appear in the trope table are kept.
df_inner_char = pd.merge(df_char, df_tropes, on='freeID_char_map', how='inner')

In [None]:
# Drop the "char" and "actor" columns (presumably the name fields coming from
# the flattened trope JSON — verify against df_tropes.info()).
# Reassigning instead of inplace=True, together with errors="ignore", keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError.
df_inner_char = df_inner_char.drop(columns=["char", "actor"], errors="ignore")
df_inner_char.info()

#### Superficial analysis of datasets #####
* Distribution of number of words in plot description:

In [None]:
#use raw plot_summaries and count nb of words in each plot 
df_plot_copy = df_plot.copy()
df_plot_copy['nb_words']=df_plot_copy['plot'].apply(lambda n: len(n.split()))
df_plot_copy.head(3)

In [None]:
# Histogram of the word count of every plot description.
n_bins = 1000

fig, ax = plt.subplots()
ax.hist(df_plot_copy['nb_words'], bins=n_bins)
ax.set_title('Distribution of number of words per plot description')
ax.set_xlabel('Number of words in plot description')
ax.set_ylabel('Number of plots')

# plt.show() instead of fig.show(): Figure.show() targets GUI backends and can
# emit a "non-GUI backend" warning under the notebook inline backend.
plt.show()

Only keep the plots with fewer than 2000 words (which interval should we keep?)

In [None]:
# Keep only the plots whose word count is strictly below the threshold.
threshold = 2000
df_plot_copy = df_plot_copy[df_plot_copy['nb_words'].lt(threshold)]

In [None]:
# Histogram of the word count per plot description, drawn from the current
# (length-filtered) df_plot_copy.
n_bins = 1000

fig, ax = plt.subplots()
ax.hist(df_plot_copy['nb_words'], bins=n_bins)
ax.set_title('Distribution of number of words per plot description')
ax.set_xlabel('Number of words in plot description')
ax.set_ylabel('Number of plots')

# plt.show() instead of fig.show(): Figure.show() targets GUI backends and can
# emit a "non-GUI backend" warning under the notebook inline backend.
plt.show()

What would be the minimum number of words needed for meaningful topic extraction?  

* Characters-per-film distribution: how many characters have been labelled for each film?

In [None]:
# Count the character rows attached to each film (grouped by the film's freeID)
# and expose the result as a two-column frame: freeID, counts.
chars_per_film = df_char.groupby('freeID').size()
df_char_copy = chars_per_film.rename('counts').reset_index()
df_char_copy.head(5)

In [None]:
# Histogram of the number of character rows recorded per film.
n_bins = 150

fig, ax = plt.subplots()
ax.hist(df_char_copy['counts'], bins=n_bins)
ax.set_title('Distribution of number of characters labelised per film')
ax.set_xlabel('Number of characters per film')
ax.set_ylabel('Number of films')

# plt.show() instead of fig.show(): Figure.show() targets GUI backends and can
# emit a "non-GUI backend" warning under the notebook inline backend.
plt.show()

In [None]:
# Horizontal boxplot of the number of character rows per film, with outliers
# drawn as red squares, followed by the quartile values.
n_bins = 200  # not used by the boxplot; kept so the notebook's global state is unchanged
red_square = dict(markerfacecolor='r', marker='s')

fig, ax = plt.subplots()
ax.boxplot(df_char_copy['counts'], vert=False, flierprops=red_square)
ax.set_title('Number of characters labelised per film')
ax.set_xlabel('Number of characters per film')

# plt.show() instead of fig.show(): Figure.show() targets GUI backends and can
# emit a "non-GUI backend" warning under the notebook inline backend.
plt.show()

print("1rst quartile, median and 3rd quartile values: ")
print(df_char_copy['counts'].quantile([0.25,0.5,0.75]))


"Hemingway & Gellhorn" has more than 115 labelled characters!

In [None]:
# Show up to ten films that have more than 100 character rows.
df_char_copy.query('counts > 100').head(10)