# CMU movie data set initial data exploration
This notebook looks for general trends in the available data without performing in-depth analysis. The goal is to identify high-level trends that we can explore further.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import json

## Import the data

In [None]:
# Both metadata tables are read as tab-separated text. header=None means no
# row of the file is used as column labels; pandas assigns integer labels.
_tsv_read_options = {"sep": "\t", "header": None}

character_df = pd.read_csv(
    "dataset/MovieSummaries/character.metadata.tsv", **_tsv_read_options
)
movies_df = pd.read_csv(
    "dataset/MovieSummaries/movie.metadata.tsv", **_tsv_read_options
)

### Relabel the columns

In [None]:
# Labels for the character table. They are applied positionally, so the
# order of this list must match the column order of the loaded file.
CHARACTER_COLUMNS = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie release date",
    "Character name",
    "Actor date of birth",
    "Actor gender",
    "Actor height (in meters)",
    "Actor ethnicity (Freebase ID)",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]

# Labels for the movie table, likewise applied positionally.
MOVIE_COLUMNS = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages (Freebase ID:name tuples)",
    "Movie countries (Freebase ID:name tuples)",
    "Movie genres (Freebase ID:name tuples)",
]

# Assigning to .columns raises if the number of labels does not match the
# number of columns in the frame.
character_df.columns = CHARACTER_COLUMNS
movies_df.columns = MOVIE_COLUMNS

## Helper function to convert the json columns
Converts the JSON columns (language, country, genre) into a more workable format.

In [62]:
def convert_json_cols_(df, id_cols, convert_col, new_col_name):
    """Turn a JSON-encoded column into a long-format dataframe of its values.

    Each cell of ``convert_col`` is parsed as a JSON object and its values
    are collected. The result has one row per (input row, value) pair: an
    input row whose object holds two values produces two output rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source dataframe. It is not modified.
    id_cols : list of str
        Columns of ``df`` copied unchanged into the result to identify rows.
    convert_col : str
        Column of ``df`` whose cells are JSON object strings.
    new_col_name : str
        Name of the output column holding the extracted values.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_cols + [new_col_name]`` with a fresh 0..n-1 index.
        Rows whose JSON object is empty, or whose cell is missing, are kept
        as a single row with a missing value in ``new_col_name``.

    Raises
    ------
    json.JSONDecodeError
        If a non-missing cell is not valid JSON.
    """
    def _values_of(raw):
        # A missing cell is treated like an empty object instead of letting
        # json.loads raise TypeError on NaN/None.
        if pd.isna(raw):
            return []
        return list(json.loads(raw).values())

    # Work on an explicit copy so adding a column never writes into (or warns
    # about writing into) a slice of the caller's dataframe.
    long_df = df.loc[:, list(id_cols)].copy()

    # Series.map parses each cell once; no intermediate dict column is needed.
    long_df[new_col_name] = df[convert_col].map(_values_of)

    # explode() repeats the id columns once per list element; an empty list
    # becomes one row holding a missing value.
    return long_df.explode(column=new_col_name).reset_index(drop=True)

## Clean up the json columns
Make the columns easier to use

In [71]:
# The same identifier columns are carried into each long-format table.
movie_id_cols = ["Wikipedia movie ID", "Freebase movie ID"]

# Genre table: a movie with several genres spans several rows.
genre_df = convert_json_cols_(
    df=movies_df,
    id_cols=movie_id_cols,
    convert_col="Movie genres (Freebase ID:name tuples)",
    new_col_name="Genre",
)

# Language table, built the same way from the languages column.
language_df = convert_json_cols_(
    df=movies_df,
    id_cols=movie_id_cols,
    convert_col="Movie languages (Freebase ID:name tuples)",
    new_col_name="Language",
)

# Country table, built the same way from the countries column.
country_df = convert_json_cols_(
    df=movies_df,
    id_cols=movie_id_cols,
    convert_col="Movie countries (Freebase ID:name tuples)",
    new_col_name="Country",
)

### Clean up the year column

In [72]:
# Take the first four characters of each release-date string as the year.
# NOTE(review): assumes every non-missing date starts with a four-digit year;
# the result is still a string column, not an integer - confirm before plotting.
release_dates = movies_df["Movie release date"]
movies_df["Release_year"] = release_dates.str[:4]

# Narrow table holding just the movie identifiers and the extracted year.
year_df = movies_df[["Wikipedia movie ID", "Freebase movie ID", "Release_year"]]

### Clean up the revenue column

In [78]:
# Movie identifiers plus revenue, keeping only rows where revenue is present.
revenue_cols = ["Wikipedia movie ID", "Freebase movie ID", "Movie box office revenue"]
revenue_df = movies_df[revenue_cols].dropna(subset=["Movie box office revenue"])

In [80]:
# Boolean mask marking the movies whose box office revenue is missing.
na_revenue_bool_ = movies_df["Movie box office revenue"].isna()

# All columns of the movies with missing revenue.
na_revenue_df = movies_df[na_revenue_bool_]

## Clean the character dataframe

In [84]:
# Display the relabelled character table; the rendered output elides the middle rows.
character_df

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0kr406h,/m/0b_vcv
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j
450666,28308153,/m/0cp05t9,1957,,1941-11-18,M,1.730,/m/02w7gg,David Hemmings,15.0,/m/0g8ngmc,,/m/022g44
450667,28308153,/m/0cp05t9,1957,,,,,,Roberta Paterson,,/m/0g8ngmj,,/m/0g8ngmm


### Plots to plot
- Avg. num of female actors in film as a time series
    - subdivide by genre
    - subdivide by country
    - subdivide by language
- boxplots of genre and avg. num of female actors
- instead of avg. num female actors, could look at proportion of actors who are female to scale for the number of characters in a film. 
- Avg. num of female actors above age 40 as a temporal analysis
- Compare the boxplots of the ages of male and female actors to see whether the "young and beautiful" standard is perpetuated for actresses.
- repeat all the above with female actors above age 40. 
