In [1]:
import numpy as np
import pandas as pd
import ast
import scipy.stats as ss

import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
import pylab as pl

import itertools
import unidecode
from collections import Counter

import os

import requests

'''Download the files from the following folder and place them under datasets/moviedata/
https://drive.google.com/drive/folders/1mJsTvP3VfzLWgV_zlO1S9xiWyrTJlNG4?usp=sharing
'''

PATH_IN = './datasets/moviedata/'

## 1. Load the data <a class="anchor" id="chapter1"></a>

In this project, we want to analyze the characteristics of a well-performing movie. Thus, we are particularly interested in the "movie.metadata.tsv" file from the CMU dataset.

In [2]:
fname = os.path.join(PATH_IN, 'movie.metadata.tsv')
df = pd.read_csv(fname, sep='\t', header=None)

df.columns = ['Wikipedia ID', 'Freebase ID', 'Name', 'Release date', 'Revenue', 'Runtime', 'Languages', 'Countries', 'Genres']

df.head(3)

Unnamed: 0,Wikipedia ID,Freebase ID,Name,Release date,Revenue,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."


In [3]:
df.shape

(81741, 9)

The "movie.metadata.tsv" contains 81741 entries, each corresponding to a movie. The movies are defined by 9 features: the `Wikipedia ID`, the `Freebase ID`, the `name`, the `release date`, the `box office revenue`, the `runtime`, the `languages`, the `countries` and the `genres`.
* The `release date` is a discrete numerical variable.
* The `revenue` and the `runtime` are continuous numerical variables.
* The `languages`, the `countries` and the `genres` are lists of categorical variables. Indeed, a given movie can be part of several genres.

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81741 entries, 0 to 81740
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Wikipedia ID  81741 non-null  int64  
 1   Freebase ID   81741 non-null  object 
 2   Name          81741 non-null  object 
 3   Release date  74839 non-null  object 
 4   Revenue       8401 non-null   float64
 5   Runtime       61291 non-null  float64
 6   Languages     81741 non-null  object 
 7   Countries     81741 non-null  object 
 8   Genres        81741 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 5.6+ MB


### a) Data size <a class="anchor" id="section_2_1"></a>

We load the dataset using a Pandas dataframe. From the memory usage (5.6 MB), we can handle the data in its size.

### b) Missing values <a class="anchor" id="section_2_2"></a>

From the information of the dataframe, we can observe that many `revenue` values are missing. Only 10% of the `revenue` values are available. Thus, we decide to not base our study on this feature.

In [5]:
'''Remove the revenue column'''
df = df[['Wikipedia ID', 'Freebase ID', 'Name', 'Release date', 'Runtime', 'Languages', 'Countries', 'Genres']]
df.head(3)

Unnamed: 0,Wikipedia ID,Freebase ID,Name,Release date,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."


### c) Formatting the data <a class="anchor" id="section_2_3"></a>

#### `Release date` <a class="anchor" id="sub_section_2_3_1"></a>

In [6]:
df['Release date'].head(3)

0    2001-08-24
1    2000-02-16
2          1988
Name: Release date, dtype: object

In [7]:
type(df['Release date'][0])

str

We observe that the values for the release dates are string. Moreover, we notice that some release dates consist of the full date, including the year, the month and the day, while others only contain the year. To homogenize the format of the `release date` feature, we decide to pad the incomplete dates with 0. For each release date, we obtain a 8-digits number corresponding to YEAR-MONTH-DAY.

In [8]:
# For all values that are not Nan:
# 1. Split the string into the year, the month and the day
# 2. Concatenate, so we obtain a date without '-'
# 3. Pad to 8 digits with 0s.
df['Release date'] = [''.join(x.split('-')).ljust(8, '0') if pd.isna(x)==False else x for x in df['Release date']]

df['Release date'].head(3)

0    20010824
1    20000216
2    19880000
Name: Release date, dtype: object

#### `Languages`, `Countries` and `Genres` <a class="anchor" id="sub_section_2_3_2"></a>

In [9]:
df['Languages'].head(10)

0                   {"/m/02h40lc": "English Language"}
1                   {"/m/02h40lc": "English Language"}
2                   {"/m/05f_3": "Norwegian Language"}
3                   {"/m/02h40lc": "English Language"}
4                    {"/m/04306rv": "German Language"}
5    {"/m/06ppq": "Silent film", "/m/02h40lc": "Eng...
6                   {"/m/02h40lc": "English Language"}
7                   {"/m/02h40lc": "English Language"}
8                     {"/m/06nm1": "Spanish Language"}
9                   {"/m/02h40lc": "English Language"}
Name: Languages, dtype: object

In [10]:
type(df['Languages'][0])

str

In [11]:
def categorical_var_formatting(var_name):
    '''Convert the dataseries to a list of lists'''

    # Change the format to a list of values
    # eg convert {code1: language1, code2: language2} to [language1, language2]
    var_list = df[var_name].apply(lambda x: pd.read_json(x, typ = 'series').to_list())

    # (Extract the languages)
    # (eg convert [English Language] to [English])
    if var_name == 'Languages':
        var_list = var_list.apply(lambda x: [y.split(' ')[0] for y in x])

    return pd.Series(var_list)

In [12]:
df['Languages'] = categorical_var_formatting('Languages')

In [13]:
df['Countries'] = categorical_var_formatting('Countries')

In [14]:
df['Genres'] = categorical_var_formatting('Genres')

In [15]:
!ls /datasets/moviedata

ls: cannot access '/datasets/moviedata': No such file or directory


In [16]:
df.head(3)

Unnamed: 0,Wikipedia ID,Freebase ID,Name,Release date,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,20010824,98.0,[English],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,20000216,95.0,[English],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,/m/0crgdbh,Brun bitter,19880000,83.0,[Norwegian],[Norway],"[Crime Fiction, Drama]"


In [17]:
'''Save the reformatted dataframe'''

# Reformat the release date into YEAR-MONTH-DAY separated by '-' so it is more readable
df_toSave = df.copy()
df_toSave['Release date'] = [x[0:4]+'-'+x[4:6]+'-'+x[6:8] if pd.isna(x)==False else x for x in df['Release date']]

df_toSave.to_csv(PATH_IN + 'formatted_movie_metadata.csv', index=False)
