In [1]:
# Multiple outputs per cell: display the value of every top-level expression
# in a cell, not just one (later cells rely on this to show head/shape/dtypes
# together).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Cell width: inject CSS that stretches elements with the `.container` class
# to 90% of the available width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import ast
import json
import os

import numpy as np
import pandas as pd

# Directory containing the input data files; the relative path is resolved
# against the working directory the notebook is run from.
PATH_IN = './Dataset/'

In [3]:
# Labels for the columns of the metadata TSV, listed in file order. They are
# supplied explicitly because the file is read without a header row.
movie_column_names = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages",
    "Movie countries",
    "Movie genres",
]

movie_fname = os.path.join(PATH_IN, 'movie.metadata.tsv')
movies = pd.read_csv(
    movie_fname,
    sep='\t',
    header=None,
    names=movie_column_names,
)

# Quick sanity check of the raw frame: first rows, dimensions, inferred dtypes.
movies.head()
movies.shape
movies.dtypes

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


(81741, 9)

Wikipedia movie ID            int64
Freebase movie ID            object
Movie name                   object
Movie release date           object
Movie box office revenue    float64
Movie runtime               float64
Movie languages              object
Movie countries              object
Movie genres                 object
dtype: object

### Date parsing and dtype change

In [4]:
# Keep the Freebase identifier in pandas' dedicated string dtype instead of
# the generic object dtype.
movies['Freebase movie ID'] = movies['Freebase movie ID'].astype('string')

# Release dates are '-'-separated strings of varying precision ('2001-08-24'
# or just '1988'). Split once and reuse the result for all three components;
# a component that is absent from a date becomes <NA> in the nullable Int64
# column.
release_date_parts = movies['Movie release date'].str.split('-')
for position, component in enumerate(('Year', 'Month', 'Day')):
    movies[f'Movie release {component}'] = (
        release_date_parts.str[position].astype('Int64')
    )

# The raw string column is fully represented by the three columns above.
# Note: because it is removed here, re-running this cell without reloading
# `movies` raises a KeyError.
movies = movies.drop(columns=['Movie release date'])

### Languages parsing

In [5]:
# Each cell of 'Movie languages' is a JSON object mapping a Freebase language
# code to its display name, e.g. {"/m/02h40lc": "English Language"}.
# json.loads is the matching parser: unlike ast.literal_eval it decodes JSON
# escapes (surrogate pairs, "\/") correctly and accepts JSON-only tokens.
parsed_languages = movies['Movie languages'].apply(json.loads)

# Comma-joined codes and names, in the order they appear in the JSON object.
movies['language codes'] = parsed_languages.apply(
    lambda mapping: ','.join(mapping.keys())
)
# NOTE(review): str.replace removes ' Language' wherever it occurs in a name,
# not only as a trailing suffix - confirm that is intended.
movies['languages'] = parsed_languages.apply(
    lambda mapping: ','.join(
        name.replace(' Language', '') for name in mapping.values()
    )
)

# The raw JSON column is no longer needed once both views are extracted.
movies = movies.drop(columns=['Movie languages'])

### Countries parsing

In [6]:
# Each cell of 'Movie countries' is a JSON object mapping a Freebase country
# code to its name, e.g. {"/m/09c7w0": "United States of America"}.
# json.loads is the matching parser: unlike ast.literal_eval it decodes JSON
# escapes (surrogate pairs, "\/") correctly and accepts JSON-only tokens.
parsed_countries = movies['Movie countries'].apply(json.loads)

# Comma-joined codes and names, in the order they appear in the JSON object.
movies['countries codes'] = parsed_countries.apply(
    lambda mapping: ','.join(mapping.keys())
)
movies['countries'] = parsed_countries.apply(
    lambda mapping: ','.join(mapping.values())
)

# The raw JSON column is no longer needed once both views are extracted.
movies = movies.drop(columns=['Movie countries'])

### Genres parsing

In [7]:
# Each cell of 'Movie genres' is a JSON object mapping a Freebase genre code
# to its name, e.g. {"/m/07s9rl0": "Drama"}.
# json.loads is the matching parser: unlike ast.literal_eval it decodes JSON
# escapes (surrogate pairs, "\/") correctly and accepts JSON-only tokens.
parsed_genres = movies['Movie genres'].apply(json.loads)

# Comma-joined codes and names, in the order they appear in the JSON object.
movies['genres codes'] = parsed_genres.apply(
    lambda mapping: ','.join(mapping.keys())
)
movies['genres'] = parsed_genres.apply(
    lambda mapping: ','.join(mapping.values())
)

# The raw JSON column is no longer needed once both views are extracted.
movies = movies.drop(columns=['Movie genres'])

### Renaming columns and separating codes

In [8]:
# Short, identifier-friendly names for every column produced by the parsing
# steps above.
column_renames = {
    'Wikipedia movie ID': 'Wiki_ID',
    'Freebase movie ID': 'Freebase_ID',
    'Movie name': 'Name',
    'Movie box office revenue': 'Revenue',
    'Movie runtime': 'Runtime',
    'Movie release Year': 'Year',
    'Movie release Month': 'Month',
    'Movie release Day': 'Day',
    'languages': 'Languages',
    'countries': 'Countries',
    'genres': 'Genres',
    'language codes': 'language_codes',
    'countries codes': 'countries_codes',
    'genres codes': 'genres_codes',
}
# Freebase identifiers and codes, set aside in their own frame.
code_columns = ['Freebase_ID', 'language_codes', 'countries_codes', 'genres_codes']
# Columns kept in the main frame, in display order.
main_columns = [
    'Wiki_ID', 'Name', 'Year', 'Revenue', 'Runtime',
    'Languages', 'Countries', 'Genres', 'Month', 'Day',
]

movies = movies.rename(columns=column_renames)

# Take explicit copies so later column assignments on either frame neither
# touch the other nor trigger SettingWithCopyWarning. Selecting `main_columns`
# already leaves the code columns out, so no separate drop is required.
probably_not_useful = movies[code_columns].copy()
movies = movies[main_columns].copy()

movies.head()
probably_not_useful.head()

Unnamed: 0,Wiki_ID,Name,Year,Revenue,Runtime,Languages,Countries,Genres,Month,Day
0,975900,Ghosts of Mars,2001,14010832.0,98.0,English,United States of America,"Thriller,Science Fiction,Horror,Adventure,Supe...",8.0,24.0
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,English,United States of America,"Mystery,Biographical film,Drama,Crime Drama",2.0,16.0
2,28463795,Brun bitter,1988,,83.0,Norwegian,Norway,"Crime Fiction,Drama",,
3,9363483,White Of The Eye,1987,,110.0,English,United Kingdom,"Thriller,Erotic thriller,Psychological thriller",,
4,261236,A Woman in Flames,1983,,106.0,German,Germany,Drama,,


Unnamed: 0,Freebase_ID,language_codes,countries_codes,genres_codes
0,/m/03vyhn,/m/02h40lc,/m/09c7w0,"/m/01jfsb,/m/06n90,/m/03npn,/m/03k9fj,/m/0fdjb..."
1,/m/08yl5d,/m/02h40lc,/m/09c7w0,"/m/02n4kr,/m/03bxz7,/m/07s9rl0,/m/0hj3n01"
2,/m/0crgdbh,/m/05f_3,/m/05b4w,"/m/0lsxr,/m/07s9rl0"
3,/m/0285_cd,/m/02h40lc,/m/07ssc,"/m/01jfsb,/m/0glj9q,/m/09blyk"
4,/m/01mrr1,/m/04306rv,/m/0345h,/m/07s9rl0
