## Import libraries and modules

In [2]:
import pandas as pd
%matplotlib inline

## Loading the data

In [4]:
DATA_FOLDER = 'Data/'

#load only datasets that seem useful to our project
TITLE_AKAS_DATASET = DATA_FOLDER + 'title.akas.tsv.gz'
TITLE_BASICS_DATASET = DATA_FOLDER + 'title.basics.tsv.gz'
TITLE_PRINCIPALS_DATASET = DATA_FOLDER + 'title.principals.tsv.gz'
TITLE_RATINGS_DATASET = DATA_FOLDER + 'title.ratings.tsv.gz'
NAME_BASICS_DATASET = DATA_FOLDER + 'name.basics.tsv.gz'

def load_metadata(path, column_names, header=None, low_memory=False):
    return pd.read_table(path, header=header, names=column_names)

columns_akas = ['titleId', 'ordering', 'title', 'region', 'language', 'types', 'attributes', 'isOriginalTitle']
columns_title_basics = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']
columns_principals = ['tconst', 'ordering', 'nconst', 'category', 'job', 'characters']
columns_ratings = ['tconstIdentifier', 'averageRating', 'numVotes']
columns_names = ['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

In [5]:
title_basics = load_metadata(TITLE_BASICS_DATASET, column_names=columns_title_basics)
print("length of title_basics: ", len(title_basics))
title_basics.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


length of title_basics:  9363391


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
1,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
2,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
3,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
4,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"


In [8]:
#drop the first row which represents the titles
# Careful: execute only once, otherwise it will delete the first row each time!
#title_basics = title_basics.drop(index=title_basics.index[0], axis=0) 
#print("length of title_basics: ", len(title_basics))
#title_basics.head()

In [3]:
#test: proof of concept with small dataframe
empoyees = [('Jack',    34, 'Sydney',   5) ,
            ('Riti',    31, 'Delhi' ,   7) ,
            ('Aadi',    16, 'London',   11) ,
            ('Mark',    41, 'Delhi' ,   12)]
# Create a DataFrame object
df = pd.DataFrame(empoyees, columns=['Name', 'Age', 'City', 'Experience'])
print(df)
# keep rows where City=Delhi
employees_Delhi = df[df["City"] == "Delhi"]
        
employees_Delhi.head()

   Name  Age    City  Experience
0  Jack   34  Sydney           5
1  Riti   31   Delhi           7
2  Aadi   16  London          11
3  Mark   41   Delhi          12


Unnamed: 0,Name,Age,City,Experience
1,Riti,31,Delhi,7
3,Mark,41,Delhi,12


In [6]:
#create a new table with only titleType=movies (get rid of videos, tvshows, tvepisodes and short)
title_basics_movies = title_basics[title_basics["titleType"] == "movie"]
        
print("length of title_basics_movies: ", len(title_basics_movies))
title_basics_movies.head()

length of title_basics_movies:  626772


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
9,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
499,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
588,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
611,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama


In [15]:
#remove the endYear column since movies are not concerned by thats
title_basics_movies_cleaned = title_basics_movies.drop(columns='endYear')
title_basics_movies_cleaned.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
9,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,45,Romance
499,tt0000502,movie,Bohemios,Bohemios,0,1905,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
588,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama
611,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,Drama


In [5]:
#maybe not useful for us 
title_akas = load_metadata(TITLE_AKAS_DATASET, column_names=columns_akas)
print("length of title_akas: ", len(title_akas))
title_akas.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


length of title_akas:  33781110


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
1,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
2,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
3,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
4,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0


In [6]:
principals = load_metadata(TITLE_PRINCIPALS_DATASET, column_names=columns_principals)
principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tconst,ordering,nconst,category,job,characters
1,tt0000001,1,nm1588970,self,\N,"[""Self""]"
2,tt0000001,2,nm0005690,director,\N,\N
3,tt0000001,3,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N


In [7]:
ratings = load_metadata(TITLE_RATINGS_DATASET, column_names=columns_ratings)
ratings.head()

Unnamed: 0,tconstIdentifier,averageRating,numVotes
0,tconst,averageRating,numVotes
1,tt0000001,5.7,1922
2,tt0000002,5.8,259
3,tt0000003,6.5,1734
4,tt0000004,5.6,174


In [8]:
names = load_metadata(NAME_BASICS_DATASET, column_names=columns_names)
names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
1,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0053137,tt0072308,tt0050419"
2,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0117057,tt0071877,tt0038355"
3,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0057345,tt0049189,tt0056404"
4,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0072562,tt0077975,tt0080455"
