## Data transformation

Modifying the datasets in a way that is easier to work with.

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

In [2]:
df_title_basics = pd.read_csv("data/title_basics.tsv", sep='\t', low_memory=False)
df_title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10295872,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10295873,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10295874,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10295875,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [3]:
title_types = df_title_basics["titleType"].unique()
title_types

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [4]:
df_title_basics = df_title_basics[df_title_basics["titleType"].isin(["tvSeries", "tvEpisode"])]
df_title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34973,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0,1943,1947,15,\N
35174,tt0035803,tvSeries,The German Weekly Review,Die Deutsche Wochenschau,0,1940,1945,12,"Documentary,News"
37603,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0,1946,1955,15,Talk-Show
38437,tt0039120,tvSeries,Americana,Americana,0,1947,1949,30,"Family,Game-Show"
38438,tt0039121,tvSeries,Birthday Party,Birthday Party,0,1947,1949,30,Family
...,...,...,...,...,...,...,...,...,...
10295871,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0,2009,\N,\N,"Action,Drama,Family"
10295872,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10295873,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10295874,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"


In [5]:
df_title_ratings = pd.read_csv("data/title_ratings.tsv", sep='\t', low_memory=False)
df_title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2005
1,tt0000002,5.8,269
2,tt0000003,6.5,1908
3,tt0000004,5.5,178
4,tt0000005,6.2,2688
...,...,...,...
1366964,tt9916730,7.6,11
1366965,tt9916766,7.0,22
1366966,tt9916778,7.2,36
1366967,tt9916840,8.8,6


In [6]:
df_title_ratings = df_title_ratings[df_title_ratings["tconst"].isin(df_title_basics['tconst'])]
df_title_ratings

Unnamed: 0,tconst,averageRating,numVotes
18285,tt0035803,8.2,58
21018,tt0039120,2.7,18
21019,tt0039123,8.1,209
21020,tt0039125,5.1,27
21764,tt0040021,7.0,86
...,...,...,...
1366963,tt9916708,8.6,6
1366965,tt9916766,7.0,22
1366966,tt9916778,7.2,36
1366967,tt9916840,8.8,6


In [7]:
df_title_ratings.dtypes

tconst            object
averageRating    float64
numVotes           int64
dtype: object

In [8]:
df_title_crew = pd.read_csv("data/title_crew.tsv", sep='\t', low_memory=False)
df_title_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N
...,...,...,...
10295872,tt9916848,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10295873,tt9916850,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
10295874,tt9916852,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10295875,tt9916856,nm10538645,nm6951431


In [9]:
df_title_crew = df_title_crew[df_title_crew["tconst"].isin(df_title_basics['tconst'])]
df_title_crew

Unnamed: 0,tconst,directors,writers
34973,tt0035599,\N,\N
35174,tt0035803,\N,\N
37603,tt0038276,"nm0626972,nm3811084",nm12264820
38437,tt0039120,"nm11657766,nm0590202,nm0168641",\N
38438,tt0039121,\N,\N
...,...,...,...
10295871,tt9916846,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10295872,tt9916848,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10295873,tt9916850,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
10295874,tt9916852,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"


In [10]:
df_title_crew['directors'].replace('\\N', np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title_crew['directors'].replace('\\N', np.nan, inplace=True)


In [11]:
df_title_crew['writers'].unique()
df_title_crew['writers'].replace('\\N', np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title_crew['writers'].replace('\\N', np.nan, inplace=True)


In [12]:
df_title_principals = pd.read_csv("data/title_principals.tsv", sep='\t', low_memory=False)
df_title_principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N
...,...,...,...,...,...,...
58969171,tt9916880,5,nm0584014,director,\N,\N
58969172,tt9916880,6,nm0996406,director,principal director,\N
58969173,tt9916880,7,nm1482639,writer,\N,\N
58969174,tt9916880,8,nm2586970,writer,books,\N


In [13]:
df_title_principals = df_title_principals[df_title_principals["tconst"].isin(df_title_basics['tconst'])]
df_title_principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
286977,tt0035803,10,nm0445198,self,\N,"[""Self""]"
286978,tt0035803,1,nm0386944,self,\N,"[""Self""]"
286979,tt0035803,2,nm0230638,self,\N,"[""Self""]"
286980,tt0035803,3,nm0246872,self,\N,"[""Self""]"
286981,tt0035803,4,nm0290950,self,\N,"[""Self""]"
...,...,...,...,...,...,...
58969171,tt9916880,5,nm0584014,director,\N,\N
58969172,tt9916880,6,nm0996406,director,principal director,\N
58969173,tt9916880,7,nm1482639,writer,\N,\N
58969174,tt9916880,8,nm2586970,writer,books,\N


In [14]:
df_title_principals['job'].replace('\\N', np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title_principals['job'].replace('\\N', np.nan, inplace=True)


In [15]:
df_title_episode = pd.read_csv("data/title_episode.tsv", sep='\t', low_memory=False)
df_title_episode

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1,9
1,tt0042816,tt0989125,1,17
2,tt0042889,tt0989125,\N,\N
3,tt0043426,tt0040051,3,42
4,tt0043631,tt0989125,2,16
...,...,...,...,...
7853468,tt9916846,tt1289683,3,18
7853469,tt9916848,tt1289683,3,17
7853470,tt9916850,tt1289683,3,19
7853471,tt9916852,tt1289683,3,20


In [16]:
df_title_episode['seasonNumber'].replace('\\N', np.nan, inplace=True)
df_title_episode['seasonNumber'] = df_title_episode['seasonNumber'].astype('Int64')
df_title_episode[df_title_episode["seasonNumber"] > 1000]

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
2016,tt0111465,tt1868747,1994,89
2053,tt0113395,tt1868747,1995,114
2691,tt0146296,tt0057775,1993,3
2710,tt0147908,tt1868747,1996,168
4091,tt0219500,tt0057775,1965,1
...,...,...,...,...
7682336,tt9479716,tt1868747,1996,160
7682357,tt9479762,tt1868747,1998,201
7686830,tt9490998,tt9139576,2018,0
7698875,tt9521320,tt9139576,2017,0


In [17]:
df_title_episode['episodeNumber'].replace('\\N', np.nan, inplace=True)
df_title_episode['episodeNumber'] = df_title_episode['episodeNumber'].astype('Int64')

In [18]:
df_name_basics = pd.read_csv("data/name_basics.tsv", sep='\t', low_memory=False)
df_name_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0045537,tt0072308,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0075213,tt0037382,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0056404,tt0049189,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0069467,tt0050986,tt0083922,tt0050976"
...,...,...,...,...,...,...
12991586,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department","tt14069590,tt2455546,tt11657662"
12991587,nm9993716,Essias Loberg,\N,\N,,\N
12991588,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744
12991589,nm9993718,Aayush Nair,\N,\N,cinematographer,tt8736744


In [19]:
df_name_basics.dtypes


nconst               object
primaryName          object
birthYear            object
deathYear            object
primaryProfession    object
knownForTitles       object
dtype: object

In [20]:
for col in ['birthYear', 'deathYear']:
    df_name_basics[col].replace('\\N', np.nan, inplace=True)
    df_name_basics[col] = df_name_basics[col].astype('Int64')
    
df_name_basics['knownForTitles'].replace('\\N', np.nan, inplace = True)

In [21]:
df_name_basics.dtypes

nconst               object
primaryName          object
birthYear             Int64
deathYear             Int64
primaryProfession    object
knownForTitles       object
dtype: object

In [22]:
title_basics = pd.merge(df_title_basics, df_title_ratings, on='tconst', how='left')
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0,1943,1947,15,\N,,
1,tt0035803,tvSeries,The German Weekly Review,Die Deutsche Wochenschau,0,1940,1945,12,"Documentary,News",8.2,58.0
2,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0,1946,1955,15,Talk-Show,,
3,tt0039120,tvSeries,Americana,Americana,0,1947,1949,30,"Family,Game-Show",2.7,18.0
4,tt0039121,tvSeries,Birthday Party,Birthday Party,0,1947,1949,30,Family,,
...,...,...,...,...,...,...,...,...,...,...,...
8105441,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0,2009,\N,\N,"Action,Drama,Family",,
8105442,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family",,
8105443,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family",,
8105444,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family",,


In [23]:
title_basics['genres'].replace('\\N', np.nan, inplace = True)

In [24]:
title_basics = pd.merge(title_basics, df_title_crew, on='tconst', how='left')
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0,1943,1947,15,,,,,
1,tt0035803,tvSeries,The German Weekly Review,Die Deutsche Wochenschau,0,1940,1945,12,"Documentary,News",8.2,58.0,,
2,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0,1946,1955,15,Talk-Show,,,"nm0626972,nm3811084",nm12264820
3,tt0039120,tvSeries,Americana,Americana,0,1947,1949,30,"Family,Game-Show",2.7,18.0,"nm11657766,nm0590202,nm0168641",
4,tt0039121,tvSeries,Birthday Party,Birthday Party,0,1947,1949,30,Family,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8105441,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0,2009,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
8105442,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
8105443,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
8105444,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"


In [25]:
titles = title_basics[title_basics['titleType'] == 'tvSeries']
episodes = title_basics[title_basics['titleType'] == 'tvEpisode']

In [26]:
titles

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0,1943,1947,15,,,,,
1,tt0035803,tvSeries,The German Weekly Review,Die Deutsche Wochenschau,0,1940,1945,12,"Documentary,News",8.2,58.0,,
2,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0,1946,1955,15,Talk-Show,,,"nm0626972,nm3811084",nm12264820
3,tt0039120,tvSeries,Americana,Americana,0,1947,1949,30,"Family,Game-Show",2.7,18.0,"nm11657766,nm0590202,nm0168641",
4,tt0039121,tvSeries,Birthday Party,Birthday Party,0,1947,1949,30,Family,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8105173,tt9916210,tvSeries,Rumpole of the Bailey,Rumpole of the Bailey,0,\N,\N,\N,,,,,
8105175,tt9916216,tvSeries,Kalyanam Mudhal Kadhal Varai,Kalyanam Mudhal Kadhal Varai,0,2014,2017,22,Romance,8.6,15.0,nm11202129,
8105176,tt9916218,tvSeries,Lost in Food,Lost in Food,0,2016,2017,\N,Talk-Show,,,nm0872469,nm0298279
8105249,tt9916380,tvSeries,Meie aasta Aafrikas,Meie aasta Aafrikas,0,2019,\N,43,"Adventure,Comedy,Family",8.3,117.0,nm1857733,"nm4339326,nm1859351"


In [27]:
episodes

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers
139,tt0041951,tvEpisode,The Tenderfeet,The Tenderfeet,0,1949,\N,30,Western,7.6,86.0,nm0782690,"nm0872077,nm0289014,nm1080563,nm0834503"
244,tt0042816,tvEpisode,Othello,Othello,0,1950,\N,135,Drama,,,nm0791041,nm0000636
245,tt0042889,tvEpisode,The Tragedy of King Richard II/II,The Tragedy of King Richard II/II,0,1950,\N,145,Drama,,,nm0605925,"nm0000636,nm0605925"
327,tt0043426,tvEpisode,Coriolanus,Coriolanus,0,1951,\N,60,Drama,,,nm0629999,"nm0548529,nm0000636"
328,tt0043631,tvEpisode,The Life of King Henry V,The Life of King Henry V,0,1951,\N,133,Drama,,,nm0107960,"nm0000636,nm0605925,nm0107960"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8105441,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0,2009,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
8105442,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
8105443,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
8105444,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"


In [28]:
episodes = pd.merge(episodes, df_title_episode, on='tconst', how='left')
episodes

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tvEpisode,The Tenderfeet,The Tenderfeet,0,1949,\N,30,Western,7.6,86.0,nm0782690,"nm0872077,nm0289014,nm1080563,nm0834503",tt0041038,1,9
1,tt0042816,tvEpisode,Othello,Othello,0,1950,\N,135,Drama,,,nm0791041,nm0000636,tt0989125,1,17
2,tt0042889,tvEpisode,The Tragedy of King Richard II/II,The Tragedy of King Richard II/II,0,1950,\N,145,Drama,,,nm0605925,"nm0000636,nm0605925",tt0989125,,
3,tt0043426,tvEpisode,Coriolanus,Coriolanus,0,1951,\N,60,Drama,,,nm0629999,"nm0548529,nm0000636",tt0040051,3,42
4,tt0043631,tvEpisode,The Life of King Henry V,The Life of King Henry V,0,1951,\N,133,Drama,,,nm0107960,"nm0000636,nm0605925,nm0107960",tt0989125,2,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7853657,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0,2009,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284",tt1289683,3,18
7853658,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284",tt1289683,3,17
7853659,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284",tt1289683,3,19
7853660,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family",,,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284",tt1289683,3,20


In [29]:
df_title_principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
286977,tt0035803,10,nm0445198,self,,"[""Self""]"
286978,tt0035803,1,nm0386944,self,,"[""Self""]"
286979,tt0035803,2,nm0230638,self,,"[""Self""]"
286980,tt0035803,3,nm0246872,self,,"[""Self""]"
286981,tt0035803,4,nm0290950,self,,"[""Self""]"
...,...,...,...,...,...,...
58969171,tt9916880,5,nm0584014,director,,\N
58969172,tt9916880,6,nm0996406,director,principal director,\N
58969173,tt9916880,7,nm1482639,writer,,\N
58969174,tt9916880,8,nm2586970,writer,books,\N


In [30]:
df_title_principals['characters'].replace('\\N', np.nan, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title_principals['characters'].replace('\\N', np.nan, inplace = True)


In [31]:
principals = df_title_principals.copy()
principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
286977,tt0035803,10,nm0445198,self,,"[""Self""]"
286978,tt0035803,1,nm0386944,self,,"[""Self""]"
286979,tt0035803,2,nm0230638,self,,"[""Self""]"
286980,tt0035803,3,nm0246872,self,,"[""Self""]"
286981,tt0035803,4,nm0290950,self,,"[""Self""]"
...,...,...,...,...,...,...
58969171,tt9916880,5,nm0584014,director,,
58969172,tt9916880,6,nm0996406,director,principal director,
58969173,tt9916880,7,nm1482639,writer,,
58969174,tt9916880,8,nm2586970,writer,books,


In [32]:
names = df_name_basics.copy()
names

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0045537,tt0072308,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0075213,tt0037382,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,,"actress,soundtrack,music_department","tt0054452,tt0056404,tt0049189,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0069467,tt0050986,tt0083922,tt0050976"
...,...,...,...,...,...,...
12991586,nm9993714,Romeo del Rosario,,,"animation_department,art_department","tt14069590,tt2455546,tt11657662"
12991587,nm9993716,Essias Loberg,,,,
12991588,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744
12991589,nm9993718,Aayush Nair,,,cinematographer,tt8736744


In [33]:
titles.set_index('tconst', inplace=True)
episodes.set_index('tconst', inplace=True)
principals.set_index(['tconst', 'nconst'], inplace = True)
names.set_index('nconst', inplace = True)

In [34]:
#storing datasets as pickle files
titles.to_pickle('data/tvseries.pkl')
episodes.to_pickle('data/episodes.pkl')
principals.to_pickle('data/principals.pkl')
names.to_pickle('data/names.pkl')