In [1]:
import pandas as pd
import numpy as np
import zipfile
import sqlite3

In [2]:
data_sources = {
    # movielens
    'genome_scores'    : './data/ml-25m/genome-scores.csv',
    'genome_tags'      : './data/ml-25m/genome-tags.csv',
    'links'            : './data/ml-25m/links.csv',
    'movies'           : './data/ml-25m/movies.csv',
    'ratings'          : './data/ml-25m/ratings.csv',
    'tags'             : './data/ml-25m/tags.csv',
#     imdb
    'imdb_name_basics' : './data/imdb/name.basics.tsv.gz',
    'imdb_title_basics': './data/imdb/title.basics.tsv.gz',
    'imdb_ratings'     : './data/imdb/title.ratings.tsv.gz',
    'imdb_crew'        : './data/imdb/title.crew.tsv.gz',
    'imdb_principals'  : './data/imdb/title.principals.tsv.gz',
}

In [3]:
dfs = {}
for name, path in data_sources.items():
    print(dfs.keys(), end='\r')
    separator = '\t' if 'tsv' in path else ','
    dfs[name] = pd.read_csv(path, sep=separator).replace(to_replace=r'\N', value=np.nan)

dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics'])

  interactivity=interactivity, compiler=compiler, result=result)


dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics', 'imdb_title_basics', 'imdb_ratings', 'imdb_crew'])

In [4]:
for name, path in data_sources.items():
    dfs[name] = dfs[name].dropna()

# connect to database

In [7]:
db_name = './movie_sqlite.db'
conn = sqlite3.connect(db_name)
c = conn.cursor()

In [8]:
# get needed movie ids
sql = """
select id from movies ;
"""
c.execute(sql)
movie_ids = c.fetchall()
needed_movies = [m[0] for m in movie_ids]
needed_movies[:5]  # check first 5

['tt0000012', 'tt0000417', 'tt0000439', 'tt0004972', 'tt0006333']

In [9]:
df_needed_movies = pd.DataFrame(needed_movies)
df_needed_movies.columns = ['movie_id']
df_needed_movies.head()

Unnamed: 0,movie_id
0,tt0000012
1,tt0000417
2,tt0000439
3,tt0004972
4,tt0006333


In [10]:
df_needed_movies.shape

(13795, 1)

In [11]:
# actors table
# directors table

In [12]:
needed_movies_joined_to_directors = pd.merge(df_needed_movies,
                                             dfs['imdb_crew'],
                                             left_on='movie_id',
                                             right_on='tconst',
                                             how='left')
needed_movies_joined_to_directors.head()

Unnamed: 0,movie_id,tconst,directors,writers
0,tt0000012,,,
1,tt0000417,tt0000417,nm0617588,"nm0617588,nm0894523,nm0920229"
2,tt0000439,tt0000439,nm0692105,"nm1145809,nm0692105"
3,tt0004972,tt0004972,nm0000428,"nm0228746,nm0000428,nm0940488"
4,tt0006333,tt0006333,nm0665737,"nm0894523,nm0665737"


In [13]:
unique_movies = needed_movies_joined_to_directors.movie_id.unique()
unique_movies

array(['tt0000012', 'tt0000417', 'tt0000439', ..., 'tt9412098',
       'tt9495224', 'tt9573980'], dtype=object)

In [14]:
needed_movies_joined_to_directors.movie_id.nunique()

13795

In [15]:
needed_movies_joined_to_directors.directors.nunique()

5569

In [16]:
np.nan

nan

In [17]:
# needed_movies_joined_to_directors.loc[:,'writers'] = np.where(needed_movies_joined_to_directors.writers == r'\N',
#                                                               np.nan,
#                                                               needed_movies_joined_to_directors.writers)

In [18]:
needed_movies_joined_to_directors = (needed_movies_joined_to_directors[~
                               needed_movies_joined_to_directors.directors.isna()])

In [19]:
needed_movies_joined_to_directors[needed_movies_joined_to_directors.directors.isna()]

Unnamed: 0,movie_id,tconst,directors,writers


In [20]:
needed_movies_joined_to_directors.shape

(13512, 4)

In [21]:
all_needed_directors = pd.Series(needed_movies_joined_to_directors.directors.unique())

In [22]:
all_needed_directors.shape

(5569,)

In [23]:
all_needed_directors.head()

0    nm0617588
1    nm0692105
2    nm0000428
3    nm0665737
4    nm0000122
dtype: object

In [24]:
cleaned_all_needed_directors = []
for pair in [person.split(',') for person in all_needed_directors]:
    for item in pair:
        cleaned_all_needed_directors.append(item)

In [25]:
cleaned_all_needed_directors = pd.DataFrame(np.array(cleaned_all_needed_directors), columns=['nconst'])
cleaned_all_needed_directors.head()

Unnamed: 0,nconst
0,nm0617588
1,nm0692105
2,nm0000428
3,nm0665737
4,nm0000122


In [26]:
cleaned_all_needed_directors.shape

(7013, 1)

In [27]:
'nm5278146' in cleaned_all_needed_directors.nconst

False

In [28]:
dfs['imdb_crew'].head()

Unnamed: 0,tconst,directors,writers
8,tt0000009,nm0085156,nm0085156
34,tt0000036,nm0005690,nm0410331
74,tt0000076,nm0005690,nm0410331
89,tt0000091,nm0617588,nm0617588
106,tt0000108,nm0005690,nm0410331


In [29]:
dfs['imdb_crew'][dfs['imdb_crew'].writers.isna()]

Unnamed: 0,tconst,directors,writers


In [30]:
cleaned_all_needed_directors[cleaned_all_needed_directors.nconst.isna()]

Unnamed: 0,nconst


In [31]:
# * NOTE some directors exist in the directors field for movies but in the imdb crew are not found
#  using this we can find these cases how='left')
# 618 directors dont exist in the imdb crew dataset

In [32]:
cleaned_all_needed_directors.head()

Unnamed: 0,nconst
0,nm0617588
1,nm0692105
2,nm0000428
3,nm0665737
4,nm0000122


In [33]:
dfs['imdb_crew'].head()

Unnamed: 0,tconst,directors,writers
8,tt0000009,nm0085156,nm0085156
34,tt0000036,nm0005690,nm0410331
74,tt0000076,nm0005690,nm0410331
89,tt0000091,nm0617588,nm0617588
106,tt0000108,nm0005690,nm0410331


In [34]:
directors_and_movies = pd.merge(cleaned_all_needed_directors, 
                                dfs['imdb_crew'],
                                left_on='nconst',
                                right_on='directors',
                                how='inner')
directors_and_movies.head()

Unnamed: 0,nconst,tconst,directors,writers
0,nm0617588,tt0000091,nm0617588,nm0617588
1,nm0617588,tt0000132,nm0617588,nm0617588
2,nm0617588,tt0000138,nm0617588,nm0617588
3,nm0617588,tt0000211,nm0617588,nm0617588
4,nm0617588,tt0000218,nm0617588,nm0617588


In [35]:
directors_and_movies.tconst.nunique()

155889

In [36]:
# filter this set down by only the movies we have access to.
directors_and_movies_filtered = directors_and_movies[directors_and_movies.tconst.isin(unique_movies)]
directors_and_movies_filtered.tconst.nunique()

12509

In [37]:
# check how many movies each director was in
s = directors_and_movies.groupby('nconst').count()['tconst'].sort_values(ascending=False)
s.head()

nconst
nm0455741    1788
nm0005062    1768
nm0814716    1570
nm1853544    1080
nm0400958     910
Name: tconst, dtype: int64

In [38]:
s.tail()

nconst
nm0095298    1
nm0582481    1
nm5169133    1
nm1703612    1
nm2499640    1
Name: tconst, dtype: int64

In [39]:
directors_movie_counts = s.reset_index()
directors_movie_counts[directors_movie_counts['nconst'] == 'nm0455741']

Unnamed: 0,nconst,tconst
0,nm0455741,1788


In [40]:
# EDA this is the average number of movies each director in our dataset directed
s.mean()

35.21098901098901

In [41]:
directors_and_movies = directors_and_movies[['directors', 'tconst']]
directors_and_movies.columns = ['directors', 'movies']

In [42]:
# load movies, ids, releveance into df
# join this with directors and movies on movies id
# aggregate on [directors, movies] and apply aggregate function (mean) (or weighted based on rating)

In [43]:
sql = '''
select *
from tag_relevance
join movies
on fk_id = id
limit 50000000
'''
c
df_movie_tags = pd.read_sql(sql, con=conn)
df_movie_tags.head()

Unnamed: 0,fk_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres
0,tt0000012,1,0.045,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"
1,tt0000012,2,0.04225,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"
2,tt0000012,3,0.03475,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"
3,tt0000012,4,0.0375,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"
4,tt0000012,5,0.21475,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"


In [44]:
# df_movie_tags.groupby('fk_id').count().tag_id.unique()

In [45]:
# NOTE: fix nans from being joined above
# for now just remove

In [46]:
# directors_and_movies = directors_and_movies[~directors_and_movies.tconst.isna()]

In [47]:
directors_and_movies.head()

Unnamed: 0,directors,movies
0,nm0617588,tt0000091
1,nm0617588,tt0000132
2,nm0617588,tt0000138
3,nm0617588,tt0000211
4,nm0617588,tt0000218


In [48]:
directors_and_movies[directors_and_movies['directors'] == 'nm0000008']

Unnamed: 0,directors,movies
39204,nm0000008,tt0055257


In [49]:
# directors_and_movies.loc[:,'movie_id'] = (directors_and_movies.loc[:,'tconst'].str[2:]).astype(int)
# directors_and_movies.head()

In [50]:
# df_movie_tags.head()

In [51]:
df_movie_tags.head()

Unnamed: 0,fk_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres
0,tt0000012,1,0.045,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"
1,tt0000012,2,0.04225,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"
2,tt0000012,3,0.03475,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"
3,tt0000012,4,0.0375,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"
4,tt0000012,5,0.21475,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,"Action,Documentary,Short"


In [52]:
directors_and_movies.head()

Unnamed: 0,directors,movies
0,nm0617588,tt0000091
1,nm0617588,tt0000132
2,nm0617588,tt0000138
3,nm0617588,tt0000211
4,nm0617588,tt0000218


In [53]:
directors_and_movies.directors.nunique()

5460

In [54]:
directors_and_movies.groupby('directors').count().sort_values('movies', ascending=False)

Unnamed: 0_level_0,movies
directors,Unnamed: 1_level_1
nm0455741,1788
nm0005062,1768
nm0814716,1570
nm1853544,1080
nm0400958,910
...,...
nm0000092,1
nm0829518,1
nm1671943,1
nm0254357,1


In [55]:
# directors_and_movies.groupby('movies').count().sort_values('directors', ascending=False)

In [56]:
df_movie_tags[df_movie_tags['fk_id'] == 'tt0032544']

Unnamed: 0,fk_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres


In [57]:
merged_df = pd.merge(directors_and_movies, df_movie_tags, left_on='movies', right_on='fk_id', how='inner')
# merged_df[~merged_df.tag_id.isna()].head()
merged_df.shape

(489552, 12)

In [58]:
merged_df[merged_df['directors'] == 'nm0455741']

Unnamed: 0,directors,movies,fk_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres


In [59]:
merged_df.shape

(489552, 12)

In [60]:
directors_and_movies.directors.nunique() * 1128

6158880

In [61]:
directors_tag_relevance = merged_df.groupby(['directors', 'tag_id'])['relevance'].mean().reset_index()
directors_tag_relevance

Unnamed: 0,directors,tag_id,relevance
0,nm0000008,1,0.04475
1,nm0000008,10,0.01650
2,nm0000008,100,0.61000
3,nm0000008,1000,0.04275
4,nm0000008,1001,0.04375
...,...,...,...
277483,nm0958387,995,0.11225
277484,nm0958387,996,0.06875
277485,nm0958387,997,0.05275
277486,nm0958387,998,0.14750


In [62]:
15584448
current rows
updated rows
15861936


SyntaxError: invalid syntax (<ipython-input-62-9ffc10d712e2>, line 2)

In [63]:
fk_id text NOT NULL,
    tag_id name text NOT NULL,
    relevance real NOT NULL

SyntaxError: invalid syntax (<ipython-input-63-2345dc278d65>, line 1)

In [64]:
directors_tag_relevance.columns = ['fk_id', 'tag_id', 'relevance']

In [65]:
directors_tag_relevance.to_sql('tag_relevance', conn, if_exists='append', index=False)

In [66]:
15861936 - 15584448

277488

In [67]:
directors_tag_relevance.shape

(277488, 3)

In [68]:
directors_tag_relevance[directors_tag_relevance['directors'] == 'nm0958387'].relevance.max()

KeyError: 'directors'

In [69]:
directors_tag_relevance[directors_tag_relevance['directors'] == 'nm0958387'][directors_tag_relevance[directors_tag_relevance['directors'] == 'nm0958387']['relevance']== 0.993]

KeyError: 'directors'

In [70]:
directors_and_movies[directors_and_movies.directors == 'nm0958387']

Unnamed: 0,directors,movies
71350,nm0958387,tt0099653
71351,nm0958387,tt0113071
71352,nm0958387,tt0250687
71353,nm0958387,tt0099653
71354,nm0958387,tt0113071
71355,nm0958387,tt0250687
71356,nm0958387,tt0099653
71357,nm0958387,tt0113071
71358,nm0958387,tt0250687
71359,nm0958387,tt0099653


In [71]:
dfs.keys()

dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics', 'imdb_title_basics', 'imdb_ratings', 'imdb_crew', 'imdb_principals'])

In [72]:
dfs['imdb_principals']

Unnamed: 0,tconst,ordering,nconst,category,job,characters


In [73]:
dfs['imdb_crew']

Unnamed: 0,tconst,directors,writers
8,tt0000009,nm0085156,nm0085156
34,tt0000036,nm0005690,nm0410331
74,tt0000076,nm0005690,nm0410331
89,tt0000091,nm0617588,nm0617588
106,tt0000108,nm0005690,nm0410331
...,...,...,...
6691700,tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
6691701,tt9916850,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
6691702,tt9916852,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
6691703,tt9916856,nm10538645,nm6951431


In [74]:
dfs['genome_tags'][dfs['genome_tags'].tagId == 43]

Unnamed: 0,tagId,tag
42,43,alien


In [75]:
directors_tag_relevance[directors_tag_relevance==0.99925]

Unnamed: 0,fk_id,tag_id,relevance
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
277483,,,
277484,,,
277485,,,
277486,,,


In [76]:
directors_tag_relevance = pd.DataFrame(directors_tag_relevance).reset_index()
directors_tag_relevance.directors.nunique()

AttributeError: 'DataFrame' object has no attribute 'directors'

In [77]:
pd.DataFrame(directors_tag_relevance)

Unnamed: 0,index,fk_id,tag_id,relevance
0,0,nm0000008,1,0.04475
1,1,nm0000008,10,0.01650
2,2,nm0000008,100,0.61000
3,3,nm0000008,1000,0.04275
4,4,nm0000008,1001,0.04375
...,...,...,...,...
277483,277483,nm0958387,995,0.11225
277484,277484,nm0958387,996,0.06875
277485,277485,nm0958387,997,0.05275
277486,277486,nm0958387,998,0.14750


In [78]:
dfs.keys()

dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics', 'imdb_title_basics', 'imdb_ratings', 'imdb_crew', 'imdb_principals'])

In [79]:
dfs['imdb_name_basics'].head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0072308,tt0053137,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0037382,tt0117057,tt0071877"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0050976,tt0069467,tt0050986"
5,nm0000006,Ingrid Bergman,1915,1982,"actress,soundtrack,producer","tt0036855,tt0038109,tt0038787,tt0071877"


In [102]:
needed_directors_and_names = pd.merge(cleaned_all_needed_directors,dfs['imdb_name_basics'],
                                      left_on='nconst', right_on='nconst', how='left' )
toy_data = needed_directors_and_names.head(100)

In [103]:
## dumping into csv for toy data for visualization
toy_data.to_csv('toy_director_movies.csv')

In [97]:
needed_directors_and_names = needed_directors_and_names[['nconst','primaryName']]

In [82]:
dfs['directors'] = needed_directors_and_names
dfs['directors_relevence'] = directors_tag_relevance

In [83]:
dfs['directors']

Unnamed: 0,nconst,primaryName
0,nm0617588,Georges Méliès
1,nm0692105,Edwin S. Porter
2,nm0000428,D.W. Griffith
3,nm0665737,Stuart Paton
4,nm0000122,Charles Chaplin
...,...,...
7008,nm0737517,
7009,nm0319213,
7010,nm5255757,
7011,nm0636675,


In [84]:
# 
#
directors_table = """
CREATE TABLE IF NOT EXISTS directors (
    id text PRIMARY KEY,
    name text
);
"""

director_relevence_table = """
CREATE TABLE IF NOT EXISTS director_tag_relevance (
    director_id integer,
    tag_id name text NOT NULL,
    relevance real NOT NULL
);
"""
table_sql = {
    'directors'                 : directors_table,
    'directors_relevence'       : director_relevence_table,
}

table_data_source = {
    'directors'                 : 'directors',
    'directors_relevence'       : 'directors_relevence',
}

for table_name in table_sql:
    # create table
    c.execute(table_sql[table_name])
    # populate table with data
    df = dfs[table_data_source[table_name]]
    df.to_sql(table_sql[table_name], conn, if_exists='append', index=False)

In [85]:
# get needed movie ids
sql = """
select id from movies ;
"""
c.execute(sql)
movie_ids = c.fetchall()
needed_movies = [m[0] for m in movie_ids]
needed_movies[:5]  # check first 5

['tt0000012', 'tt0000417', 'tt0000439', 'tt0004972', 'tt0006333']

In [86]:
# director aggregated tag score
# average for each tag for all of the movies they directed


In [87]:
# get needed movie ids
sql = """
select * from tags ;
"""
c.execute(sql)
tag_ids = c.fetchall()
all_tags = [t for t in tag_ids]
all_tags[:5]  # check first 5

[(1, '007'),
 (2, '007 (series)'),
 (3, '18th century'),
 (4, '1920s'),
 (5, '1930s')]

In [88]:
# get needed movie ids
sql = """
select * from tag_relevance;
"""
c.execute(sql)
movie_ids = c.fetchall()
needed_movies = [m for m in movie_ids]
needed_movies[:5]  # check first 5
# movie, tag, relevance

[('tt0000001', '1', 0.02875),
 ('tt0000001', '2', 0.023749999999999997),
 ('tt0000001', '3', 0.0625),
 ('tt0000001', '4', 0.07574999999999997),
 ('tt0000001', '5', 0.14075)]

In [107]:
movie_ids

[('tt0000001', '1', 0.02875),
 ('tt0000001', '2', 0.023749999999999997),
 ('tt0000001', '3', 0.0625),
 ('tt0000001', '4', 0.07574999999999997),
 ('tt0000001', '5', 0.14075),
 ('tt0000001', '6', 0.14675),
 ('tt0000001', '7', 0.0635),
 ('tt0000001', '8', 0.20375),
 ('tt0000001', '9', 0.20199999999999999),
 ('tt0000001', '10', 0.03075),
 ('tt0000001', '11', 0.58025),
 ('tt0000001', '12', 0.10249999999999998),
 ('tt0000001', '13', 0.20175),
 ('tt0000001', '14', 0.007000000000000006),
 ('tt0000001', '15', 0.024500000000000025),
 ('tt0000001', '16', 0.17275),
 ('tt0000001', '17', 0.016500000000000015),
 ('tt0000001', '18', 0.10399999999999998),
 ('tt0000001', '19', 0.6625),
 ('tt0000001', '20', 0.30075),
 ('tt0000001', '21', 0.31675),
 ('tt0000001', '22', 0.28600000000000003),
 ('tt0000001', '23', 0.06274999999999997),
 ('tt0000001', '24', 0.01924999999999999),
 ('tt0000001', '25', 0.05875000000000001),
 ('tt0000001', '26', 0.07924999999999999),
 ('tt0000001', '27', 0.1965),
 ('tt0000001', '

In [89]:
dfs.keys()

dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics', 'imdb_title_basics', 'imdb_ratings', 'imdb_crew', 'imdb_principals', 'directors', 'directors_relevence'])

In [90]:
dfs['']

KeyError: ''

In [None]:
# director_id, tag_id, relevence_score
# 1, 1, .5
# 1, 2, .2
# ....

In [None]:
# actor_id, tag_id, relevence_score
# 1, 1, .5
# 1, 2, .2
# ....

In [91]:
s.head()

nconst
nm0455741    1788
nm0005062    1768
nm0814716    1570
nm1853544    1080
nm0400958     910
Name: tconst, dtype: int64

Unnamed: 0,movie_id,tconst,directors,writers
1,tt0000417,tt0000417,nm0617588,"nm0617588,nm0894523,nm0920229"
2,tt0000439,tt0000439,nm0692105,"nm1145809,nm0692105"
3,tt0004972,tt0004972,nm0000428,"nm0228746,nm0000428,nm0940488"
4,tt0006333,tt0006333,nm0665737,"nm0894523,nm0665737"
5,tt0006864,tt0006864,nm0000428,"nm0048512,nm0115218,nm0000428,nm0002616,nm0640..."


In [15]:
actors

actor_id, name, tag_id, tag_value

NameError: name 'actors' is not defined