In [2]:
import pandas as pd
import numpy as np
import zipfile
import sqlite3

In [3]:
# switch to control where data is read in from
data_path = ['./data/', './'][0]

In [4]:
data_sources = {
    # movielens
#     'genome_scores'    : 'ml-25m/genome-scores.csv',
    'genome_tags'      : 'ml-25m/genome-tags.csv',
    'links'            : 'ml-25m/links.csv',
    'movies'           : 'ml-25m/movies.csv',
#    'ratings'          : 'ml-25m/ratings.csv',
    'tags'             : 'ml-25m/tags.csv',
    # imdb
    'imdb_name_basics' : 'name.basics.tsv.gz',
    'imdb_title_basics': 'title.basics.tsv.gz',
#     'imdb_ratings'     : 'title.ratings.tsv.gz',
    'imdb_crew'        : 'title.crew.tsv.gz',
    'imdb_principals'  : 'title.principals.tsv.gz',
}

In [69]:
dfs = {} if 'dfs' not in globals() else dfs
for name, path in data_sources.items():
    print(dfs.keys(), end='\r')
    if name in dfs:
        continue
    separator = '\t' if 'tsv' in path else ','
    print(name, data_path+path, separator)
    dfs[name] = pd.read_csv(data_path+path, sep=separator).replace(to_replace=r'\N', value=np.nan)

imdb_name_basics ./data/name.basics.tsv.gz 	 'tags', 'imdb_title_basics', 'imdb_crew', 'imdb_principals'])
dict_keys(['genome_tags', 'links', 'movies', 'tags', 'imdb_title_basics', 'imdb_crew', 'imdb_principals', 'imdb_name_basics'])

In [6]:
# get rid of null values
for name, path in data_sources.items():
    dfs[name] = dfs[name].dropna()

# connect to database

In [7]:
db_name = './movie_sqlite.db'
conn = sqlite3.connect(db_name)
c = conn.cursor()

In [8]:
# get needed movie ids
sql = """
select id from movies;
"""
c.execute(sql)
movie_ids = c.fetchall()
needed_movies = [m[0] for m in movie_ids]
needed_movies[:5]  # check first 5

['tt0000012', 'tt0000417', 'tt0000439', 'tt0004972', 'tt0006333']

In [9]:
len(needed_movies)

13795

In [10]:
df_needed_movies = pd.DataFrame(needed_movies)
df_needed_movies.columns = ['movie_id']
df_needed_movies.head()

Unnamed: 0,movie_id
0,tt0000012
1,tt0000417
2,tt0000439
3,tt0004972
4,tt0006333


In [11]:
needed_movies_joined_to_directors = pd.merge(df_needed_movies,
                                             dfs['imdb_crew'],
                                             left_on='movie_id',
                                             right_on='tconst',
                                             how='left')
needed_movies_joined_to_directors.head()

Unnamed: 0,movie_id,tconst,directors,writers
0,tt0000012,,,
1,tt0000417,tt0000417,nm0617588,"nm0617588,nm0894523,nm0920229"
2,tt0000439,tt0000439,nm0692105,"nm1145809,nm0692105"
3,tt0004972,tt0004972,nm0000428,"nm0228746,nm0000428,nm0940488"
4,tt0006333,tt0006333,nm0665737,"nm0894523,nm0665737"


In [12]:
unique_movies = needed_movies_joined_to_directors.movie_id.unique()
unique_movies

array(['tt0000012', 'tt0000417', 'tt0000439', ..., 'tt9412098',
       'tt9495224', 'tt9573980'], dtype=object)

In [13]:
needed_movies_joined_to_directors.movie_id.nunique()

13795

In [14]:
needed_movies_joined_to_directors.directors.nunique()

5568

In [19]:
# number of movies without a director listed
needed_movies_joined_to_directors[needed_movies_joined_to_directors.directors.isna()].shape

(283, 4)

In [20]:
# get rid of these rows without directors
needed_movies_joined_to_directors = (
    needed_movies_joined_to_directors[~needed_movies_joined_to_directors.directors.isna()])

In [21]:
# double check they are gone
needed_movies_joined_to_directors[needed_movies_joined_to_directors.directors.isna()]

Unnamed: 0,movie_id,tconst,directors,writers


In [22]:
needed_movies_joined_to_directors.shape

(13512, 4)

In [23]:
all_needed_directors = pd.Series(needed_movies_joined_to_directors.directors.unique())

In [37]:
cleaned_all_needed_directors = []
for pair in [person.split(',') for person in all_needed_directors]:
    for item in pair:
        cleaned_all_needed_directors.append(item)
cleaned_all_needed_directors = pd.DataFrame(np.array(cleaned_all_needed_directors), columns=['nconst'])
# cleaned_all_needed_directors.columns = ['entity_id']
cleaned_all_needed_directors.head()

Unnamed: 0,nconst
0,nm0617588
1,nm0692105
2,nm0000428
3,nm0665737
4,nm0000122


In [38]:
# * NOTE some directors exist in the directors field for movies but in the imdb crew are not found
#  using this we can find these cases how='left')
# 618 directors dont exist in the imdb crew dataset

In [39]:
directors_and_movies = pd.merge(cleaned_all_needed_directors, 
                                dfs['imdb_crew'],
                                left_on='nconst',
                                right_on='directors',
                                how='inner')
directors_and_movies.head()

Unnamed: 0,nconst,tconst,directors,writers
0,nm0617588,tt0000091,nm0617588,nm0617588
1,nm0617588,tt0000132,nm0617588,nm0617588
2,nm0617588,tt0000138,nm0617588,nm0617588
3,nm0617588,tt0000211,nm0617588,nm0617588
4,nm0617588,tt0000218,nm0617588,nm0617588


In [40]:
directors_and_movies.tconst.nunique()

155653

In [41]:
# filter this set down by only the movies we have access to.
directors_and_movies_filtered = directors_and_movies[directors_and_movies.tconst.isin(unique_movies)]
directors_and_movies_filtered.tconst.nunique()

12513

In [42]:
# check how many movies each director was in
s = directors_and_movies.groupby('nconst').count()['tconst'].sort_values(ascending=False)
s.head()

nconst
nm0455741    1788
nm0814716    1570
nm0005062    1540
nm1853544    1080
nm0400958     904
Name: tconst, dtype: int64

In [43]:
s.tail()

nconst
nm5169133    1
nm1027986    1
nm0834893    1
nm1027203    1
nm0664920    1
Name: tconst, dtype: int64

In [44]:
directors_movie_counts = s.reset_index()
directors_movie_counts[directors_movie_counts['nconst'] == 'nm0455741']

Unnamed: 0,nconst,tconst
0,nm0455741,1788


In [45]:
# EDA this is the average number of movies each director in our dataset directed
s.mean()

35.10498351044338

In [46]:
directors_and_movies = directors_and_movies[['directors', 'tconst']]
directors_and_movies.columns = ['directors', 'movies']

In [63]:
# load movies, ids, releveance into df
# join this with directors and movies on movies id
# aggregate on [directors, movies] and apply aggregate function (mean) (or weighted based on rating)

In [47]:
sql = '''
select *
from tag_relevance
join movies
on fk_id = id
'''
c
df_movie_tags = pd.read_sql(sql, con=conn)
df_movie_tags.head()

Unnamed: 0,fk_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres
0,tt0000012,1,0.045,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
1,tt0000012,2,0.04225,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
2,tt0000012,3,0.03475,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
3,tt0000012,4,0.0375,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
4,tt0000012,5,0.21475,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N


In [48]:
df_movie_tags.groupby('fk_id').count().tag_id.unique()

array([1128])

In [229]:
# directors_and_movies.loc[:,'movie_id'] = (directors_and_movies.loc[:,'tconst'].str[2:]).astype(int)
# directors_and_movies.head()

In [230]:
# df_movie_tags.head()

In [49]:
directors_and_movies.directors.nunique()

5458

In [50]:
# directors_and_movies.groupby('directors').count().sort_values('movies', ascending=False)

In [51]:
# directors_and_movies.groupby('movies').count().sort_values('directors', ascending=False)

In [52]:
merged_df = pd.merge(directors_and_movies, df_movie_tags, left_on='movies', right_on='fk_id', how='inner')
# merged_df[~merged_df.tag_id.isna()].head()
merged_df.shape

(489552, 12)

In [53]:
directors_tag_relevance = merged_df.groupby(['directors', 'tag_id'])['relevance'].mean().reset_index()
directors_tag_relevance

Unnamed: 0,directors,tag_id,relevance
0,nm0000008,1,0.04475
1,nm0000008,10,0.01650
2,nm0000008,100,0.61000
3,nm0000008,1000,0.04275
4,nm0000008,1001,0.04375
...,...,...,...
277483,nm0958387,995,0.11225
277484,nm0958387,996,0.06875
277485,nm0958387,997,0.05275
277486,nm0958387,998,0.14750


In [54]:
directors_tag_relevance.columns = ['fk_id', 'tag_id', 'relevance']

In [277]:
# directors_tag_relevance.to_sql('tag_relevance', conn, if_exists='append', index=False)

In [55]:
# directors_tag_relevance[directors_tag_relevance['directors'] == 'nm0958387'].relevance.max()

In [56]:
# directors_tag_relevance[directors_tag_relevance['directors'] == 'nm0958387'][directors_tag_relevance[directors_tag_relevance['directors'] == 'nm0958387']['relevance']== 0.993]

In [57]:
# directors_and_movies[directors_and_movies.directors == 'nm0958387']

In [58]:
# dfs['genome_tags'][dfs['genome_tags'].tagId == 43]

In [59]:
# directors_tag_relevance[directors_tag_relevance==0.99925]

In [60]:
# directors_tag_relevance = pd.DataFrame(directors_tag_relevance).reset_index()
# directors_tag_relevance.directors.nunique()

In [72]:
needed_directors_and_names = pd.merge(cleaned_all_needed_directors, dfs['imdb_name_basics'],
                                      left_on='nconst', right_on='nconst', how='left' )
needed_directors_and_names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0617588,Georges Méliès,,,"director,actor,producer","tt0000091,tt0000499,tt0223267,tt0002113"
1,nm0692105,Edwin S. Porter,,,"director,cinematographer,writer","tt0004057,tt0000757,tt0004654,tt0006279"
2,nm0000428,D.W. Griffith,,,"director,writer,producer","tt0009559,tt0006864,tt0004972,tt0010484"
3,nm0665737,Stuart Paton,,,"director,writer,actor","tt0008028,tt0021948,tt0012293,tt0028725"
4,nm0000122,Charles Chaplin,,,"writer,soundtrack,actor","tt0039631,tt0027977,tt0032553,tt0044837"


In [79]:
needed_directors_and_names = needed_directors_and_names[['nconst','primaryName']]

In [80]:
dfs['directors'] = needed_directors_and_names
# dfs['directors_relevence'] = directors_tag_relevance

In [82]:
dfs['directors'].head()

Unnamed: 0,nconst,primaryName
0,nm0617588,Georges Méliès
1,nm0692105,Edwin S. Porter
2,nm0000428,D.W. Griffith
3,nm0665737,Stuart Paton
4,nm0000122,Charles Chaplin


In [86]:
dfs['directors'].columns = ['id','name']

In [93]:
dfs['directors'] = dfs['directors'].drop_duplicates('id').reset_index()
dfs['directors'] = dfs['directors'][['id','name']]

In [94]:
dfs['directors'].id.nunique()

5869

In [95]:
dfs['directors'].head()

Unnamed: 0,id,name
0,nm0617588,Georges Méliès
1,nm0692105,Edwin S. Porter
2,nm0000428,D.W. Griffith
3,nm0665737,Stuart Paton
4,nm0000122,Charles Chaplin


In [97]:
directors_table = """
CREATE TABLE IF NOT EXISTS directors (
    id text PRIMARY KEY,
    name text
);
"""

table_sql = {
    'directors'                 : directors_table,
}

table_data_source = {
    'directors'                 : 'directors',
}

for table_name in table_sql:
    # create table
    c.execute(table_sql[table_name])
    # populate table with data
    df = dfs[table_data_source[table_name]]
    df.to_sql(table_name, conn, if_exists='fail', index=False)

ValueError: Table 'directors' already exists.