In [2]:
import pandas as pd
import numpy as np
import zipfile
import sqlite3

In [3]:
data_sources = {
    # movielens
    'genome_scores'    : './data/ml-25m/genome-scores.csv',
    'genome_tags'      : './data/ml-25m/genome-tags.csv',
    'links'            : './data/ml-25m/links.csv',
    'movies'           : './data/ml-25m/movies.csv',
    'ratings'          : './data/ml-25m/ratings.csv',
    'tags'             : './data/ml-25m/tags.csv',
#     imdb
    'imdb_name_basics' : './data/name.basics.tsv.gz',
    'imdb_title_basics': './data/title.basics.tsv.gz',
    'imdb_ratings'     : './data/title.ratings.tsv.gz',
    'imdb_crew'        : './data/title.crew.tsv.gz',
    'imdb_principals'  : './data/title.principals.tsv.gz',
}

In [4]:
dfs = {}
for name, path in data_sources.items():
    print(dfs.keys(), end='\r')
    separator = '\t' if 'tsv' in path else ','
    dfs[name] = pd.read_csv(path, sep=separator)

dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics'])

  interactivity=interactivity, compiler=compiler, result=result)


dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics', 'imdb_title_basics', 'imdb_ratings', 'imdb_crew'])

# connect to database

In [5]:
db_name = './movie_sqlite.db'
conn = sqlite3.connect(db_name)
c = conn.cursor()

In [6]:
# get needed movie ids
sql = """
select id from movies ;
"""
c.execute(sql)
movie_ids = c.fetchall()
needed_movies = [m[0] for m in movie_ids]
needed_movies[:5]  # check first 5

['100024', '100029', '100046', '100049', '100050']

In [7]:
df_needed_movies = pd.DataFrame(needed_movies)
df_needed_movies.columns = ['movie_id_int']
df_needed_movies.head()

Unnamed: 0,movie_id_int
0,100024
1,100029
2,100046
3,100049
4,100050


In [8]:
# actors table
# directors table

In [9]:
df_needed_movies.columns = ['movie_id_int']

In [10]:
df_needed_movies.head()

Unnamed: 0,movie_id_int
0,100024
1,100029
2,100046
3,100049
4,100050


In [11]:
df_needed_movies['str_id'] = 'tt' + df_needed_movies['movie_id_int'].str.zfill(7)

In [12]:
needed_movies_joined_to_directors = pd.merge(df_needed_movies,
                                             dfs['imdb_crew'],
                                             left_on='str_id',
                                             right_on='tconst',
                                             how='left')
needed_movies_joined_to_directors.head()

Unnamed: 0,movie_id_int,str_id,tconst,directors,writers
0,100024,tt0100024,tt0100024,nm0005139,nm0005139
1,100029,tt0100029,tt0100029,nm0504802,"nm0913027,nm0000241,nm0504802"
2,100046,tt0100046,tt0100046,nm0669004,nm0179896
3,100049,tt0100049,tt0100049,nm0720000,nm0524108
4,100050,tt0100050,tt0100050,nm0002132,"nm0002132,nm0411477"


In [13]:
needed_movies_joined_to_directors = (
    needed_movies_joined_to_directors[needed_movies_joined_to_directors.directors != r'\N'])

In [14]:
all_needed_directors = pd.Series(needed_movies_joined_to_directors.directors.unique())

In [15]:
cleaned_all_needed_directors = []
for pair in [person.split(',') for person in all_needed_directors]:
    for item in pair:
        cleaned_all_needed_directors.append(item)

In [16]:
cleaned_all_needed_directors = pd.DataFrame(np.array(cleaned_all_needed_directors), columns=['nconst'])
cleaned_all_needed_directors.head()

Unnamed: 0,nconst
0,nm0005139
1,nm0504802
2,nm0669004
3,nm0720000
4,nm0002132


In [17]:
directors_and_movies = pd.merge(cleaned_all_needed_directors, 
                                dfs['imdb_crew'],
                                left_on='nconst',
                                right_on='directors',
                                how='left')
directors_and_movies.head()

Unnamed: 0,nconst,tconst,directors,writers
0,nm0005139,tt0066842,nm0005139,nm0005139
1,nm0005139,tt0070153,nm0005139,nm0005139
2,nm0005139,tt0073531,nm0005139,nm0005139
3,nm0005139,tt0074759,nm0005139,nm0005139
4,nm0005139,tt0074988,nm0005139,nm0005139


In [18]:
# check how many movies each director was in
s = directors_and_movies.groupby('nconst').count()['tconst'].sort_values(ascending=False)
s.head()

nconst
nm0455741    1968
nm0005062    1617
nm0814716    1573
nm1853544    1089
nm0400958     924
Name: tconst, dtype: int64

In [19]:
directors_and_movies = directors_and_movies[['directors', 'tconst']]

In [20]:
directors_and_movies

Unnamed: 0,directors,tconst
0,nm0005139,tt0066842
1,nm0005139,tt0070153
2,nm0005139,tt0073531
3,nm0005139,tt0074759
4,nm0005139,tt0074988
...,...,...
222972,nm0955443,tt5679696
222973,nm0955443,tt6864046
222974,nm0955443,tt8959680
222975,nm0271835,tt0186537


In [21]:
# load movies, ids, releveance into df
# join this with directors and movies on movies id
# aggregate on [directors, movies] and apply aggregate function (mean) (or weighted based on rating)

In [42]:
sql = '''
select *
from tag_relevance tr
join movies m
on tr.movie_id = m.id
limit 10000
'''
c
df_movie_tags = pd.read_sql(sql, con=conn)
df_movie_tags.head()

Unnamed: 0,movie_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres
0,12,1,0.045,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
1,12,10,0.0405,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
2,12,100,0.281,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
3,12,1000,0.04825,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
4,12,1001,0.0965,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N


In [43]:
# NOTE: fix nans from being joined above
# for now just remove

In [44]:
directors_and_movies = directors_and_movies[~directors_and_movies.tconst.isna()]

In [45]:
directors_and_movies.loc[:,'movie_id'] = (directors_and_movies.loc[:,'tconst'].str[2:]).astype(int)
directors_and_movies.head()

Unnamed: 0,directors,tconst,movie_id
0,nm0005139,tt0066842,66842
1,nm0005139,tt0070153,70153
2,nm0005139,tt0073531,73531
3,nm0005139,tt0074759,74759
4,nm0005139,tt0074988,74988


In [46]:
df_movie_tags.head()

Unnamed: 0,movie_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres
0,12,1,0.045,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
1,12,10,0.0405,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
2,12,100,0.281,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
3,12,1000,0.04825,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
4,12,1001,0.0965,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N


In [47]:
df_movie_tags.head()

Unnamed: 0,movie_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres
0,12,1,0.045,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
1,12,10,0.0405,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
2,12,100,0.281,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
3,12,1000,0.04825,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N
4,12,1001,0.0965,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,1896,1,\N


In [48]:
directors_and_movies.head()

Unnamed: 0,directors,tconst,movie_id
0,nm0005139,tt0066842,66842
1,nm0005139,tt0070153,70153
2,nm0005139,tt0073531,73531
3,nm0005139,tt0074759,74759
4,nm0005139,tt0074988,74988


In [51]:
merged_df = pd.merge(directors_and_movies, df_movie_tags, left_on='movie_id', right_on='movie_id', how='left')
merged_df[~merged_df.tag_id.isna()].head()

Unnamed: 0,directors,tconst,movie_id,tag_id,relevance,id,kind,primary_title,original_title,release_year,runtime_minutes,genres
48036,nm0000428,tt0004972,4972,1,0.04925,4972,movie,The Birth of a Nation,The Birth of a Nation,1915.0,195.0,\N
48037,nm0000428,tt0004972,4972,10,0.06725,4972,movie,The Birth of a Nation,The Birth of a Nation,1915.0,195.0,\N
48038,nm0000428,tt0004972,4972,100,0.19575,4972,movie,The Birth of a Nation,The Birth of a Nation,1915.0,195.0,\N
48039,nm0000428,tt0004972,4972,1000,0.108,4972,movie,The Birth of a Nation,The Birth of a Nation,1915.0,195.0,\N
48040,nm0000428,tt0004972,4972,1001,0.06675,4972,movie,The Birth of a Nation,The Birth of a Nation,1915.0,195.0,\N


In [57]:
directors_tag_relevance = merged_df.groupby(['directors', 'tag_id'])['relevance'].mean()
directors_tag_relevance.max()

0.99925

In [59]:
dfs['genome_tags'][dfs['genome_tags'].tagId == 688]

Unnamed: 0,tagId,tag
687,688,mutants


In [58]:
directors_tag_relevance[directors_tag_relevance==0.99925]

directors  tag_id
nm0665737  688       0.99925
Name: relevance, dtype: float64

In [73]:
directors_tag_relevance = pd.DataFrame(directors_tag_relevance).reset_index()
directors_tag_relevance.directors.nunique()

6

In [68]:
pd.DataFrame(directors_tag_relevance)

directors  tag_id
nm0000122  1         0.031500
           10        0.030625
           100       0.106750
           1000      0.140125
           1001      0.035625
                       ...   
nm0903049  858       0.063750
           859       0.018250
           86        0.453000
           860       0.017000
           861       0.007000
Name: relevance, Length: 6616, dtype: float64

In [76]:
dfs.keys()

dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics', 'imdb_title_basics', 'imdb_ratings', 'imdb_crew', 'imdb_principals'])

In [78]:
dfs['imdb_name_basics'].head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,\N,\N,"soundtrack,actor,miscellaneous","tt0053137,tt0072308,tt0050419,tt0043044"
1,nm0000002,Lauren Bacall,\N,\N,"actress,soundtrack","tt0071877,tt0117057,tt0037382,tt0038355"
2,nm0000003,Brigitte Bardot,\N,\N,"actress,soundtrack,producer","tt0059956,tt0057345,tt0054452,tt0049189"
3,nm0000004,John Belushi,\N,\N,"actor,soundtrack,writer","tt0072562,tt0077975,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,\N,\N,"writer,director,actor","tt0050976,tt0069467,tt0083922,tt0050986"


In [81]:
needed_directors_and_names = pd.merge(cleaned_all_needed_directors,dfs['imdb_name_basics'],
                                      left_on='nconst', right_on='nconst', how='left' )
needed_directors_and_names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0005139,Mike Leigh,\N,\N,"director,writer,actor","tt1431181,tt0117589,tt0107653,tt1045670"
1,nm0504802,Sheldon Lettich,\N,\N,"writer,director,producer","tt0100029,tt0095956,tt3369806,tt0101764"
2,nm0669004,Richard Pearce,\N,\N,"director,cinematographer,producer","tt0406429,tt0079261,tt1286537,tt0082508"
3,nm0720000,Norman René,\N,\N,"director,producer","tt0105165,tt0176357,tt0114241,tt0100049"
4,nm0002132,Amy Heckerling,\N,\N,"writer,director,producer","tt0115137,tt0097778,tt0083929,tt3973820"


In [84]:
needed_directors_and_names = needed_directors_and_names[['nconst','primaryName']]

In [87]:
dfs['directors'] = needed_directors_and_names
dfs['directors_relevence'] = directors_tag_relevance

In [91]:
dfs['directors']

Unnamed: 0,nconst,primaryName
0,nm0005139,Mike Leigh
1,nm0504802,Sheldon Lettich
2,nm0669004,Richard Pearce
3,nm0720000,Norman René
4,nm0002132,Amy Heckerling
...,...,...
7290,nm0540330,Bruce Malmuth
7291,nm0822582,Richard Stanley
7292,nm0663489,James D. Parriott
7293,nm0955443,Yimou Zhang


In [93]:
# 
#
directors_table = """
CREATE TABLE IF NOT EXISTS directors (
    id text PRIMARY KEY,
    name text
);
"""

director_relevence_table = """
CREATE TABLE IF NOT EXISTS director_tag_relevance (
    director_id integer,
    tag_id name text NOT NULL,
    relevance real NOT NULL
);
"""
table_sql = {
    'directors'                 : directors_table,
    'directors_relevence'       : director_relevence_table,
}

table_data_source = {
    'directors'                 : 'directors',
    'directors_relevence'       : 'directors_relevence',
}

for table_name in table_sql:
    # create table
    c.execute(table_sql[table_name])
    # populate table with data
    df = dfs[table_data_source[table_name]]
    df.to_sql(table_sql[table_name], conn, if_exists='append', index=False)

In [None]:
# get needed movie ids
sql = """
select id from movies ;
"""
c.execute(sql)
movie_ids = c.fetchall()
needed_movies = [m[0] for m in movie_ids]
needed_movies[:5]  # check first 5

In [20]:
# director aggregated tag score
# average for each tag for all of the movies they directed


In [21]:
# get needed movie ids
sql = """
select * from tags ;
"""
c.execute(sql)
tag_ids = c.fetchall()
all_tags = [t for t in tag_ids]
all_tags[:5]  # check first 5

[(1, '007'),
 (2, '007 (series)'),
 (3, '18th century'),
 (4, '1920s'),
 (5, '1930s')]

In [22]:
# get needed movie ids
sql = """
select * from tag_relevance;
"""
c.execute(sql)
movie_ids = c.fetchall()
needed_movies = [m for m in movie_ids]
needed_movies[:5]  # check first 5
# movie, tag, relevance

[(1, '1', 0.02875),
 (1, '2', 0.023749999999999997),
 (1, '3', 0.0625),
 (1, '4', 0.07574999999999997),
 (1, '5', 0.14075)]

In [58]:
dfs.keys()

dict_keys(['genome_scores', 'genome_tags', 'links', 'movies', 'ratings', 'tags', 'imdb_name_basics', 'imdb_title_basics', 'imdb_ratings', 'imdb_crew', 'imdb_principals'])

In [None]:
dfs['']

In [None]:
# director_id, tag_id, relevence_score
# 1, 1, .5
# 1, 2, .2
# ....

In [None]:
# actor_id, tag_id, relevence_score
# 1, 1, .5
# 1, 2, .2
# ....

In [None]:
s.head()

In [15]:
actors

actor_id, name, tag_id, tag_value

NameError: name 'actors' is not defined