In [1]:
import pickle
import numpy as np
import pandas as pd
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')
tqdm.pandas()
%matplotlib inline

In [2]:
imdb_filenames = ['name.basics.tsv.gz', 'title.akas.tsv.gz', 
                  'title.basics.tsv.gz', 'title.crew.tsv.gz', 
                  'title.principals.tsv.gz', 'title.ratings.tsv.gz']

In [3]:
title_basics_df = pd.read_csv('data/title.basics.tsv.gz', compression='gzip', header=0, sep='\t', quotechar='"')
title_cast_df = pd.read_csv('data/title.principals.tsv.gz', compression='gzip', header=0, sep='\t', quotechar='"',
                           usecols = ['tconst', 'nconst'])
title_ratings_df = pd.read_csv('data/title.ratings.tsv.gz', compression='gzip', header=0, sep='\t', quotechar='"')

In [4]:
title_basics_df = title_basics_df[title_basics_df['startYear'] !='\\N']

In [5]:
title_basics_df = title_basics_df[title_basics_df['runtimeMinutes'] !='\\N']

In [6]:
title_basics_df.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [7]:
title_basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [18]:
# title_basics_df[title_basics_df['primaryTitle']=='Iron Man 3']

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
2686023,tt1300854,movie,Iron Man 3,Iron Man Three,0,2013,\N,130,"Action,Adventure,Sci-Fi"
4622805,tt2877040,tvEpisode,Iron Man 3,Iron Man 3,0,2013,\N,7,Talk-Show
4633399,tt2901620,tvEpisode,Iron Man 3,Iron Man 3,0,2013,\N,4,"Comedy,Short"
6025815,tt6059698,tvEpisode,Iron Man 3,Iron Man 3,0,2013,\N,10,Comedy


In [20]:
title_basics_df['titleType'] = title_basics_df['titleType'].astype(str)
title_basics_df['isAdult'] = title_basics_df['isAdult'].astype(int)
title_basics_df['runtimeMinutes'] = title_basics_df['runtimeMinutes'].astype(int)
title_basics_df['startYear'] = title_basics_df['startYear'].astype(int)

In [21]:
title_basics_df['startYear'].unique()

array([1894, 1892, 1893, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1904,
       1902, 1903, 1905, 1912, 1907, 1908, 1910, 1906, 1909, 1914, 1913,
       1911, 1919, 1916, 1915, 1936, 1917, 1925, 1918, 1922, 1920, 1921,
       1923, 2019, 1924, 1927, 1929, 2000, 1926, 1993, 1935, 2014, 1928,
       2004, 1942, 1930, 2011, 1934, 1931, 1932, 1939, 1937, 1933, 1950,
       1938, 1951, 1945, 1946, 1940, 1944, 1949, 1947, 1941, 1952, 1970,
       1957, 1943, 1959, 1948, 2001, 1953, 2008, 1954, 1965, 1983, 1980,
       1973, 1961, 1995, 1955, 1962, 1958, 1956, 1977, 1960, 1964, 1967,
       1968, 1963, 1969, 1985, 1971, 1966, 2021, 1976, 1990, 1986, 1972,
       2020, 1979, 1974, 1978, 1981, 1988, 1975, 1989, 2009, 1987, 2010,
       2018, 1984, 1982, 1992, 1991, 1997, 1994, 1999, 2005, 1998, 2002,
       1996, 2017, 2016, 2006, 2007, 2003, 2012, 2013, 2015, 1891, 1888,
       1890, 2023, 2022, 2026, 1878, 2027, 2025, 1887, 1874, 2024, 1883,
       1881, 1885])

In [23]:
mask = ((title_basics_df['startYear'] >= 2005) &
        (title_basics_df['startYear'] <= 2021) &
        (title_basics_df['titleType'] == 'movie') &
        (title_basics_df['isAdult'] == 0) & 
        (title_basics_df['runtimeMinutes'] >80) &
        (title_basics_df['genres'] != '') &
        (title_basics_df['genres'] != 'Documentary'))

In [24]:
## Helper Functions
def clean_year(y):
    # Return year as an integer or 'NaN' if empty
    import numpy as np
    try:
        return int(y)
    except:
        return np.nan

def clean_genre(y):
    # Return only the first genre listed
    y = str(y)
    if y == '\\N':
        return ''
    return y.split(',')[0].strip()

title_basics_df.drop('endYear', axis=1, inplace=True)
title_basics_df['startYear'] = title_basics_df['startYear'].apply(clean_year)
title_basics_df['runtimeMinutes'] = title_basics_df['runtimeMinutes'].apply(clean_year)
title_basics_df['genres'] = title_basics_df['genres'].apply(clean_genre)
title_basics_df.dropna(inplace=True, how='any', subset=['startYear', 'runtimeMinutes'])

In [12]:
title_basics_df.shape

(2162551, 8)

In [27]:
title_basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,1,Documentary
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5,Animation
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,Animation
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,12,Animation
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,Comedy


In [28]:
title_cast_df.head()

Unnamed: 0,tconst,nconst
0,tt0000001,nm1588970
1,tt0000001,nm0005690
2,tt0000001,nm0374658
3,tt0000002,nm0721526
4,tt0000002,nm1335271


In [29]:
title_cast_df.shape

(44252270, 2)

In [16]:
def f(df):
    keys, values = df.sort_values('tconst').values.T
    ukeys, index = np.unique(keys, True)
    arrays = np.split(values, index[1:])
    df2 = pd.DataFrame({'tconst':ukeys, 'nconst':[list(a) for a in arrays]})
    return df2

In [17]:
title_cast_df = f(title_cast_df)

In [18]:
title_cast_df.head()

Unnamed: 0,tconst,nconst
0,tt0000001,"[nm1588970, nm0005690, nm0374658]"
1,tt0000002,"[nm0721526, nm1335271]"
2,tt0000003,"[nm1335271, nm5442200, nm0721526, nm1770680]"
3,tt0000004,"[nm0721526, nm1335271]"
4,tt0000005,"[nm0443482, nm0653042, nm0005690, nm0249379]"


In [19]:
title_basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,1,Documentary
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5,Animation
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,Animation
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,12,Animation
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,Comedy


In [20]:
titles = title_basics_df[mask].merge(title_cast_df, on='tconst')
titles = titles.merge(title_ratings_df, on='tconst')

In [21]:
titles.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,nconst,averageRating,numVotes
0,tt0019996,movie,Hongxia,Hongxia,0,2011,94,Action,"[nm0266662, nm0920761, nm0955817, nm0946300]",6.3,52
1,tt0036177,movie,Muhomatsu no issho,Muhomatsu no issho,0,2008,100,Action,"[nm0411632, nm0594335, nm1156001, nm0412616, n...",7.3,81
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama,"[nm0004372, nm0006166, nm1475059, nm0550881, n...",6.8,6464
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,Comedy,"[nm1848380, nm0006100, nm0628399, nm0078540, n...",5.4,287
4,tt0110476,movie,Master i Margarita,Master i Margarita,0,2006,125,Drama,"[nm0804691, nm2157023, nm0119888, nm0438671, n...",6.6,577


In [22]:
titles.shape

(75314, 11)

In [23]:
with open('imdb_spider/data.pkl', 'wb') as picklefile:
    pickle.dump(titles['tconst'].values, picklefile)