In [135]:
import pandas as pd
import numpy as np

In [136]:
# These gzip files have trailing garbage.
# Python's gzip module does not read GZIP files with trailing garbage.
# Let's create an equivalent of pandas.read_csv() that works around it.
# See https://stackoverflow.com/a/54608126/100904
import zlib
import io
import pandas as pd

def read_csv(path, **kwargs):
    """Read a gzip-compressed CSV/TSV into a DataFrame, tolerating trailing garbage.

    The whole file is read into memory and inflated with zlib in gzip mode,
    which stops at the end of a gzip member instead of failing on the bytes
    that follow it. Concatenated gzip members are all decoded; whatever
    comes after the last complete member is discarded.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file.
    **kwargs
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the decompressed data.

    Raises
    ------
    zlib.error
        If the file does not begin with a complete, valid gzip member.
    """
    gzip_wbits = zlib.MAX_WBITS | 16  # +16 => expect a gzip header and trailer
    gzip_magic = b'\x1f\x8b'

    with open(path, 'rb') as handle:
        raw = handle.read()

    # The first member is mandatory: fail loudly if it is missing or cut short.
    decoder = zlib.decompressobj(gzip_wbits)
    parts = [decoder.decompress(raw)]
    if not decoder.eof:
        raise zlib.error('incomplete or truncated gzip stream')
    rest = decoder.unused_data

    # Further members are optional. The first thing that is not a complete
    # gzip member is treated as trailing garbage and dropped.
    while rest.startswith(gzip_magic):
        decoder = zlib.decompressobj(gzip_wbits)
        try:
            part = decoder.decompress(rest)
        except zlib.error:
            break
        if not decoder.eof:
            break
        parts.append(part)
        rest = decoder.unused_data

    return pd.read_csv(io.BytesIO(b''.join(parts)), **kwargs)

In [137]:
# Alternate titles: load only the id, title, region and language columns,
# read the literal "\N" marker as missing, and index the frame by titleId.
akas = read_csv(
    'title.akas.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['titleId', 'title', 'region', 'language'],
    dtype=dict(title='str', region='str', language='str'),
)
akas = akas.set_index('titleId')

In [138]:
# Keep only the rows whose language is 'hi' and whose region is 'IN'.
akas = akas.loc[akas['language'].eq('hi') & akas['region'].eq('IN')]
print(akas.shape)
akas.head(5)

(3690568, 3)


Unnamed: 0_level_0,title,region,language
titleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000302,"गोभी की परी, या बच्चों का जन्म",IN,hi
tt0011965,Dharma Vijay,IN,hi
tt0013568,Sadhu Aur Shaitaan,IN,hi
tt0014142,The Hunchback of Notre Dame,IN,hi
tt0015324,Sherlock Jr.,IN,hi


In [139]:
# Title metadata: type, original title and start year, indexed by tconst.
# startYear uses the nullable Int64 dtype so missing years do not force floats.
basics = read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['tconst', 'titleType', 'originalTitle', 'startYear'],
    dtype=dict(titleType='str', originalTitle='str', startYear='Int64'),
)
basics = basics.set_index('tconst')

In [140]:
# Discard every title row that has a missing value in any loaded column.
basics = basics.dropna(axis=0, how='any')

In [141]:
# Movies that started in 2005 or later and that also have a row in the filtered akas table.
# NOTE(review): no later cell visible here reads `basics`; confirm whether the
# principals filter was meant to use it.
basics = basics.loc[
    basics['titleType'].eq('movie')
    & basics['startYear'].ge(2005)
    & basics.index.isin(akas.index)
]
basics.head(5)

Unnamed: 0_level_0,titleType,originalTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0120667,movie,Fantastic Four,2005
tt0121164,movie,Corpse Bride,2005
tt0121766,movie,Star Wars: Episode III - Revenge of the Sith,2005
tt0200465,movie,The Bank Job,2008
tt0205380,movie,Sanam Teri Kasam,2009


In [142]:
# Cast/crew credits: one row per (title, person, category).
principals = read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['tconst', 'nconst', 'category'],
    dtype=dict(tconst='str', nconst='str', category='category'),
)
# Keep 'actor' credits on titles present in the filtered akas table.
# NOTE(review): the title filter is akas.index (any title type / year), not the
# movie-only `basics` index - confirm this is intended.
principals = principals.loc[
    principals['category'].eq('actor') & principals['tconst'].isin(akas.index)
]
principals.head(5)

Unnamed: 0,tconst,nconst,category
82078,tt0011965,nm0760044,actor
82079,tt0011965,nm0665381,actor
82080,tt0011965,nm0557630,actor
82081,tt0011965,nm0694890,actor
82085,tt0011965,nm0304314,actor


In [143]:
# People: person id, display name and knownForTitles, indexed by nconst.
# No dtype override is passed: birthYear is not among the loaded columns, so a
# dtype entry for it would only mislead the reader.
name = read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['nconst', 'primaryName', 'knownForTitles'],
).set_index('nconst')
# Keep only the people who appear in the filtered principals table.
name = name[name.index.isin(principals.nconst)]
name.head()

Unnamed: 0_level_0,primaryName,knownForTitles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1
nm0000001,Fred Astaire,"tt0072308,tt0050419,tt0031983,tt0053137"
nm0000004,John Belushi,"tt0078723,tt0080455,tt0077975,tt0072562"
nm0000007,Humphrey Bogart,"tt0037382,tt0042593,tt0043265,tt0034583"
nm0000008,Marlon Brando,"tt0047296,tt0068646,tt0070849,tt0078788"
nm0000009,Richard Burton,"tt0061184,tt0057877,tt0059749,tt0087803"


In [144]:
from scipy.sparse import csr_matrix

In [145]:
# Source table for the co-appearance graph; this is the same object as
# `principals`, not a copy.
graph=principals
# Minimum number of rows an nconst must have in `graph` to be kept.
min_acted=45
# Minimum shared-title count a pair must reach to be kept.
min_pairings=30

In [146]:
# Count the rows per person and keep the people with at least `min_acted` rows.
# (`p` is not built here: the next cell creates it from `top_actors`, so
# copying and encoding the full graph first would be discarded work.)
name_freq = graph['nconst'].value_counts()
top_names = name_freq[name_freq >= min_acted]
# All credit rows that belong to those frequently credited people.
top_actors = graph[graph['nconst'].isin(top_names.index)]

In [147]:
# Encode titles and people as categoricals; their integer codes become the
# sparse-matrix coordinates (rows = titles, columns = people).
p = top_actors.assign(
    title=top_actors['tconst'].astype('category'),
    name=top_actors['nconst'].astype('category'),
)
row = p['title'].cat.codes.to_numpy()
col = p['name'].cat.codes.to_numpy()
# One unit of weight per credit row.
data = np.ones(p.shape[0], dtype=int)

In [148]:
# Title x person incidence matrix built from the categorical codes.
matrix = csr_matrix((data, (row, col)))
# (person x title) times (title x person): entry [i, j] sums the products of
# person i's and person j's weights over all titles.
square = matrix.transpose() @ matrix
# A person paired with themselves is not an edge.
square.setdiag(0)
# COO format exposes the .row / .col / .data arrays used to build the pair table.
square = square.tocoo()

In [149]:
# One record per stored matrix entry: the two person codes and their count.
# `square` is a matrix multiplied by its own transpose, so each unordered pair
# shows up twice, as (i, j) and as (j, i).
pairs = pd.DataFrame(dict(row=square.row, col=square.col, n=square.data))
# Keep the pairs that reach the threshold; the fresh 0..n-1 index is what the
# column-wise concat in lookup() lines up against.
pairs = pairs.loc[pairs['n'] >= min_pairings].reset_index(drop=True)
# Person details ordered like the category codes, so code k is row k of `cat`.
cat = name.reindex(index=p['name'].cat.categories)

In [150]:
# Show the person lookup table (one row per category of p['name']).
cat

Unnamed: 0,primaryName,knownForTitles
nm0000168,Samuel L. Jackson,"tt0417148,tt4154664,tt3460252,tt0110912"
nm0000246,Bruce Willis,"tt0095016,tt0088571,tt0110912,tt0167404"
nm0000293,Sean Bean,"tt1181791,tt0944947,tt0167261,tt0120737"
nm0000297,Tom Berenger,"tt0091763,tt0097815,tt1375666,tt0085244"
nm0000334,Chow Yun-Fat,"tt1098327,tt0120008,tt0190332,tt0473444"
...,...,...
nm9971729,Ashish Dixit,"tt5465370,tt8832228,tt8649832,tt8273526"
nm9975198,Jie Su,"tt8481368,tt8623788,tt8481302,tt8481032"
nm9976178,Sebastián Carvajal,"tt9327242,tt9701670,tt14972766,tt11434706"
nm9982695,Connor Leong,"tt10773262,tt15056268,tt8690890,tt10904418"


In [151]:
# Show the filtered pair table: matrix coordinates (row, col) and count n.
pairs

Unnamed: 0,row,col,n
0,3531,4,30
1,3008,4,30
2,4659,4,31
3,445,5,51
4,3074,6,54
...,...,...,...
37529,5653,11811,91
37530,11692,11811,48
37531,10430,11811,93
37532,11591,11812,31


In [152]:
def lookup(pairs, cat):
    """Replace the positional codes in ``pairs`` with the matching rows of ``cat``.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Columns ``row``, ``col`` and one count column (in that order). ``row``
        and ``col`` are integer positions into ``cat``. Any index is accepted;
        it is reset internally so the column-wise concat lines up row by row.
    cat : pandas.DataFrame
        Two-column lookup table addressed by position.

    Returns
    -------
    pandas.DataFrame
        Columns ``count, name1, year1, name2, year2`` sorted by ``count`` in
        descending order. The labels are fixed: ``year1``/``year2`` hold
        whatever the second column of ``cat`` contains (in this notebook that
        is ``knownForTitles``, not a year).
    """
    # concat(axis=1) aligns on the index, so every piece needs the same 0..n-1 index.
    pairs = pairs.reset_index(drop=True)
    first = cat.iloc[pairs['row'].to_numpy()].reset_index(drop=True)
    second = cat.iloc[pairs['col'].to_numpy()].reset_index(drop=True)

    merged = pd.concat([pairs, first, second], axis=1)
    merged = merged.drop(columns=['row', 'col'])
    merged.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return merged.sort_values('count', ascending=False)

In [153]:
# Resolve the person codes in `pairs` to their rows of `cat`, strongest pairs first.
ForKumu = lookup(pairs=pairs, cat=cat)
ForKumu

Unnamed: 0,count,name1,year1,name2,year2
23701,9946,Tito Sotto,"tt0494772,tt0474744,tt1320341,tt0155528",Marvic Valentin Castelo Sotto,"tt5830556,tt1782526,tt2529956,tt2104022"
17250,9946,Marvic Valentin Castelo Sotto,"tt5830556,tt1782526,tt2529956,tt2104022",Tito Sotto,"tt0494772,tt0474744,tt1320341,tt0155528"
29868,9944,Tito Sotto,"tt0494772,tt0474744,tt1320341,tt0155528",Joel de Leon,"tt0344642,tt0757914"
17249,9944,Joel de Leon,"tt0344642,tt0757914",Tito Sotto,"tt0494772,tt0474744,tt1320341,tt0155528"
23700,9944,Joel de Leon,"tt0344642,tt0757914",Marvic Valentin Castelo Sotto,"tt5830556,tt1782526,tt2529956,tt2104022"
...,...,...,...,...,...
17756,30,Marcin Krajewski,"tt8845294,tt0262975,tt0439389,tt7965876",Maciej Tomaszewski,"tt1808454,tt0330243,tt0095203,tt0439389"
33331,30,Amr Youssef,"tt3957098,tt1634509,tt5857914,tt4840206",Mohamed Mamdouh,"tt16311220,tt4003070,tt5263322,tt3461252"
33332,30,Ahmed Dawood,"tt10151260,tt4840206,tt5240732,tt20877986",Mohamed Mamdouh,"tt16311220,tt4003070,tt5263322,tt3461252"
17740,30,Peter Howell,"tt0066666,tt0375920,tt0423700,tt0079871",Charles 'Bud' Tingwell,"tt0080310,tt0057334,tt0277941,tt0055205"


In [154]:
# Reduce to an edge list: the two names and the count, relabelled From / To / Strength.
ForKumu = ForKumu.loc[:, ['name1', 'name2', 'count']].rename(
    columns=dict(name1='From', name2='To', count='Strength')
)
ForKumu

Unnamed: 0,From,To,Strength
23701,Tito Sotto,Marvic Valentin Castelo Sotto,9946
17250,Marvic Valentin Castelo Sotto,Tito Sotto,9946
29868,Tito Sotto,Joel de Leon,9944
17249,Joel de Leon,Tito Sotto,9944
23700,Joel de Leon,Marvic Valentin Castelo Sotto,9944
...,...,...,...
17756,Marcin Krajewski,Maciej Tomaszewski,30
33331,Amr Youssef,Mohamed Mamdouh,30
33332,Ahmed Dawood,Mohamed Mamdouh,30
17740,Peter Howell,Charles 'Bud' Tingwell,30


In [155]:
# Write the edge list to disk without the DataFrame index column.
ForKumu.to_csv(path_or_buf="pairs_6.csv", index=False)