In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install scikit-network==0.24.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-network==0.24.0
  Downloading scikit_network-0.24.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 4.4 MB/s 
Installing collected packages: scikit-network
Successfully installed scikit-network-0.24.0


In [None]:
!rm *.tsv.gz
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
!ls -la

rm: cannot remove '*.tsv.gz': No such file or directory
total 1066168
drwxr-xr-x 1 root root      4096 Oct 18 14:17 .
drwxr-xr-x 1 root root      4096 Oct 18 14:16 ..
drwxr-xr-x 4 root root      4096 Oct 14 19:04 .config
-rw-r--r-- 1 root root 235014251 Oct 17 13:24 name.basics.tsv.gz
drwxr-xr-x 1 root root      4096 Oct 14 19:05 sample_data
-rw-r--r-- 1 root root 279878496 Oct 17 13:24 title.akas.tsv.gz
-rw-r--r-- 1 root root 162648231 Oct 18 13:23 title.basics.tsv.gz
-rw-r--r-- 1 root root 414184308 Oct 18 13:23 title.principals.tsv.gz


In [None]:
# load the titles
# Only the columns that are kept are parsed, and all of them as text: this
# matches the object dtypes the rest of the notebook works with and avoids
# pandas' per-chunk type guessing on the unused columns of this large file.
# NOTE(review): the IMDb dumps appear to be unquoted TSV; if titles containing
# '"' get mangled, try quoting=csv.QUOTE_NONE -- confirm against the data.
title = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    dtype=str,
).set_index('tconst')[['titleType', 'primaryTitle', 'startYear']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Peek at the first rows: tconst is the index, followed by the three kept columns.
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000001,short,Carmencita,1894
tt0000002,short,Le clown et ses chiens,1892
tt0000003,short,Pauvre Pierrot,1892
tt0000004,short,Un bon bock,1892
tt0000005,short,Blacksmith Scene,1893


In [None]:
# Inspect the column dtypes; note that startYear is an object column, not a number.
title.dtypes

titleType       object
primaryTitle    object
startYear       object
dtype: object

In [None]:
# filter by year: keep titles whose start year is after 1950.
# startYear is stored as text, so it is converted before comparing. Entries
# that are not numbers become NaN and are dropped, instead of slipping through
# a lexicographic string comparison (where e.g. '\\N' sorts above '1950').
title = title[pd.to_numeric(title['startYear'], errors='coerce') > 1950]

In [None]:
# Re-check the first rows after the year filter.
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0011801,movie,Tötet nicht mehr,2019
tt0013274,movie,Istoriya grazhdanskoy voyny,2021
tt0015414,movie,La tierra de los toros,2000
tt0015724,movie,Dama de noche,1993
tt0022064,movie,Lebbra bianca,1951


In [None]:
# load the cast of each film
# usecols restricts parsing to the three columns that are actually used, so
# the remaining columns of this large file are never materialised in memory.
cast = pd.read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    usecols=['tconst', 'nconst', 'category'],
)[['tconst', 'nconst', 'category']]
# only consider actors, not directors, composers etc.
cast = cast[cast['category'].isin(['actor', 'actress'])]
cast.head()

Unnamed: 0,tconst,nconst,category
11,tt0000005,nm0443482,actor
12,tt0000005,nm0653042,actor
16,tt0000007,nm0179163,actor
17,tt0000007,nm0183947,actor
21,tt0000008,nm0653028,actor


In [None]:
# Restrict the cast table to titles typed as 'movie' (no series, etc.).
movies = title.loc[title['titleType'].eq('movie')]
# keep a cast row only when its title id is one of the movie ids
cast = cast.loc[cast['tconst'].isin(movies.index)]
cast.head()

Unnamed: 0,tconst,nconst,category
80688,tt0011801,nm0459029,actor
80689,tt0011801,nm0681726,actor
80690,tt0011801,nm0692612,actress
80691,tt0011801,nm0726256,actor
80692,tt0011801,nm0776458,actor


In [None]:
# Region code of every title alias, as a Series indexed by titleId.
# A title can be listed under several regions, so index values repeat.
region = (
    pd.read_csv(
        'title.akas.tsv.gz',
        sep='\t',
        usecols=['titleId', 'region'],
        dtype={'titleId': 'str', 'region': 'str'},
    )
    .set_index('titleId')
    .loc[:, 'region']
)
# ten most frequent region codes
region.value_counts().head(n=10)

JP    4002434
FR    4001268
DE    3988545
IN    3932614
ES    3924355
IT    3904983
PT    3840735
\N    1865253
US    1360525
GB     422184
Name: region, dtype: int64

In [None]:
# Peek at the series: the same titleId appears once per region it is listed under.
region.head()

titleId
tt0000001    UA
tt0000001    DE
tt0000001    HU
tt0000001    GR
tt0000001    RU
Name: region, dtype: object

In [None]:
# Load people: primary name and birth year, indexed by nconst.
# usecols limits parsing to the columns that are kept; '\N' entries are read
# as missing values, which is why birthYear is parsed as float.
name = pd.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    usecols=['nconst', 'primaryName', 'birthYear'],
    na_values='\\N',
    dtype={'birthYear': float},
).set_index('nconst')[['primaryName', 'birthYear']]
name.head()

Unnamed: 0_level_0,primaryName,birthYear
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1
nm0000001,Fred Astaire,1899.0
nm0000002,Lauren Bacall,1924.0
nm0000003,Brigitte Bardot,1934.0
nm0000004,John Belushi,1949.0
nm0000005,Ingmar Bergman,1918.0


In [None]:
from scipy.sparse import csr_matrix

In [None]:
def get_pairs(lang=None, min_acted=25, min_pairings=1):
    '''
    Build actor co-appearance pairs from the notebook-level `cast` table.

    Parameters
    ----------
    lang : str or None
        Region code as it appears in the `region` series (e.g. 'IN', 'US').
        When given, only titles listed under that region are used. Despite
        the name, the value is matched against regions, not languages.
    min_acted : int
        Keep only actors that appear in at least this many films (counted
        after the optional region filter).
    min_pairings : int
        Keep only pairs that share at least this many films.

    Returns
    -------
    pairs : DataFrame with columns `row`, `col`, `n`
        `row` and `col` are integer positions into the second return value
        and `n` is the number of shared films. The relation is symmetric, so
        every pair is listed in both directions; an actor is never paired
        with itself.
    actors : DataFrame
        Rows of `name` for the kept actors, ordered so that position i
        corresponds to code i in `pairs`.

    Both results are empty when no actor passes the filters.
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    # Count each (film, actor) combination once; a repeated cast row would
    # otherwise inflate both the per-actor film count and the shared-film count.
    graph = graph.drop_duplicates(subset=['tconst', 'nconst'])

    films_per_actor = graph['nconst'].value_counts()
    frequent = films_per_actor[films_per_actor >= min_acted].index
    top_actors = graph[graph['nconst'].isin(frequent)]

    # Categorical codes give dense 0..k-1 integer ids for films and actors.
    title_cat = top_actors['tconst'].astype('category')
    name_cat = top_actors['nconst'].astype('category')
    actors = name.reindex(name_cat.cat.categories)

    if top_actors.empty:
        # Nothing to pair up; return an empty edge list rather than letting
        # the sparse-matrix constructor fail on zero-length index arrays.
        no_pairs = pd.DataFrame({
            'row': np.zeros(0, dtype='int32'),
            'col': np.zeros(0, dtype='int32'),
            'n': np.zeros(0, dtype='int'),
        })
        return no_pairs, actors

    # films x actors incidence matrix: 1 where the actor is in the film's cast
    incidence = csr_matrix(
        (
            np.ones(len(top_actors), dtype='int'),
            (title_cat.cat.codes.values, name_cat.cat.codes.values),
        ),
        shape=(len(title_cat.cat.categories), len(name_cat.cat.categories)),
    )
    # (actors x films) @ (films x actors): entry (i, j) = films shared by i and j
    together = (incidence.T @ incidence).tocoo()

    # Drop the diagonal (an actor with itself) and pairs below the threshold.
    keep = (together.row != together.col) & (together.data >= min_pairings)
    pairs = pd.DataFrame({
        'row': together.row[keep],
        'col': together.col[keep],
        'n': together.data[keep],
    })
    return pairs, actors

def lookup(pairs, cat):
    '''
    Turn an integer-coded pair list into a readable table.

    Parameters
    ----------
    pairs : DataFrame with columns `row`, `col`, `n`
        `row` and `col` are integer positions into `cat`.
    cat : DataFrame
        One row per actor with exactly two columns (name and birth year, in
        that order); rows are looked up by position, not by label.

    Returns
    -------
    DataFrame with columns `count`, `name1`, `year1`, `name2`, `year2`,
    sorted by `count` in descending order.
    '''
    # The pieces are glued side by side on their index, so give `pairs` a
    # clean 0..n-1 index first; otherwise a filtered subset of pairs would be
    # misaligned against the freshly re-indexed lookups.
    pairs = pairs.reset_index(drop=True)
    first = cat.iloc[pairs['row'].to_numpy()].reset_index(drop=True)
    second = cat.iloc[pairs['col'].to_numpy()].reset_index(drop=True)

    table = pd.concat([pairs.drop(columns=['row', 'col']), first, second], axis=1)
    table.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return table.sort_values('count', ascending=False)

In [None]:
# lang='IN' applies the region filter so that only INdian movies are used;
# min_acted=1 and min_pairings=1 keep every actor and every co-appearance.
pairs, cat = get_pairs(
    lang='IN',
    min_acted=1,
    min_pairings=1,
)

In [None]:
# Swap the integer actor codes for names/years to get a readable edge list,
# sorted by the number of shared films.
ForKumu = lookup(pairs, cat)

In [None]:
# Keep the two actor names and the shared-film count under the
# From / To / Strength headers used for the Kumu upload.
# Renaming comes first (labels that are absent are simply ignored), so the
# cell can be re-run without failing on the already-renamed columns.
ForKumu = ForKumu.rename(columns={'name1': 'From',
                                  'name2': 'To',
                                  'count': 'Strength'})[['From', 'To', 'Strength']]


In [None]:
# Strongest connections first; each pair shows up in both directions.
ForKumu.head()

Unnamed: 0,From,To,Strength
49374,Bahadur,Adoor Bhasi,172
71837,Adoor Bhasi,Bahadur,172
161691,Adoor Bhasi,Jayabharati,125
49363,Jayabharati,Adoor Bhasi,125
49354,Prem Nazir,Adoor Bhasi,124


In [None]:
# (rows, columns) of the edge list.
# If the kumu website crashes when you try to upload this massive dataset,
# try uploading a sample of it, like the first 10,000 rows.
ForKumu.shape

(862080, 3)