In [1]:
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.

In [2]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('explore_spark').config('spark.master', 'local').getOrCreate()

ps.set_option('display.max_rows', 10)

prev = spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")  # Keep its default value.
ps.set_option("compute.default_index_type", "distributed")  # Use default index prevent overhead.

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
%timeit ps.range(300000).to_pandas()

The slowest run took 4.53 times longer than the fastest. This could mean that an intermediate result is being cached.
146 ms ± 105 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Carregar dados dos ratings/votos dos títulos

In [3]:
# title_ratings = pd.read_csv('../data/title.ratings.tsv', sep='\t', low_memory=False)
title_ratings = ps.read_csv('../Data/title.ratings.tsv', sep='\t', inferSchema=True)

In [4]:
title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2007
1,tt0000002,5.8,270
2,tt0000003,6.5,1920
3,tt0000004,5.5,178
4,tt0000005,6.2,2695
5,tt0000006,5.0,183
6,tt0000007,5.4,843
7,tt0000008,5.4,2155
8,tt0000009,5.3,208
9,tt0000010,6.9,7360


## Carregar dados dos títulos
- somente aqueles que possuem rating
- somente filmes
- somente os que possuem algum gênero

In [5]:
title_basics = ps.read_csv('../Data/title.basics.tsv', sep='\t', inferSchema=True)

In [6]:
title_basics['titleType'].unique()

0        tvSeries
1    tvMiniSeries
2         tvMovie
3       tvEpisode
4           movie
5       tvSpecial
6           video
7       videoGame
8         tvShort
9           short
Name: titleType, dtype: object
Showing only the first 10

In [7]:
title_basics = title_basics.loc[title_basics['titleType'] == 'movie']

In [8]:
title_basics = title_basics.loc[title_basics['genres'] != '\\N']

In [9]:
title_basics = title_basics.to_pandas()
title_ratings = title_ratings.to_pandas()
print(type(title_basics))
print(type(title_ratings))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [10]:
title_basics = title_basics.loc[title_basics['tconst'].isin(title_ratings['tconst'])]

In [11]:
title_basics = ps.from_pandas(title_basics)
title_ratings = ps.from_pandas(title_ratings)
print(type(title_basics))
print(type(title_ratings))

<class 'pyspark.pandas.frame.DataFrame'>
<class 'pyspark.pandas.frame.DataFrame'>


In [12]:
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama
668,tt0000675,movie,Don Quijote,Don Quijote,0,1908,\N,\N,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,\N,120,"Adventure,Fantasy"
876,tt0000886,movie,"Hamlet, Prince of Denmark",Hamlet,0,1910,\N,\N,Drama
930,tt0000941,movie,Locura de amor,Locura de amor,0,1909,\N,45,Drama


## Concatena os ratings/votos com as informações dos titulos

In [13]:
movies = ps.merge(title_basics, title_ratings, on='tconst')

In [16]:
movies.drop(columns=['titleType', 'endYear'])

In [None]:
movies

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000009,Miss Jerry,Miss Jerry,1894,45,Romance,5.2,200
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography",6.0,791
2,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90,Drama,5.1,20
3,tt0000615,Robbery Under Arms,Robbery Under Arms,1907,\N,Drama,4.3,23
4,tt0000630,Hamlet,Amleto,1908,\N,Drama,2.9,25
...,...,...,...,...,...,...,...,...
271487,tt9916190,Safeguard,Safeguard,2020,95,"Action,Adventure,Thriller",3.6,238
271488,tt9916270,Il talento del calabrone,Il talento del calabrone,2020,84,Thriller,5.8,1365
271489,tt9916362,Coven,Akelarre,2020,92,"Drama,History",6.4,4921
271490,tt9916428,The Secret of China,Hong xing zhao yao Zhong guo,2019,\N,"Adventure,History,War",3.8,14


In [None]:
movies.describe()

Unnamed: 0,averageRating,numVotes
count,271492.0,271492.0
mean,6.180838,3692.816
std,1.359374,35187.59
min,1.0,5.0
25%,5.4,20.0
50%,6.3,66.0
75%,7.1,330.0
max,10.0,2651547.0


## Carregar dados dos atores principais/equipe dos títulos
- somente dos titulos que possuem rating
- somente filmes
- somente os que possuem algum gênero
- somente atores/atrizes

In [None]:
title_principals = ps.read_csv('../Data/title.principals.tsv', sep='\t', inferSchema=True)

In [None]:
title_principals = title_principals.loc[title_principals['tconst'].isin(movies['tconst'])]

In [None]:
title_principals['category'].unique()

array(['actor', 'actress', 'director', 'cinematographer', 'producer',
       'composer', 'editor', 'writer', 'production_designer', 'self',
       'archive_footage'], dtype=object)

In [None]:
roles_category = ['actress', 'actor']
# roles_category = ['actress', 'actor', 'director', 'writer', 'cinematographer', 'producer']
title_principals = title_principals.loc[title_principals['category'].isin(roles_category)]
title_principals.drop(columns=['category','job'], inplace=True)

In [None]:
title_principals

Unnamed: 0,tconst,ordering,nconst,characters
429803,tt0052077,10,nm0542611,"[""Eros""]"
429804,tt0052077,1,nm0906966,"[""Jeff Trent""]"
429805,tt0052077,2,nm0444740,"[""Colonel Edwards""]"
429806,tt0052077,3,nm0571958,"[""Paula Trent""]"
429807,tt0052077,4,nm0601146,"[""Lieutenant Harper""]"
...,...,...,...,...
50530642,tt9196192,4,nm10509268,"[""Jess""]"
52373645,tt9820556,1,nm4181215,"[""Noah""]"
52373646,tt9820556,2,nm0000246,"[""Clay""]"
52373647,tt9820556,3,nm0629697,"[""Chambers""]"


In [None]:
#filmes
len(title_principals['tconst'].unique())

176

In [None]:
#atores
len(title_principals['nconst'].unique())

644

## Carregar dados dos nomes dos atores

In [None]:
name_basics = ps.read_csv('../Data/name.basics.tsv', sep='\t', inferSchema=True)

In [None]:
title_principals = pd.merge(title_principals, name_basics, on='nconst')

In [None]:
title_principals

Unnamed: 0,tconst,ordering,nconst,characters,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt0052077,10,nm0542611,"[""Eros""]",Dudley Manlove,1914,1996,actor,"tt0047708,tt0055872,tt0050396,tt0052077"
1,tt0052077,1,nm0906966,"[""Jeff Trent""]",Gregory Walcott,1928,2015,"actor,producer,soundtrack","tt0077523,tt0072926,tt0052077,tt0072288"
2,tt0052077,2,nm0444740,"[""Colonel Edwards""]",Tom Keene,1896,1963,"actor,soundtrack","tt0024524,tt0052077,tt0026489,tt0025610"
3,tt0052077,3,nm0571958,"[""Paula Trent""]",Mona McKinnon,1929,1990,actress,"tt0047127,tt0048902,tt0052077,tt0046066"
4,tt0052077,4,nm0601146,"[""Lieutenant Harper""]",Duke Moore,1913,1976,actor,"tt0052077,tt0055452,tt0156843,tt0350184"
...,...,...,...,...,...,...,...,...,...
705,tt9196192,3,nm10509267,"[""Coumba""]",Esther Gohourou,\N,\N,,"tt22463404,tt9196192"
706,tt9196192,4,nm10509268,"[""Jess""]",Ilanah Cami-Goursolas,\N,\N,,"tt9196192,tt14118848"
707,tt9820556,1,nm4181215,"[""Noah""]",Cody Kearsley,1991,\N,"actor,producer,writer","tt8755226,tt5420376"
708,tt9820556,3,nm0629697,"[""Chambers""]",Rachel Nichols,1980,\N,"actress,producer","tt0796366,tt1046173,tt0384806,tt1954347"


## Salvar os dados selecionados

In [None]:
# movies
# title_principals