In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import col, asc,desc

In [3]:
#Start session Context com nome filme
spark = SparkSession.builder.appName("EDC").getOrCreate()

In [4]:
%%time
df_titles = (
    spark
    .read
    .format("csv")
    .options(header=True, sep="\t", encoding="latin1")
    .load("title_basics.tsv")
 )


Wall time: 5.45 s


In [5]:
df_titles.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [6]:
df_titles.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [7]:
df_titles.count()

8203690

In [9]:
%%time
#Quantos filmes (incluindo os da televisão)  no ano de 2015 por tipo?  
consulta1 = (df_titles
             .filter((df_titles.startYear == '2015'))
             .groupby('titleType')
             .agg( f.count('titleType') ))
consulta1.show()

+------------+----------------+
|   titleType|count(titleType)|
+------------+----------------+
|    tvSeries|           10171|
|tvMiniSeries|            2288|
|     tvMovie|            3558|
|   tvEpisode|          263239|
|       movie|           16429|
|   tvSpecial|            1275|
|       video|           13314|
|   videoGame|            1190|
|     tvShort|             642|
|       short|           45948|
+------------+----------------+

Wall time: 3.45 s


In [10]:
%%time
#Quantos filmes (incluindo os da televisão) foram lançados no ano de 2015?  
consulta2 = df_titles.filter((df_titles.startYear == '2015')&((df_titles.titleType == 'tvMovie') 
                                                              |(df_titles.titleType == 'movie')))
consulta2.count()

Wall time: 2.41 s


19987

In [11]:
%%time
#qual o gênero de títulos mais frequente? 
consulta3 = (df_titles
             .groupby('genres')
             .agg( f.count('genres').alias('qtde') )
             .sort(col('qtde').desc()) )
consulta3.show()

+-----------------+------+
|           genres|  qtde|
+-----------------+------+
|            Drama|880649|
|               \N|643012|
|           Comedy|568956|
|        Talk-Show|467788|
|      Documentary|392359|
|    Drama,Romance|354735|
|             News|266279|
|       Reality-TV|251369|
|            Adult|221116|
|            Short|169026|
|      Drama,Short|161288|
|           Family|136195|
|   News,Talk-Show|135690|
|     Comedy,Short|132338|
|Documentary,Short|125081|
|        Game-Show|122741|
|      Music,Short| 97140|
|          Romance| 90742|
|            Music| 88697|
|            Sport| 88461|
+-----------------+------+
only showing top 20 rows

Wall time: 5.68 s


In [12]:
%%time
df_rating = (
    spark
    .read
    .format("csv")
    .options(header=True, sep="\t", encoding="latin1")
    .load("title_ratings.tsv")
 )

Wall time: 188 ms


In [36]:
df_rating.show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1872|
|tt0000002|          5.9|     247|
|tt0000003|          6.5|    1646|
|tt0000004|          5.8|     160|
|tt0000005|          6.2|    2474|
|tt0000006|          5.2|     164|
|tt0000007|          5.5|     770|
|tt0000008|          5.4|    2014|
|tt0000009|          5.4|     193|
|tt0000010|          6.9|    6764|
|tt0000011|          5.3|     343|
|tt0000012|          7.4|   11616|
|tt0000013|          5.7|    1795|
|tt0000014|          7.1|    5211|
|tt0000015|          6.2|     991|
|tt0000016|          5.9|    1403|
|tt0000017|          4.6|     305|
|tt0000018|          5.3|     561|
|tt0000019|          5.2|      30|
|tt0000020|          4.8|     332|
+---------+-------------+--------+
only showing top 20 rows



In [13]:
df_rating.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)



In [14]:
consulta4 = df_titles.join(df_rating,df_titles.tconst == df_rating.tconst,"outer")
consulta4.count()

8203690

In [48]:
consulta4.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+---------+-------------+--------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|   tconst|averageRating|numVotes|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+---------+-------------+--------+
|tt0000658|    short|The Puppet's Nigh...|Le cauchemar de F...|      0|     1908|     \N|             2|     Animation,Short|tt0000658|          6.5|     227|
|tt0000839|    short|  The Curse of Money|  The Curse of Money|      0|     1909|     \N|            \N|         Drama,Short|     null|         null|    null|
|tt0001170|    short|A Cowboy's Vindic...|A Cowboy's Vindic...|      0|     1910|     \N|            \N|       Short,Western|     null|         null|    null|
|tt0001581|    short|    A Devoted Friend|    

In [15]:
%%time
#Qual o gênero com a melhor nota média de títulos?
consulta5 = (consulta4
             .groupby('genres')
             .agg( f.count('genres').alias('qtde') ,
                   f.mean('averageRating').alias('AVG'))
             .sort(col('AVG').desc()) )
consulta5.show()

+--------------------+----+-----------------+
|              genres|qtde|              AVG|
+--------------------+----+-----------------+
|Comedy,History,We...|   8|              9.8|
|Biography,Comedy,...|  49|              9.8|
|News,Reality-TV,S...| 116|              9.7|
|Music,Musical,Tal...| 348|9.409756097560974|
|Adventure,Music,M...|  25|9.406666666666666|
|Comedy,Game-Show,...| 211|9.290476190476193|
|Biography,Crime,R...|   6|             9.25|
|Biography,Reality...| 215|9.209999999999999|
|Action,Reality-TV...|  17|              9.2|
|Comedy,Fantasy,Ga...|  42|              9.2|
|Horror,News,Talk-...|  13|              9.2|
|           Music,War|  17|              9.2|
|Mystery,Sci-Fi,Ta...|  20|              9.1|
|   Fantasy,Talk-Show|  74|              9.1|
|Game-Show,History...|  18|              9.0|
|        History,News| 482|8.999999999999998|
|   Drama,News,Sci-Fi|   3|              8.9|
|    Music,News,Short|  13|              8.9|
|History,Horror,Ro...|   4|       

In [16]:
#Qual o vídeo game do gênero aventura mais bem avaliado em 2020? 
consulta6 = (consulta4
             .filter((consulta4.titleType == 'videoGame')&(consulta4.startYear == '2020')
                     &(consulta4.genres.rlike ('.*Adventure')))
             .select('primaryTitle','originalTitle','averageRating','genres')
             .sort(col('averageRating').desc()) )
consulta6.show()

+--------------------+--------------------+-------------+--------------------+
|        primaryTitle|       originalTitle|averageRating|              genres|
+--------------------+--------------------+-------------+--------------------+
|     Half-Life: Alyx|     Half-Life: Alyx|          9.5|Action,Adventure,...|
|   Ghost of Tsushima|   Ghost of Tsushima|          9.3|Action,Adventure,...|
|               Omori|               Omori|          9.2|Adventure,Drama,F...|
|Final Fantasy VII...|Final Fantasy VII...|          9.1|Action,Adventure,...|
|Ori and the Will ...|Ori and the Will ...|          9.1|   Adventure,Fantasy|
|Mega Man Zero/ZX ...|Mega Man Zero/ZX ...|          8.9|Action,Adventure,...|
|There Is No Game:...|There Is No Game:...|          8.9|    Adventure,Comedy|
|       Demon's Souls|       Demon's Souls|          8.8|Action,Adventure,...|
|Xenoblade Chronic...|Xenoblade Chronic...|          8.8|Action,Adventure,...|
|Yakuza: Like a Dr...|Ryu ga gotoku 7: ...|         

In [19]:
#Deseja-se utilizar um join para retornar somente as linhas referentes a títulos que estão sem nota, 
#isto é, não aparecem no df_ratings.
consulta7 = df_titles.join(df_rating, 'tconst', 'anti')

In [20]:
consulta7.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000180|    short|  Le chemin de croix|  Le chemin de croix|      0|     1898|     \N|            \N|               Short|
|tt0000185|    short|La crÃ¨che Ã  Bet...|La crÃ¨che Ã  Bet...|      0|     1898|     \N|            \N|               Short|
|tt0000189|    short|             Dorotea|             Dorotea|      0|     1898|     \N|            \N|               Short|
|tt0000191|    short|DÃ©mÃ©nagement Ã ...|DÃ©mÃ©nagement Ã ...|      0|     1898|     \N|            \N|               Short|
|tt0000193|    short|L'entrÃ©e Ã  JÃ©r...|L'entrÃ©e Ã  JÃ©r...|      0|     1898|     \N|            \N|              

In [21]:
#sem nota
consulta7.count()

7021051