In [16]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
import findspark

# NOTE(review): findspark.init() presumably makes the local Spark installation
# importable. pyspark is already imported in the cell above this one, so confirm
# the intended cell order on a fresh kernel (Restart & Run All).
findspark.init()

In [3]:
# Start (or reuse) the Spark session that every later cell reads from.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

## Carregando os DataFrames

In [4]:
# Load the tab-separated titles file, using its first row as the header.
# No schema is supplied, so every column is read as a string, and the '\N'
# placeholders visible in the previews below are kept as literal text.
df_titles = (
    spark.read
    .option('sep', '\t')
    .option('header', True)
    .csv('./data/title_basics.tsv')
)

In [5]:
# Load the tab-separated ratings file, using its first row as the header.
# No schema is supplied, so averageRating and numVotes are read as strings.
df_ratings = (
    spark.read
    .option('sep', '\t')
    .option('header', True)
    .csv('./data/title_ratings.tsv')
)

In [6]:
# Inspect the column names and types of the titles DataFrame.
df_titles.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [7]:
# Inspect the column names and types of the ratings DataFrame.
df_ratings.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)



In [8]:
# Question 3: build two small DataFrames with the same schema and stack them
# with union(), which appends the rows of df2 after the rows of df1.
df1 = spark.createDataFrame(
    data=[('Maria', 1), ('Joao', 2)],
    schema='nome STRING, id INTEGER',
)
df2 = spark.createDataFrame(
    data=[('Pedro', 3), ('Clara', 4)],
    schema='nome STRING, id INTEGER',
)
df1.union(df2).show()

+-----+---+
| nome| id|
+-----+---+
|Maria|  1|
| Joao|  2|
|Pedro|  3|
|Clara|  4|
+-----+---+



In [9]:
# Preview the first 5 rows of the ratings DataFrame.
df_ratings.show(5)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1809|
|tt0000002|          6.0|     233|
|tt0000003|          6.5|    1560|
|tt0000004|          6.1|     152|
|tt0000005|          6.2|    2383|
+---------+-------------+--------+
only showing top 5 rows



In [116]:
# Preview the first 10 rows of the titles DataFrame.
df_titles.show(10)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [11]:
# List every distinct value found in the titleType column.
(
    df_titles
    .select(f.col('titleType'))
    .distinct()
    .show()
)

+------------+
|   titleType|
+------------+
|    tvSeries|
|tvMiniSeries|
|     tvMovie|
|     tvPilot|
|   tvEpisode|
|       movie|
|   tvSpecial|
|       video|
|   videoGame|
|     tvShort|
|       short|
| radioSeries|
|radioEpisode|
+------------+



In [12]:

# Question 06: how many titles from 2015 are of type 'movie' or 'tvMovie'.
# startYear is a string column, so it is cast to an integer before comparing.
(
    df_titles
    .where(
        f.col('titleType').isin('movie', 'tvMovie')
        & (f.col('startYear').cast('Int') == 2015)
    )
    .count()
)

19987

In [13]:
# Question 07: number of titles per genre, most frequent genre first.
# 'genres' is a comma-separated string, so it is split and exploded into one
# row per (title, genre) pair before counting.
(
    df_titles
    .select(f.explode(f.split('genres', ',')).alias('genero'))
    .groupBy('genero')
    .count()
    .orderBy(f.col('count').desc())
    .show()
)

+-----------+-------+
|     genero|  count|
+-----------+-------+
|      Drama|2247995|
|     Comedy|1653725|
|      Short|1021850|
|  Talk-Show| 900198|
|Documentary| 764885|
|    Romance| 724729|
|         \N| 643012|
|     Family| 571470|
|       News| 524662|
| Reality-TV| 423455|
|  Animation| 406284|
|      Music| 394008|
|      Crime| 351447|
|     Action| 334580|
|  Adventure| 324325|
|  Game-Show| 252533|
|      Adult| 242704|
|      Sport| 178594|
|    Fantasy| 174119|
|    Mystery| 162448|
+-----------+-------+
only showing top 20 rows



In [14]:
# Question 08: mean rating per genre, best-rated genre first.
# Titles are inner-joined to their ratings on tconst, then exploded to one row
# per (title, genre) pair so each genre of a title contributes its rating.
(
    df_titles
    .join(df_ratings, on='tconst')
    .withColumn('genero', f.explode(f.split('genres', ',')))
    .groupBy('genero')
    .agg(f.avg('averageRating').alias('Nota_Media'))
    .orderBy(f.col('Nota_Media').desc())
    .show()
)

+-----------+------------------+
|     genero|        Nota_Media|
+-----------+------------------+
|    History| 7.353780102645086|
|Documentary| 7.240198535554575|
|  Biography|  7.17553191489362|
|    Mystery| 7.170086406897942|
|      Crime|7.1598428684859385|
|  Adventure| 7.107629703351738|
|    Fantasy| 7.095145650845386|
|  Animation| 7.089381171483224|
|    Western| 7.080683426568711|
|     Family|  7.07005492603448|
|      Drama| 7.040979155040203|
|        War|7.0091151344149205|
|     Action|7.0070981387478835|
|      Sport| 6.966792418526429|
|     Comedy|6.9600165509184135|
|      Music| 6.927469624015715|
| Reality-TV| 6.892611170895967|
|  Game-Show| 6.876828101904185|
|    Romance| 6.864016164703973|
|      Short| 6.791292438368555|
+-----------+------------------+
only showing top 20 rows



In [15]:
# Question 09: top 10 best-rated 'Adventure' video games released in 2020.
#
# averageRating is loaded as a STRING (see the ratings schema above). Taking
# max() and sorting on the raw string compares text lexicographically, which
# would rank '10.0' below '9.5'. The rating is therefore cast to double before
# aggregating so both the max and the descending sort are numeric.
(
    df_titles
    .join(df_ratings, 'tconst')
    # One row per (title, genre) pair so a single genre can be filtered on.
    .withColumn('genero', f.explode(f.split(f.col('genres'), ',')))
    .drop('genres')
    .filter(f.col('titleType') == 'videoGame')
    .filter(f.col('genero') == 'Adventure')
    # startYear is also a string column; compare it as an integer.
    .filter(f.col('startYear').cast('Int') == 2020)
    .groupBy('primaryTitle')
    .agg(f.max(f.col('averageRating').cast('double')).alias('Nota_Media'))
    .orderBy(f.desc('Nota_Media'))
    .show(10)
)

+--------------------+----------+
|        primaryTitle|Nota_Media|
+--------------------+----------+
|     Half-Life: Alyx|       9.5|
|   Ghost of Tsushima|       9.3|
|               Omori|       9.2|
|Ori and the Will ...|       9.1|
|Final Fantasy VII...|       9.1|
|Mega Man Zero/ZX ...|       8.9|
|There Is No Game:...|       8.9|
|Yakuza: Like a Dr...|       8.8|
|Xenoblade Chronic...|       8.8|
|       Demon's Souls|       8.8|
+--------------------+----------+
only showing top 10 rows



In [129]:
# Question 11: percentage of titles from 2018 that list 'Comedy' as a genre.
# After exploding the genres, each row is flagged 1 when its genre is 'Comedy';
# the sum of flags is divided by the number of distinct titles (tconst), since
# the explode yields several rows per title.
(
    df_titles
    .where(f.col('startYear').cast('Int') == 2018)
    .withColumn('genres', f.explode(f.split('genres', ',')))
    .withColumn('comedy', f.when(f.col('genres') == 'Comedy', 1).otherwise(0))
    .agg((f.sum('comedy') / f.countDistinct('tconst') * 100).alias('comedy'))
    .show(10)
)
   

+------------------+
|            comedy|
+------------------+
|19.592336989488967|
+------------------+



In [139]:
# Question 13: number of titles with no matching row in the ratings table.
# An anti join keeps only the left-side rows whose tconst has no match.
df_titles.join(df_ratings, on='tconst', how='left_anti').count()  # recorded result: 7021051

7021051

In [147]:
# Pergunta 14
def sqr_divide(value):
    """Return half of the square of ``value``.

    Args:
        value: A number, or ``None``.

    Returns:
        ``(value ** 2) / 2`` as a float, or ``None`` when ``value`` is ``None``.
        The ``None`` guard matters when this function is wrapped as a Spark
        UDF: null column values reach Python as ``None``, and squaring ``None``
        would raise ``TypeError`` and fail the whole job.
    """
    if value is None:
        return None

    return (value ** 2) / 2

# Wrap sqr_divide as a Spark UDF that returns a double.
sqr_divide_udf = f.udf(sqr_divide, returnType=DoubleType())

# Question 14: apply the UDF to every rating (cast from string to double first)
# and report the mean of the transformed values.
(
    df_ratings
    .select(
        sqr_divide_udf(f.col('averageRating').cast('Double')).alias('averageRating')
    )
    .agg(f.avg('averageRating').alias('averageRating'))
    .show(10)
)

+------------------+
|     averageRating|
+------------------+
|24.899137999842086|
+------------------+

