In [52]:
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.

In [53]:
# import os
# os.environ['HADOOP_HOME'] = 'Caminho/para/o/Hadoop'
# os.environ['JAVA_HOME'] = 'Caminho/para/o/Java'

In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('explore_spark') \
    .config('spark.master', 'local') \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.instances", "3") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()


# # Configuração manual das bibliotecas do Hadoop no classpath
# spark = SparkSession.builder \
#     .appName("NomeDaAplicacao") \
#     .config("spark.driver.extraClassPath", "Caminho/para/as/bibliotecas/hadoop.dll") \
#     .getOrCreate()

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
# %timeit ps.range(300000).to_pandas()

## Carregar dados dos ratings/votos dos títulos

In [55]:
title_ratings = spark.read.load('../Data/title.ratings.tsv', format='csv', sep='\t', inferSchema=True, header=True)

In [56]:
title_ratings.show(2)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    2007|
|tt0000002|          5.8|     270|
+---------+-------------+--------+
only showing top 2 rows



## Carregar dados dos títulos e concatena os ratings/votos com as informações dos titulos
- somente aqueles que possuem rating
- somente filmes
- somente os que possuem algum gênero

In [57]:
title_basics = spark.read.load('../Data/title.basics.tsv', format='csv', sep='\t', inferSchema=True, header=True)

In [58]:
# title_basics.select(col('titleType')).distinct().show()
title_basics.groupBy('titleType').count().orderBy(col('count').desc()).show()

+------------+-------+
|   titleType|  count|
+------------+-------+
|   tvEpisode|7909714|
|       short| 964958|
|       movie| 664652|
|       video| 283511|
|    tvSeries| 253097|
|     tvMovie| 143596|
|tvMiniSeries|  51367|
|   tvSpecial|  44591|
|   videoGame|  36627|
|     tvShort|  10084|
|     tvPilot|      1|
+------------+-------+



In [59]:
title_basics = title_basics.filter(title_basics['titleType'] == 'movie')
title_basics = title_basics.filter(title_basics['genres'] != '\\N')

In [60]:
# Realiza um join usando 'tconst' como chave e 'inner' como tipo de join
# https://sparkbyexamples.com/pyspark/pyspark-join-explained-with-examples/
title_basics_filtered = title_basics.join(title_ratings, ['tconst'], 'inner').drop('titleType', 'endYear')

In [61]:
title_basics.count()

590391

In [62]:
title_basics_filtered.count()

290648

In [63]:
title_basics_filtered.groupBy('isAdult').count().show()

+-------+------+
|isAdult| count|
+-------+------+
|      0|286094|
|      1|  4554|
+-------+------+



In [64]:
# Criação da nova coluna com a condição de igualdade
title_basics_filtered = title_basics_filtered.withColumn('popularIsOriginal', when(title_basics_filtered['primaryTitle'] == title_basics_filtered['originalTitle'], 1).otherwise(0))

In [65]:
# Filtrar as linhas onde 'runtimeMinutes' é igual a '\N' e contar o número de ocorrências
title_basics_filtered.filter(title_basics_filtered['runtimeMinutes'] == '\\N').count()

25405

In [66]:
title_basics_filtered.show()

+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+
|   tconst|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|popularIsOriginal|
+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+
|tt0000009|          Miss Jerry|          Miss Jerry|      0|     1894|            45|             Romance|          5.3|     208|                1|
|tt0000147|The Corbett-Fitzs...|The Corbett-Fitzs...|      0|     1897|           100|Documentary,News,...|          5.3|     485|                1|
|tt0000574|The Story of the ...|The Story of the ...|      0|     1906|            70|Action,Adventure,...|          6.0|     855|                1|
|tt0000591|    The Prodigal Son|   L'enfant prodigue|      0|     1907|            90|               Drama

In [67]:
title_basics_filtered.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)
 |-- popularIsOriginal: integer (nullable = false)



## dumificar 'genres'

In [68]:
# Dividir a coluna 'genres' por vírgulas e expandir em colunas
genres_split = title_basics_filtered.withColumn('genres', split('genres', ','))

# Usar a função explode() para criar múltiplas linhas para cada gênero
genres_exploded = genres_split.withColumn('genre', explode('genres'))

# Criar dummies para cada gênero usando pivot()
dummies = genres_exploded.groupBy('tconst').pivot('genre').agg(lit(1)).fillna(0)

# Mostrar o DataFrame resultante com as colunas de dummies para gêneros
dummies.show(5)

+---------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+---------+--------+---+-------+
|   tconst|Action|Adult|Adventure|Animation|Biography|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|History|Horror|Music|Musical|Mystery|News|Reality-TV|Romance|Sci-Fi|Sport|Talk-Show|Thriller|War|Western|
+---------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+---------+--------+---+-------+
|tt0000009|     0|    0|        0|        0|        0|     0|    0|          0|    0|     0|      0|        0|      0|     0|    0|      0|      0|   0|         0|      1|     0|    0|        0|       0|  0|      0|
|tt0000147|     0|    0|        0|        0|        0|     0|    0|          1|    0|     0|      0|        0|      0|     0|    0|     

In [69]:
dummies.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- Action: integer (nullable = true)
 |-- Adult: integer (nullable = true)
 |-- Adventure: integer (nullable = true)
 |-- Animation: integer (nullable = true)
 |-- Biography: integer (nullable = true)
 |-- Comedy: integer (nullable = true)
 |-- Crime: integer (nullable = true)
 |-- Documentary: integer (nullable = true)
 |-- Drama: integer (nullable = true)
 |-- Family: integer (nullable = true)
 |-- Fantasy: integer (nullable = true)
 |-- Film-Noir: integer (nullable = true)
 |-- History: integer (nullable = true)
 |-- Horror: integer (nullable = true)
 |-- Music: integer (nullable = true)
 |-- Musical: integer (nullable = true)
 |-- Mystery: integer (nullable = true)
 |-- News: integer (nullable = true)
 |-- Reality-TV: integer (nullable = true)
 |-- Romance: integer (nullable = true)
 |-- Sci-Fi: integer (nullable = true)
 |-- Sport: integer (nullable = true)
 |-- Talk-Show: integer (nullable = true)
 |-- Thriller: integer (nullable = tru

## Carregar dados dos atores principais/equipe dos títulos
- somente dos titulos que possuem rating
- somente filmes
- somente os que possuem algum gênero
- somente atores/atrizes

In [70]:
title_principals = spark.read.load('../Data/title.principals.tsv', format='csv', sep='\t', inferSchema=True, header=True)

In [71]:
title_principals.count()

59388712

In [72]:
# Obter os 'tconst' do DataFrame movies
tconst_movies = title_basics_filtered.select('tconst')

# Filtrar o DataFrame title_principals para manter apenas as linhas em que 'tconst' está presente em movies
title_principals_filtered = title_principals.join(tconst_movies, 'tconst', 'inner').drop('job', 'ordering', 'characters')

In [73]:
title_principals_filtered.count()

2601778

In [74]:
title_principals_filtered.show()

+---------+---------+---------------+
|   tconst|   nconst|       category|
+---------+---------+---------------+
|tt0000941|nm0034453|          actor|
|tt0000941|nm0140054|          actor|
|tt0000941|nm0243918|          actor|
|tt0000941|nm0294022|        actress|
|tt0000941|nm0063413|       director|
|tt0000941|nm0550220|       director|
|tt0000941|nm0848502|         writer|
|tt0002026|nm0115982|          actor|
|tt0002026|nm0418086|        actress|
|tt0002026|nm0027708|          actor|
|tt0002026|nm0526167|        actress|
|tt0002026|nm0959066|          actor|
|tt0002026|nm0259235|       director|
|tt0002026|nm0084804|       producer|
|tt0002026|nm0064944|          actor|
|tt0002026|nm0348052|        actress|
|tt0002026|nm0959065|        actress|
|tt0002423|nm0913298|cinematographer|
|tt0002423|nm0624470|        actress|
|tt0002423|nm0417837|          actor|
+---------+---------+---------------+
only showing top 20 rows



In [75]:
title_principals_filtered.select('category').distinct().show()

+-------------------+
|           category|
+-------------------+
|            actress|
|           producer|
|             writer|
|           composer|
|           director|
|               self|
|              actor|
|             editor|
|    cinematographer|
|      archive_sound|
|production_designer|
|    archive_footage|
+-------------------+



In [76]:
# separar as categorias
actors = ['actress', 'actor', 'self']
producers = ['writer', 'director', 'producer']
crew = ['composer', 'editor', 'cinematographer', 'archive_sound', 'production_designer', 'archive_footage']

title_principals_filtered_actors = title_principals_filtered.filter(col('category').isin(actors))
title_principals_filtered_producers = title_principals_filtered.filter(col('category').isin(producers))
title_principals_filtered_crew = title_principals_filtered.filter(col('category').isin(crew))

In [77]:
title_principals_filtered_actors.count()

1183790

In [78]:
title_principals_filtered_producers.count()

856025

In [79]:
title_principals_filtered_crew.count()

561963

In [80]:
title_principals_filtered_actors.show(2)

+---------+---------+--------+
|   tconst|   nconst|category|
+---------+---------+--------+
|tt0000630|nm0624446| actress|
|tt0000941|nm0034453|   actor|
+---------+---------+--------+
only showing top 2 rows



## Fazer a media de cada grupo pros filmes

In [81]:
from pyspark.sql.functions import avg

In [82]:
# Etapa 1: Obter todos os títulos associados aos nomes presentes no dataset title_principals_filtered_actors
titles_for_names = title_principals_filtered_producers.select('nconst', 'tconst').distinct()

### Actors

In [83]:
# Etapa 2: Calcular as médias de 'averageRating' e 'numVotes' para esses títulos
avg_ratings_votes_actor = titles_for_names.join(title_basics_filtered, 'tconst', 'inner') \
    .groupBy('nconst') \
    .agg(avg('averageRating').alias('avgRating'), avg('numVotes').alias('avgNumVotes'))

# Etapa 3: Juntar esses nomes com as médias já calculadas
joined_data = avg_ratings_votes_actor.join(titles_for_names, 'nconst', 'inner')

joined_data = joined_data.join(title_principals_filtered_actors.select('nconst'), 'nconst', 'inner')

# Etapa 4: Calcular as médias dos filmes com base nos nomes associados
avg_ratings_per_film_actor = joined_data.groupBy('tconst') \
    .agg(avg('avgRating').alias('avgRatingPerFilm'), avg('avgNumVotes').alias('avgNumVotesPerFilm'))

In [84]:
joined_data.filter(joined_data['tconst'] == 'tt0001184').show()

+------+---------+-----------+------+
|nconst|avgRating|avgNumVotes|tconst|
+------+---------+-----------+------+
+------+---------+-----------+------+



In [85]:
avg_ratings_per_film_actor.filter(avg_ratings_per_film_actor['tconst'] == 'tt0001184').show()

+------+----------------+------------------+
|tconst|avgRatingPerFilm|avgNumVotesPerFilm|
+------+----------------+------------------+
+------+----------------+------------------+



In [86]:
test = titles_for_names.join(title_basics_filtered, 'tconst', 'inner')

test.filter(test['tconst'] == 'tt0001184').show()

+---------+---------+--------------------+--------------------+-------+---------+--------------+---------------+-------------+--------+-----------------+
|   tconst|   nconst|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes|         genres|averageRating|numVotes|popularIsOriginal|
+---------+---------+--------------------+--------------------+-------+---------+--------------+---------------+-------------+--------+-----------------+
|tt0001184|nm0063413|Don Juan de Serra...|Don Juan de Serra...|      0|     1910|            58|Adventure,Drama|          3.8|      21|                1|
|tt0001184|nm0550220|Don Juan de Serra...|Don Juan de Serra...|      0|     1910|            58|Adventure,Drama|          3.8|      21|                1|
|tt0001184|nm0049370|Don Juan de Serra...|Don Juan de Serra...|      0|     1910|            58|Adventure,Drama|          3.8|      21|                1|
+---------+---------+--------------------+--------------------+-------+-----

In [87]:
avg_ratings_per_film_actor.filter(avg_ratings_per_film_actor['tconst'] == 'tt0001184').show()

+------+----------------+------------------+
|tconst|avgRatingPerFilm|avgNumVotesPerFilm|
+------+----------------+------------------+
+------+----------------+------------------+



In [88]:
title_basics_filtered_final = title_basics_filtered.join(avg_ratings_per_film_actor, 'tconst', 'inner')

title_basics_filtered_final = title_basics_filtered_final.withColumnRenamed("avgRatingPerFilm", "avgRatingPerFilmActors")
title_basics_filtered_final = title_basics_filtered_final.withColumnRenamed("avgNumVotesPerFilm", "avgNumVotesPerFilmActors")

title_basics_filtered_final.show(5)

+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+
|   tconst|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|popularIsOriginal|avgRatingPerFilmActors|avgNumVotesPerFilmActors|
+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+
|tt0000147|The Corbett-Fitzs...|The Corbett-Fitzs...|      0|     1897|           100|Documentary,News,...|          5.3|     485|                1|     6.312499999999999|                  865.75|
|tt0000630|              Hamlet|              Amleto|      0|     1908|            \N|               Drama|          2.9|      27|                0|                   4.7|                    25.0|
|tt0001028|    

### Producers

In [89]:
# Etapa 2: Calcular as médias de 'averageRating' e 'numVotes' para esses títulos
avg_ratings_votes_producer = titles_for_names.join(title_basics_filtered, 'tconst', 'inner') \
    .groupBy('nconst') \
    .agg(avg('averageRating').alias('avgRating'), avg('numVotes').alias('avgNumVotes'))

# Etapa 3: Juntar esses nomes com as médias já calculadas
joined_data = avg_ratings_votes_producer.join(titles_for_names, 'nconst', 'inner')

joined_data = joined_data.join(title_principals_filtered_producers.select('nconst'), 'nconst', 'inner')

# Etapa 4: Calcular as médias dos filmes com base nos nomes associados
avg_ratings_per_film_producer = joined_data.groupBy('tconst') \
    .agg(avg('avgRating').alias('avgRatingPerFilm'), avg('avgNumVotes').alias('avgNumVotesPerFilm'))

In [90]:
joined_data.filter(joined_data['tconst'] == 'tt0001184').show()

+---------+------------------+-----------+---------+
|   nconst|         avgRating|avgNumVotes|   tconst|
+---------+------------------+-----------+---------+
|nm0049370|               4.9|       31.0|tt0001184|
|nm0049370|               4.9|       31.0|tt0001184|
|nm0063413| 4.766666666666667|       24.0|tt0001184|
|nm0063413| 4.766666666666667|       24.0|tt0001184|
|nm0063413| 4.766666666666667|       24.0|tt0001184|
|nm0550220|3.9333333333333336|       21.0|tt0001184|
|nm0550220|3.9333333333333336|       21.0|tt0001184|
|nm0550220|3.9333333333333336|       21.0|tt0001184|
+---------+------------------+-----------+---------+



In [91]:
avg_ratings_per_film_actor.filter(avg_ratings_per_film_actor['tconst'] == 'tt0001184').show()

+------+----------------+------------------+
|tconst|avgRatingPerFilm|avgNumVotesPerFilm|
+------+----------------+------------------+
+------+----------------+------------------+



In [92]:
title_basics_filtered_final = title_basics_filtered_final.join(avg_ratings_per_film_producer, 'tconst', 'inner')

title_basics_filtered_final = title_basics_filtered_final.withColumnRenamed("avgRatingPerFilm", "avgRatingPerFilmProducers")
title_basics_filtered_final = title_basics_filtered_final.withColumnRenamed("avgNumVotesPerFilm", "avgNumVotesPerFilmProducers")

title_basics_filtered_final.show(5)

+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+-------------------------+---------------------------+
|   tconst|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|popularIsOriginal|avgRatingPerFilmActors|avgNumVotesPerFilmActors|avgRatingPerFilmProducers|avgNumVotesPerFilmProducers|
+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+-------------------------+---------------------------+
|tt0000147|The Corbett-Fitzs...|The Corbett-Fitzs...|      0|     1897|           100|Documentary,News,...|          5.3|     485|                1|     6.312499999999999|                  865.75|        6.199999999999999|          823.44444444444

### Crew

In [93]:
# Etapa 2: Calcular as médias de 'averageRating' e 'numVotes' para esses títulos
avg_ratings_votes_crew = titles_for_names.join(title_basics_filtered, 'tconst', 'inner') \
    .groupBy('nconst') \
    .agg(avg('averageRating').alias('avgRating'), avg('numVotes').alias('avgNumVotes'))

# Etapa 3: Juntar esses nomes com as médias já calculadas
joined_data = avg_ratings_votes_crew.join(titles_for_names, 'nconst', 'inner')

joined_data = joined_data.join(title_principals_filtered_crew.select('nconst'), 'nconst', 'inner')

# Etapa 4: Calcular as médias dos filmes com base nos nomes associados
avg_ratings_per_film_crew = joined_data.groupBy('tconst') \
    .agg(avg('avgRating').alias('avgRatingPerFilm'), avg('avgNumVotes').alias('avgNumVotesPerFilm'))

In [94]:
joined_data.filter(joined_data['tconst'] == 'tt0001184').show()

+------+---------+-----------+------+
|nconst|avgRating|avgNumVotes|tconst|
+------+---------+-----------+------+
+------+---------+-----------+------+



In [95]:
avg_ratings_per_film_crew.filter(avg_ratings_per_film_crew['tconst'] == 'tt0001184').show()

+------+----------------+------------------+
|tconst|avgRatingPerFilm|avgNumVotesPerFilm|
+------+----------------+------------------+
+------+----------------+------------------+



In [96]:
title_basics_filtered_final = title_basics_filtered_final.join(avg_ratings_per_film_crew, 'tconst', 'inner')

title_basics_filtered_final = title_basics_filtered_final.withColumnRenamed("avgRatingPerFilm", "avgRatingPerFilmCrew")
title_basics_filtered_final = title_basics_filtered_final.withColumnRenamed("avgNumVotesPerFilm", "avgNumVotesPerFilmCrew")

title_basics_filtered_final.show(5)

+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+-------------------------+---------------------------+--------------------+----------------------+
|   tconst|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|popularIsOriginal|avgRatingPerFilmActors|avgNumVotesPerFilmActors|avgRatingPerFilmProducers|avgNumVotesPerFilmProducers|avgRatingPerFilmCrew|avgNumVotesPerFilmCrew|
+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+-------------------------+---------------------------+--------------------+----------------------+
|tt0001911|         Nell Gwynne|Sweet Nell of Old...|      0|     1911|            50|Biography,Drama,H...|        

## Analisar titulos localizados

In [97]:
title_akas = spark.read.load('../Data/title.akas.tsv', format='csv', sep='\t', inferSchema=True, header=True)

In [98]:
title_akas.show()

+---------+--------+--------------------+------+--------+-----------+--------------------+---------------+
|  titleId|ordering|               title|region|language|      types|          attributes|isOriginalTitle|
+---------+--------+--------------------+------+--------+-----------+--------------------+---------------+
|tt0000001|       1|          Карменсіта|    UA|      \N|imdbDisplay|                  \N|              0|
|tt0000001|       2|          Carmencita|    DE|      \N|         \N|       literal title|              0|
|tt0000001|       3|Carmencita - span...|    HU|      \N|imdbDisplay|                  \N|              0|
|tt0000001|       4|          Καρμενσίτα|    GR|      \N|imdbDisplay|                  \N|              0|
|tt0000001|       5|          Карменсита|    RU|      \N|imdbDisplay|                  \N|              0|
|tt0000001|       6|          Carmencita|    US|      \N|imdbDisplay|                  \N|              0|
|tt0000001|       7|          Carmenc

In [99]:
# Obter os 'tconst' do DataFrame movies
# tconst_movies contém os IDs de filmes já selecionados

# filtrar somente as entradas de titulos que estamos considerando
title_akas_filtered = title_akas.join(tconst_movies, title_akas['titleId'] == tconst_movies['tconst'], 'inner')

In [100]:
title_akas_filtered.show()

+---------+--------+--------------------+------+--------+-----------+--------------------+---------------+---------+
|  titleId|ordering|               title|region|language|      types|          attributes|isOriginalTitle|   tconst|
+---------+--------+--------------------+------+--------+-----------+--------------------+---------------+---------+
|tt0000941|       1|      Locura de amor|    \N|      \N|   original|                  \N|              1|tt0000941|
|tt0000941|       2|          Love Crazy|   XWW|      en|         \N|informal literal ...|              0|tt0000941|
|tt0000941|       3|      Locura de amor|    ES|      \N|imdbDisplay|                  \N|              0|tt0000941|
|tt0002026|       1|Anny - en gatepig...|    NO|      \N|imdbDisplay|                  \N|              0|tt0002026|
|tt0002026|       2|Anny - Story of a...|   XWW|      en|         \N|                  \N|              0|tt0002026|
|tt0002026|       3|Anny - en gatepig...|    \N|      \N|   orig

In [101]:
title_akas_filtered_translations = title_akas_filtered.groupBy('tconst').count().withColumnRenamed('count', 'numberOfTranslations')
title_akas_filtered_translations.show()

+---------+--------------------+
|   tconst|numberOfTranslations|
+---------+--------------------+
|tt0000147|                   3|
|tt0000574|                   9|
|tt0000630|                   5|
|tt0000675|                   2|
|tt0000886|                   4|
|tt0000941|                   3|
|tt0001028|                   2|
|tt0001049|                   3|
|tt0001112|                   4|
|tt0001115|                   2|
|tt0001184|                   2|
|tt0001277|                   2|
|tt0001338|                   3|
|tt0001370|                   2|
|tt0001440|                   3|
|tt0001498|                   4|
|tt0001530|                   3|
|tt0001790|                   6|
|tt0001812|                   2|
|tt0001911|                   5|
+---------+--------------------+
only showing top 20 rows



## Agregar dataset de features

In [102]:
title_basics_filtered_final.show()

+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+-------------------------+---------------------------+--------------------+----------------------+
|   tconst|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|popularIsOriginal|avgRatingPerFilmActors|avgNumVotesPerFilmActors|avgRatingPerFilmProducers|avgNumVotesPerFilmProducers|avgRatingPerFilmCrew|avgNumVotesPerFilmCrew|
+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+-------------------------+---------------------------+--------------------+----------------------+
|tt0001911|         Nell Gwynne|Sweet Nell of Old...|      0|     1911|            50|Biography,Drama,H...|        

In [103]:
titles_features_complete = title_basics_filtered_final.join(title_akas_filtered_translations, 'tconst', 'inner')
titles_features_complete = titles_features_complete.join(dummies, 'tconst', 'inner')

In [104]:
titles_features_complete.show()

+---------+--------------------+--------------------+-------+---------+--------------+--------------------+-------------+--------+-----------------+----------------------+------------------------+-------------------------+---------------------------+--------------------+----------------------+--------------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+---------+--------+---+-------+
|   tconst|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|popularIsOriginal|avgRatingPerFilmActors|avgNumVotesPerFilmActors|avgRatingPerFilmProducers|avgNumVotesPerFilmProducers|avgRatingPerFilmCrew|avgNumVotesPerFilmCrew|numberOfTranslations|Action|Adult|Adventure|Animation|Biography|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|History|Horror|Music|Musical|Mystery|News|Reality-TV|Romance|Sci-F

In [105]:
titles_features_complete.count()

25600

## Salvar os dados selecionados

In [106]:
# Reduzir o número de partições para 1 antes de salvar
titles_features_complete = titles_features_complete.coalesce(1)

# Salvar como CSV
titles_features_complete.write.mode('overwrite').option('header', 'true').csv('../Data/movies.features.complete.csv')

Py4JJavaError: An error occurred while calling o864.csv.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:269)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:374)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:402)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:374)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:240)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:850)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:242)
	at org.apache.spark.util.SparkFileUtils.createTempDir(SparkFileUtils.scala:103)
	at org.apache.spark.util.SparkFileUtils.createTempDir$(SparkFileUtils.scala:102)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:94)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:372)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:964)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:194)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:217)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1120)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1129)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 25 more
