<a href="https://colab.research.google.com/github/elbyvaz/data_engineering/blob/main/spark/IMDB_dataset_movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install Google Colab pyspark
!pip install pyspark



In [None]:
# import pyspark
from pyspark.sql import SparkSession

# Spark entry point
spark = SparkSession \
    .builder \
    .appName("IMDB dataset movies") \
    .getOrCreate()

spark.version

'3.5.0'

In [None]:
# read movie metadata file
df_titles = spark.read.csv('/content/sample_data/title_basics.tsv', header=True, inferSchema=True, sep='\t')
df_titles.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: integer (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
# total of lines
df_titles.count()

2910502

In [3]:
# read movie ratings file
df_ratings = spark.read.csv('/content/sample_data/title_ratings.tsv', header=True, inferSchema=True, sep='\t')
df_ratings.printSchema()

NameError: name 'spark' is not defined

In [None]:
# total of lines
df_ratings.count()

1182639

In [None]:
# join dataframes without tconst column duplicated
df_result = df_titles.join(df_ratings, df_titles.tconst == df_ratings.tconst, 'left').drop(df_ratings.tconst)
df_result.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: integer (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)



In [None]:
# total of lines
df_result.count()

2910502

In [None]:
# first lines
df_result.show(5)

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+---------------+-------------+--------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|         genres|averageRating|numVotes|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+---------------+-------------+--------+
| tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|Animation,Short|          6.0|     233|
| tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|Animation,Short|          6.1|     152|
|tt10925802|    movie|Pray: The Story o...|Pray: The Story o...|      0|     2020|     \N|            71|    Documentary|          7.6|      28|
|tt10925806|    video|Loch Ness Monster...|Loch Ness Monster...|      0|     2019|     \N|            84|    Documentary|         

In [None]:
# to use as sql
df_result.createOrReplaceTempView("imdb")

In [None]:
# distinct title type
spark.sql("SELECT DISTINCT titleType FROM imdb").show()

+------------+
|   titleType|
+------------+
|    tvSeries|
|tvMiniSeries|
|     tvMovie|
|   tvEpisode|
|       movie|
|   tvSpecial|
|       video|
|   videoGame|
|     tvShort|
|       short|
|           0|
+------------+



In [None]:
# total of movies started in 2015
spark.sql("SELECT COUNT(tconst) FROM imdb WHERE startYear = 2015 AND titleType in ('tvMovie', 'movie')").show()
# df_result.filter(df_result['startYear'] == 2015).count()

+--------+
|count(1)|
+--------+
|    1159|
+--------+



In [None]:
# more frequent genre

In [None]:
# genre with best rating

In [None]:
# video game of adventure with best rating in 2020

In [None]:
# total distinct movie titles

In [None]:
# media duration of adult movies

In [None]:
# total of movies with primaty title different original title

In [None]:
# movie with longest name

In [None]:
# movie with highest number of votes

In [None]:
# lowest average grade of a movie