In [18]:
# excercise solutions for Chapter 2 in Applied DS using PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# read the data & get the required columns
spark = SparkSession.builder.appName('Excercise').getOrCreate()

df = spark.read.format('csv').option('inferSchema', False).option('header', True).option('sep', '|').load('movie_data.csv')
select_columns=['id','budget','popularity','release_date','revenue','title', 'original_language']
df = df.select(*select_columns)
df.show(10,False)

+-----+------+----------+------------+-------+---------------------------------------+-----------------+
|id   |budget|popularity|release_date|revenue|title                                  |original_language|
+-----+------+----------+------------+-------+---------------------------------------+-----------------+
|43000|0     |2.503     |1962-05-23  |0      |The Elusive Corporal                   |fr               |
|43001|0     |5.51      |1962-11-12  |0      |Sundays and Cybele                     |fr               |
|43002|0     |5.62      |1962-05-24  |0      |Lonely Are the Brave                   |en               |
|43003|0     |7.159     |1975-03-12  |0      |F for Fake                             |fr               |
|43004|500000|3.988     |1962-10-09  |0      |Long Day's Journey Into Night          |en               |
|43006|0     |3.194     |1962-03-09  |0      |My Geisha                              |en               |
|43007|0     |2.689     |1962-10-31  |0      |Period of

In [3]:
# Question 1
# Identify the repeated titles between the years 2000 to 2015, and count number of titles
df = df.withColumn('year', year(df['release_date']))
total_titles = df.filter((df['year'] >= 2000) & (df['year'] <= 2015)).count()
distinct_titles = df.filter((df['year'] >= 2000) & (df['year'] <= 2015)).select('title').distinct().count()

ans = total_titles - distinct_titles
ans
''' 
1. Get the total no of titles between 2000-2015
2. Get count of distinct titles between 2000-2015
3. subtract the second value from the first. That's the answer.
'''

" \n1. Get the total no of titles between 2000-2015\n2. Get count of distinct titles between 2000-2015\n3. subtract the second value from the first. That's the answer.\n"

In [4]:
# Question 2
# Identify all titles that contain Harry in the title name.  

# you can use either of the 2 codes below, comment out 1 of them
df.filter(df['title'].contains('Harry')).show(10, False) # code 1

df.filter(df['title'].rlike('\w*Harry')).show(10, False) # code 2
'''
this  outputs only 'titles' that contain the word 'HARRY'
'''

+-----+------+------------------+------------+-------+---------------------------------------------------------------------------+-----------------+----+
|id   |budget|popularity        |release_date|revenue|title                                                                      |original_language|year|
+-----+------+------------------+------------+-------+---------------------------------------------------------------------------+-----------------+----+
|43269|0     |5.169             |1945-08-17  |0      |The Strange Affair of Uncle Harry                                          |en               |1945|
|43714|0     |4.525             |1998-01-27  |0      |The Harryhausen Chronicles                                                 |en               |1998|
|53079|0     |6.785             |2010-10-26  |28200  |Who Is Harry Nilsson (And Why Is Everybody Talkin' About Him?)             |en               |2010|
|91607|0     |3.226             |1971-06-15  |0      |Who Is Harry Kellerman

"\nthis  outputs only 'titles' that contain the word 'HARRY'\n"

In [5]:
# Question 3
# Create a new column as a binary indicator of whether the original language is English
df = df.withColumn('english_indicator', when(df['original_language']=='en', 1).otherwise(0))
df.show(15, False)

'''
We can see that the 'english_indicator' column has value 1 for ENGLISH and 0 for others
'''

+-----+-------+------------------+------------+-------+---------------------------------------+-----------------+----+-----------------+
|id   |budget |popularity        |release_date|revenue|title                                  |original_language|year|english_indicator|
+-----+-------+------------------+------------+-------+---------------------------------------+-----------------+----+-----------------+
|43000|0      |2.503             |1962-05-23  |0      |The Elusive Corporal                   |fr               |1962|0                |
|43001|0      |5.51              |1962-11-12  |0      |Sundays and Cybele                     |fr               |1962|0                |
|43002|0      |5.62              |1962-05-24  |0      |Lonely Are the Brave                   |en               |1962|1                |
|43003|0      |7.159             |1975-03-12  |0      |F for Fake                             |fr               |1975|0                |
|43004|500000 |3.988             |1962-10

"\nWe can see that the 'english_indicator' column has value 1 for ENGLISH and 0 for others\n"

In [6]:
# Question 4
# tabulate the mean of popularity by year

df = df.withColumn('year', year(df['release_date'])) # get the 'year' value from the  'release_date' column
distinct_year = sorted([x[0] for x in df.select('year').distinct().collect() if x[0] != None]) # get the distinct values for each year
mean_years = []

for x in distinct_year:
    mean_value = df.filter(df['year'] == x).agg({'popularity': 'mean'}).collect()[0]['avg(popularity)']
    mean_years.append(mean_value)

zipped_mean_years = list(zip(distinct_year, mean_years))

# this is the mean Popularity for each year
zipped_mean_years[:10]

[(1911, 2.5),
 (1913, 4.3309999999999995),
 (1914, 3.0374999999999996),
 (1915, 5.093),
 (1916, 3.381666666666667),
 (1917, 2.1675000000000004),
 (1918, 2.592111111111111),
 (1919, 3.044153846153846),
 (1920, 3.622315789473684),
 (1921, 4.6906923076923075)]

# All excercise questions are solved above. The rest are additions where I will input the mean value for each year.

In [25]:
# zipped_mean_years = dict(zipped_mean_years)
# df = df.withColumn('mean_years', lit(None).cast(StringType()))

for i, y in enumerate(df.select('mean_years', 'year').collect()):
    #contunnue from here
    print('{}: {}'.format(i, y[1]))
    break


0: 1962


In [17]:
df.select('year').take(3)

[Row(year=1962), Row(year=1962), Row(year=1962)]