# Task two

The client requires a function to detect similarity between films. The function takes a
film’s `film_id` and a `threshold percentage` as input, and returns a `dataframe` containing all
films whose similarity percentage is above the threshold. How similarity is calculated is up
to you, but the output should be sensible.
(For example, any Star Wars film should be similar to all other Star Wars films, and films by the
same director tend to have a similar style.)


In [72]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [73]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Normalizer
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

In [74]:
# Create (or reuse) a Spark session running in local mode (`local[*]`),
# with the driver memory setting raised to 15g.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .appName('imdb-munging')
    .getOrCreate()
)

# Handle to the underlying SparkContext (used later to set the log level).
sc = spark.sparkContext

In [75]:
# Load the IMDb films data prepared in the previous task.
# `input_path` is a %-format template: the dataset name is substituted for %s.
input_path = "../output/%s"

file = input_path % "films"
df_film = spark.read.parquet(file)


In [76]:
# Print the column layout of the films data, then compute its row count.
df_film.printSchema()
df_film.count()

root
 |-- film_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: date (nullable = true)
 |-- duration: integer (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rating: decimal(4,2) (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- persons: array (nullable = true)
 |    |-- element: string (containsNull = true)



39427

In [77]:
# A few sample queries (uncomment one at a time to explore the films data)

#df_film.sample(withReplacement=False, fraction=0.10, seed=2).show(truncate=False)

#df_film.filter( df_film.year >= '2022-01-01').filter( ~(f.array_contains( df_film['genres'], 'Documentary')) ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Spielberg') ) >= 1 ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)George Lucas') ) >= 1 ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Ridley Scott') ) >= 1 ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)James Cameron') ) >= 1 ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Keanu') ) >= 1 ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Sigourney Weaver') ) >= 1 ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Jennifer Lawrence') ) >= 1 ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Dave Bautista') ) >= 1 ).show(100, truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Harrison Ford') ) >= 1 ).show(100, truncate=False)
# Active query: case-insensitive title match, printing up to 100 rows untruncated.
df_film.filter(  df_film['title'].rlike(r'(?i)hunger games') ).show(100, truncate=False)
#df_film.filter(  df_film['title'].rlike(r'(?i)terminator') ).show(100, truncate=False)
#df_film.filter(  df_film['title'].rlike(r'(?i)star wars') ).show(100, truncate=False)
#df_film.groupBy('genres').count().sort(f.desc('count')).show(100, truncate=False)
#df_film.count() # 39_427

+--------+--------------------------------------------------+----------+--------+---------------------------+------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|film_id |title                                             |year      |duration|genres                     |rating|vote_count|persons                                                                                                                                                                                                                                                                                                                                                              |
+--------+------------------

## stage 2

In [78]:
# Load the cosine similarity (matrix dot product) data

# Reduce Spark log output to warnings and above.
sc.setLogLevel("WARN")

#input_path = "../output/%s"
file = input_path % "csfilm"

#df_cos_sim = spark.read.parquet(input_path).withColumnsRenamed({'i': 'film_id', 'j': 'other_id', 'dot': 'similarity'})
df_cos_sim = spark.read.parquet(file)

# Row count of the similarity data (one row per film_id/other_id pair).
df_cos_sim.count() # 777_224_451

777224451

In [79]:
#df_cos_sim = df_cos_sim.withColumnsRenamed({'i': 'film_id', 'j': 'other_id', 'dot': 'similarity'})
# Check the column names and types of the similarity data.
df_cos_sim.printSchema()

root
 |-- film_id: integer (nullable = true)
 |-- other_id: integer (nullable = true)
 |-- similarity: double (nullable = true)



In [80]:
# Look up a film's id here (by person or by title), then pass that id to
# get_similar_films() below.

#df_film.show(30, truncate=False)
#df_film.sample(withReplacement=False, fraction=0.10, seed=47).show(truncate=False)
#df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Dave\sBautista') ) >= 1 ).show(20, False)
# Active query: films whose joined `persons` list matches the name (case-insensitive).
df_film.filter(  f.regexp_count( f.array_join('persons', ','), f.lit(r'(?i)Jennifer Lawrence') ) >= 1 ).show(20, False)
#df_film.filter(  df_film['title'].rlike(r'(?i)Conan.*') ).sort(f.desc('rating')).show(20, False)
#df_film.filter(  df_film['title'].rlike(r'(?i)Hunger.*') ).sort(f.desc('rating')).show(20, False)
#df_film.filter(f.col('film_id') == "133093").show(truncate=False)

+--------+-------------------------------------+----------+--------+---------------------------+------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|film_id |title                                |year      |duration|genres                     |rating|vote_count|persons                                                                                                                                                                                                                                                                                                                                                         

## Function to detect similarities between films

In [81]:
# Function to detect similarities between films
from pyspark.sql import DataFrame


In [82]:
def get_similar_films(film_id: int, threshold: float = 0.1) -> DataFrame:
    """Return the films most similar to the given film.

    Selects the rows of the module-level ``df_cos_sim`` whose ``film_id``
    equals ``film_id`` and whose ``similarity`` is at least ``threshold``,
    left-joins the film details from the module-level ``df_film`` (matching
    ``other_id`` to the films' ``film_id``), orders the rows by descending
    similarity and keeps the top 10.

    The result is also printed with ``show(truncate=False)`` so that calling
    the function in a notebook cell still displays the table.

    Args:
        film_id: Id of the film to find similar films for.
        threshold: Minimum similarity score (inclusive) a film must have
            to be included.

    Returns:
        The DataFrame of matching rows (at most 10), carrying the columns of
        both ``df_cos_sim`` and ``df_film``.
    """
    #df_film.filter(f.col('film_id') == film_id).show(truncate=False)
    df_rec = (
        df_cos_sim.alias('reco')
        .filter((f.col('reco.film_id') == film_id) & (f.col('reco.similarity') >= threshold))
        .join(df_film.alias('films'), f.col('reco.other_id') == f.col('films.film_id'), how='left')
        .sort(f.desc('reco.similarity'))
        .limit(10)
    )

    # DataFrame.show() prints the rows and returns None, so it must not be
    # part of the assigned chain; otherwise the function would return None.
    df_rec.show(truncate=False)

    return df_rec

In [83]:
# Example calls: film_id, optionally followed by a similarity threshold.
# The trailing comment on each line names the film being queried.
#get_similar_films(88763) # "BTTF"
#get_similar_films(990372, 0.45) # "Detective Conan"
#get_similar_films(462699) # "Conan the Future Boy"
get_similar_films(1392170) # "The Hunger Games"
#get_similar_films(99785) # Home Alone
#get_similar_films(11286314) # "Don't Look Up"
#get_similar_films(15410318, 0.1) # "Amy's bucket list"
#get_similar_films(1517268, 0.05) # "Barbie"
#get_similar_films(120915, 0.1) # Star Wars I
#get_similar_films(133093, 0.2) # The Matrix
#get_similar_films(6791350) # Guardians of the Galaxy 3
#get_similar_films(film_id=11286314, threshold=0.1)

#get_similar_films(120915, 0.97) # Star Wars I

+-------+--------+-------------------+--------+--------------------------------------------------+----------+--------+---------------------------+------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|film_id|other_id|similarity         |film_id |title                                             |year      |duration|genres                     |rating|vote_count|persons                                                                                                                                                                                                                                                                                                           

In [84]:
#sc.stop()