In [1]:
import pyspark
import math
import itertools
import os
from decimal import Decimal
from operator import add
from pyspark.sql import Row

sc = pyspark.SparkContext.getOrCreate()
sqlContext = pyspark.sql.SQLContext(sc)

# File Reading into DataFrame

In [2]:
scripts_directory = '/usr/data/movie_scripts/'
files_list = os.listdir(scripts_directory)
files_list_rdd = sc.parallelize(files_list)

In [51]:
def map_script_to_meta(filename):
    if "txt" not in filename:
        return []
    script = open(scripts_directory + filename, 'r', encoding='utf8').read()
    actor, movie = filename[:-4].split("_")
    return [Row(actor=actor, movie_name=movie, script=script)]

scripts_df = files_list_rdd.flatMap(map_script_to_meta).toDF()
scripts_df.toPandas()

Unnamed: 0,actor,movie_name,script
0,Al Pacino,The Humbling,Ten minutes to curtain. Ten minutes. All the w...
1,Adam Sandler,Mixed Nuts,* [ group singing doo-wop ] * [ doo-wop contin...
2,Al Pacino,Donnie Brasco,You're not saying things that mean anything. I...
3,Anthony Hopkins,Instinct,- Are you listening? Are you listening to me? ...
4,Anne Hathaway,Brokeback Mountain,Shit. You pair of deuces lookin' for work... I...
...,...,...,...
221,Anne Hathaway,Bride Wars,- # I found # - # I found # # So many things #...
222,Anne Hathaway,Colossal,(CRICKETS CHIRPING) (GIRL SPEAKING KOREAN) (WO...
223,Arnold Schwarzenegger,Collateral Damage,OCD from engine 35. On scene at 902 Sunnyvale....
224,Angelina Jolie,Hell's Kitchen,"Fuck it! Shut up, all right? Listen, we gotta ..."


# Characters Sanitation and Tokenization

In [50]:
from pyspark.ml.feature import Tokenizer,  RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType


def tokenize(input_col_name: str, tokenized_col_name: str, output_columns: list, with_count: bool,df):    
    tokenizer = Tokenizer(inputCol=input_col_name, outputCol=tokenized_col_name)
    count_tokens = udf(lambda words: len(words), IntegerType())
    tokenized = tokenizer.transform(df)

    regexTokenizer = RegexTokenizer(inputCol=input_col_name, outputCol=tokenized_col_name, pattern="\\W")
    regexTokenized = regexTokenizer.transform(df)

    tokenized = regexTokenized.select(*output_columns)
    if with_count:
        tokenized = tokenized.withColumn("tokens_count", count_tokens(col("tokenized_script")))
    return tokenized

tokenized = tokenize("script", "tokenized_script", ["actor", "movie_name", "script", "tokenized_script"], True, scripts_df)
tokenized.toPandas()

Unnamed: 0,actor,movie_name,script,tokenized_script,tokens_count
0,Al Pacino,The Humbling,Ten minutes to curtain. Ten minutes. All the w...,"[ten, minutes, to, curtain, ten, minutes, all,...",11778
1,Adam Sandler,Mixed Nuts,* [ group singing doo-wop ] * [ doo-wop contin...,"[group, singing, doo, wop, doo, wop, continues...",12118
2,Al Pacino,Donnie Brasco,You're not saying things that mean anything. I...,"[you, re, not, saying, things, that, mean, any...",14465
3,Anthony Hopkins,Instinct,- Are you listening? Are you listening to me? ...,"[are, you, listening, are, you, listening, to,...",4083
4,Anne Hathaway,Brokeback Mountain,Shit. You pair of deuces lookin' for work... I...,"[shit, you, pair, of, deuces, lookin, for, wor...",8189
...,...,...,...,...,...
221,Anne Hathaway,Bride Wars,- # I found # - # I found # # So many things #...,"[i, found, i, found, so, many, things, i, drea...",11401
222,Anne Hathaway,Colossal,(CRICKETS CHIRPING) (GIRL SPEAKING KOREAN) (WO...,"[crickets, chirping, girl, speaking, korean, w...",9671
223,Arnold Schwarzenegger,Collateral Damage,OCD from engine 35. On scene at 902 Sunnyvale....,"[ocd, from, engine, 35, on, scene, at, 902, su...",5943
224,Angelina Jolie,Hell's Kitchen,"Fuck it! Shut up, all right? Listen, we gotta ...","[fuck, it, shut, up, all, right, listen, we, g...",6201


# Stop-Word Sanitation

In [92]:
from pyspark.ml.feature import StopWordsRemover

def remove_stop_words(input_col_name: str, sanitized_col_name: str, output_columns: list, with_count: bool,df):
    stop_word_remover = StopWordsRemover(inputCol=input_col_name, outputCol=sanitized_col_name)
    sanitized = stop_word_remover.transform(df)
    sanitized = sanitized.select(*output_columns)
    if with_count:
        sanitized = sanitized.withColumn("sanitized_count", count_tokens(col("sanitized_script")))
    return sanitized

sanitized = remove_stop_words("tokenized_script", "sanitized_script", ["actor", "movie_name", "script", "tokenized_script", "tokens_count", "sanitized_script"], True, tokenized)
sanitized.toPandas()

Unnamed: 0,actor,movie_name,script,tokenized_script,tokens_count,sanitized_script,sanitized_count
0,Al Pacino,The Humbling,Ten minutes to curtain. Ten minutes. All the w...,"[ten, minutes, to, curtain, ten, minutes, all,...",11778,"[ten, minutes, curtain, ten, minutes, world, w...",4956
1,Adam Sandler,Mixed Nuts,* [ group singing doo-wop ] * [ doo-wop contin...,"[group, singing, doo, wop, doo, wop, continues...",12118,"[group, singing, doo, wop, doo, wop, continues...",6133
2,Al Pacino,Donnie Brasco,You're not saying things that mean anything. I...,"[you, re, not, saying, things, that, mean, any...",14465,"[re, saying, things, mean, anything, even, deb...",6557
3,Anthony Hopkins,Instinct,- Are you listening? Are you listening to me? ...,"[are, you, listening, are, you, listening, to,...",4083,"[listening, listening, yes, one, gorilla, see,...",1786
4,Anne Hathaway,Brokeback Mountain,Shit. You pair of deuces lookin' for work... I...,"[shit, you, pair, of, deuces, lookin, for, wor...",8189,"[shit, pair, deuces, lookin, work, suggest, ge...",3969
...,...,...,...,...,...,...,...
221,Anne Hathaway,Bride Wars,- # I found # - # I found # # So many things #...,"[i, found, i, found, so, many, things, i, drea...",11401,"[found, found, many, things, dreamed, dreamed,...",5425
222,Anne Hathaway,Colossal,(CRICKETS CHIRPING) (GIRL SPEAKING KOREAN) (WO...,"[crickets, chirping, girl, speaking, korean, w...",9671,"[crickets, chirping, girl, speaking, korean, w...",4832
223,Arnold Schwarzenegger,Collateral Damage,OCD from engine 35. On scene at 902 Sunnyvale....,"[ocd, from, engine, 35, on, scene, at, 902, su...",5943,"[ocd, engine, 35, scene, 902, sunnyvale, six, ...",3033
224,Angelina Jolie,Hell's Kitchen,"Fuck it! Shut up, all right? Listen, we gotta ...","[fuck, it, shut, up, all, right, listen, we, g...",6201,"[fuck, shut, right, listen, gotta, get, roll, ...",2775


# Selecting Only What Really Matters
And adding an ID column

In [144]:
from pyspark.sql.functions import monotonically_increasing_id, lit, row_number
from pyspark.sql.window import *

# It could be just that way but because monotonically_increasing_id() produces unique values that might be huge,
# its much simpler on the eye to use "row id" as following
# sanitized = sanitized.select(monotonically_increasing_id().alias('id'), "actor", "movie_name", "sanitized_script")


sanitized = sanitized.withColumn("temp_lit",lit("ABC"))
w = Window().partitionBy('temp_lit').orderBy(lit('A'))
sanitized = sanitized.withColumn("id", row_number().over(w)).drop("temp_lit")
sanitized = sanitized.select("id" ,"actor", "movie_name", "sanitized_script")
sanitized.toPandas()


Unnamed: 0,id,actor,movie_name,sanitized_script
0,1,Al Pacino,The Humbling,"[ten, minutes, curtain, ten, minutes, world, w..."
1,2,Adam Sandler,Mixed Nuts,"[group, singing, doo, wop, doo, wop, continues..."
2,3,Al Pacino,Donnie Brasco,"[re, saying, things, mean, anything, even, deb..."
3,4,Anthony Hopkins,Instinct,"[listening, listening, yes, one, gorilla, see,..."
4,5,Anne Hathaway,Brokeback Mountain,"[shit, pair, deuces, lookin, work, suggest, ge..."
...,...,...,...,...
221,222,Anne Hathaway,Bride Wars,"[found, found, many, things, dreamed, dreamed,..."
222,223,Anne Hathaway,Colossal,"[crickets, chirping, girl, speaking, korean, w..."
223,224,Arnold Schwarzenegger,Collateral Damage,"[ocd, engine, 35, scene, 902, sunnyvale, six, ..."
224,225,Angelina Jolie,Hell's Kitchen,"[fuck, shut, right, listen, gotta, get, roll, ..."


# Creating An Inverted Index

In [7]:
def create_index (row):
    index = {}
    for token in row[3]:
        if row[0] not in index.get(token, []):
            if index.get(token):
                index[token].append(row[0])
            else:
                index[token] = [row[0]]
    return index

indexes_per_doc = sanitized.rdd.map(create_index)
print(f"outputs a list of dicts, of the following form: \n "
          "[{'tell': [0], 'hawaii': [0], 'unbelievable': [0], 'oh': [0], 'yeah': [0], 'well': [0]},"
              "'happened': [1], 'met': [1], 'guy': [1], 'best': [1], 'week': [1], 'life': [1]...")

outputs a list of dicts, of the following form: 
 [{'tell': [0], 'hawaii': [0], 'unbelievable': [0], 'oh': [0], 'yeah': [0], 'well': [0]},'happened': [1], 'met': [1], 'guy': [1], 'best': [1], 'week': [1], 'life': [1]...


In [8]:
flattened = indexes_per_doc.flatMap(lambda doc: (doc.items()))
flattened.take(10)

[('ten', [1]),
 ('minutes', [1]),
 ('curtain', [1]),
 ('world', [1]),
 ('stage', [1]),
 ('men', [1]),
 ('women', [1]),
 ('merely', [1]),
 ('players', [1]),
 ('exits', [1])]

In [53]:
inverted_index = flattened.reduceByKey(lambda a, b: a+b)
inverted_index.toDF(["token", "docs"]).toPandas()

Unnamed: 0,token,docs
0,everything,"[1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,sometimes,"[1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, ..."
2,10,"[1, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, ..."
3,step,"[1, 3, 4, 7, 8, 10, 11, 16, 17, 18, 19, 21, 23..."
4,destroyed,"[1, 2, 18, 22, 26, 27, 29, 31, 33, 37, 38, 39,..."
...,...,...
37722,sunbathe,[216]
37723,repentant,"[217, 220]"
37724,uric,[220]
37725,vegetarianism,[220]


# TF
### Term Frequency

In [54]:
all_tokens = inverted_index.toDF(["token", "docs"]).select("token")
all_tokens.toPandas()

Unnamed: 0,token
0,everything
1,sometimes
2,10
3,step
4,destroyed
...,...
37722,sunbathe
37723,repentant
37724,uric
37725,vegetarianism


In [11]:
def gather_tf(data):
    tf = {}
    for term in data.sanitized_script:
        tf[term] = data.sanitized_script.count(term)
    return (data.id, tf)

tf = sanitized.rdd.map(gather_tf)
tf.take(1)

[(1,
  {'ten': 4,
   'minutes': 8,
   'curtain': 1,
   'world': 5,
   'stage': 15,
   'men': 7,
   'women': 8,
   'merely': 2,
   'players': 4,
   'exits': 3,
   'entrances': 3,
   'one': 22,
   'man': 20,
   'time': 26,
   'plays': 3,
   'many': 4,
   'parts': 2,
   'acts': 2,
   'seven': 5,
   'ages': 4,
   'believe': 15,
   'real': 13,
   'honest': 5,
   'yes': 41,
   'wasn': 9,
   'well': 63,
   'good': 21,
   'try': 10,
   'better': 2,
   'affecting': 1,
   'really': 24,
   'saying': 17,
   'life': 21,
   'mean': 54,
   'anything': 10,
   'anyone': 2,
   'ever': 12,
   'existed': 2,
   'fooling': 1,
   'committing': 1,
   'instincts': 1,
   'supposed': 6,
   'jump': 2,
   'know': 162,
   'jig': 1,
   'say': 25,
   'words': 4,
   'shakespeare': 4,
   'rest': 4,
   'five': 5,
   'face': 6,
   'masks': 1,
   'tragedy': 2,
   'comedy': 1,
   'let': 16,
   'put': 6,
   'together': 3,
   'tonight': 4,
   'buddy': 1,
   'love': 15,
   'first': 7,
   'whining': 2,
   'school': 1,
   'boy'

In [12]:
all_tokens_list = all_tokens.rdd.flatMap(lambda x: x).collect()
all_tokens_list

['everything',
 'sometimes',
 '10',
 'step',
 'destroyed',
 'led',
 'depressed',
 'solve',
 'healthy',
 'whimpering',
 'horrible',
 'orange',
 'commander',
 'brush',
 'kerry',
 'marbles',
 'felony',
 'mar',
 'restaurant',
 'zeros',
 'interested',
 'detainees',
 'quality',
 'huffington',
 'matters',
 'soldier',
 'kaput',
 'eventually',
 'metal',
 'ears',
 'urges',
 'hillsdale',
 'itsy',
 'purse',
 'herbal',
 'unnecessary',
 'safely',
 'tummy',
 'immediate',
 'tends',
 'wanting',
 'dedicates',
 'heinous',
 'pressures',
 'theaters',
 'bereft',
 'clancy',
 'melon',
 'stivie',
 'twitchy',
 'wealth',
 'gaoler',
 'unwillingly',
 'mending',
 'jill',
 'damsel',
 'cadets',
 'marcla',
 'perry',
 'overcooked',
 'orbital',
 'decommissioned',
 'navigatin',
 'policy',
 'biting',
 'wickedness',
 'joyfully',
 'styx',
 'males',
 'plates',
 'thorzetta',
 'standards',
 'backed',
 'upgraded',
 'musk',
 'bamboo',
 'olympiad',
 'catholic',
 'bisexual',
 'lyin',
 'icebox',
 'diver',
 'soak',
 'mrstasker',
 'r

In [13]:
all_tokens_bc = sc.broadcast(all_tokens_list)

In [14]:
def map_to_huge_tuple(row):
    word_count_list = [('doc_id', [row[0]])]
    for token in all_tokens_bc.value:
        word_count_list.append((token, [row[1].get(token,0)]))
    return word_count_list


In [15]:
full_vocabulary_tf_per_doc = tf.map(map_to_huge_tuple)
full_vocabulary_tf_per_doc.take(2)

[[('doc_id', [1]),
  ('everything', [10]),
  ('sometimes', [3]),
  ('10', [2]),
  ('step', [1]),
  ('destroyed', [1]),
  ('led', [1]),
  ('depressed', [0]),
  ('solve', [0]),
  ('healthy', [0]),
  ('whimpering', [0]),
  ('horrible', [0]),
  ('orange', [0]),
  ('commander', [0]),
  ('brush', [0]),
  ('kerry', [0]),
  ('marbles', [0]),
  ('felony', [0]),
  ('mar', [0]),
  ('restaurant', [0]),
  ('zeros', [0]),
  ('interested', [0]),
  ('detainees', [0]),
  ('quality', [0]),
  ('huffington', [0]),
  ('matters', [0]),
  ('soldier', [0]),
  ('kaput', [0]),
  ('eventually', [0]),
  ('metal', [0]),
  ('ears', [0]),
  ('urges', [0]),
  ('hillsdale', [0]),
  ('itsy', [0]),
  ('purse', [0]),
  ('herbal', [0]),
  ('unnecessary', [0]),
  ('safely', [0]),
  ('tummy', [0]),
  ('immediate', [0]),
  ('tends', [0]),
  ('wanting', [0]),
  ('dedicates', [0]),
  ('heinous', [0]),
  ('pressures', [0]),
  ('theaters', [0]),
  ('bereft', [0]),
  ('clancy', [0]),
  ('melon', [0]),
  ('stivie', [0]),
  ('twitc

In [16]:
full_vocabulary_tf_per_doc = full_vocabulary_tf_per_doc.flatMap(lambda x: x)

In [17]:
doc_ids = full_vocabulary_tf_per_doc.filter(lambda x: x[0] == 'doc_id').reduceByKey(lambda a, b: a + b).collect()
doc_ids_list = doc_ids[0][1]
doc_ids_list = [str(i) for i in doc_ids_list]

In [18]:
tf_rdd = full_vocabulary_tf_per_doc.reduceByKey(lambda a, b: a + b).map(lambda x: (x[0], *x[1]))

In [19]:
tf_table = tf_rdd.toDF(['token'] + doc_ids_list)

In [20]:
tf_table.toPandas()

Unnamed: 0,token,1,2,3,4,5,6,7,8,9,...,217,218,219,220,221,222,223,224,225,226
0,everything,10,6,11,0,1,5,9,7,4,...,5,8,4,7,4,8,10,3,5,9
1,sometimes,3,1,4,1,2,2,0,3,0,...,2,1,3,1,1,5,0,0,2,1
2,10,2,0,1,0,1,1,1,3,1,...,12,4,1,2,3,4,1,2,1,1
3,step,1,0,2,1,0,0,1,1,0,...,1,2,0,3,2,1,0,0,1,2
4,destroyed,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37723,sunbathe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37724,repentant,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
37725,uric,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
37726,vegetarianism,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


# IDF

In [55]:
def counter(row):
    c = 0
    for i in row[1]:
        if i != 0:
            c += 1
    return row[0], c


idf_table = inverted_index.map(counter).map(lambda x: (x[0], x[1], math.log(226/(1+x[1])))).toDF(["token", "docs_count", "idf"])
idf_table.toPandas()

Unnamed: 0,token,docs_count,idf
0,everything,219,0.026907
1,sometimes,165,0.308547
2,10,152,0.390097
3,step,126,0.576348
4,destroyed,40,1.706963
...,...,...,...
37722,sunbathe,1,4.727388
37723,repentant,2,4.321923
37724,uric,1,4.727388
37725,vegetarianism,1,4.727388


# TF-IDF

In [22]:
tfidf_table = tf_table.join(idf_table, ["token"])


def calculate_tfidf(row):
    result = [row[0]]
    for i in range(1, len(row)-2):
        result.append(row[i]*row[-1])
    result.extend(row[-2:])
    return result

tfidf_table = tfidf_table.rdd.map(calculate_tfidf).toDF(['token'] + doc_ids_list + ["docs_count", "idf"])
tfidf_table.toPandas()


Unnamed: 0,token,1,2,3,4,5,6,7,8,9,...,219,220,221,222,223,224,225,226,docs_count,idf
0,1970s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.727388
1,296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.727388
2,57th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.727388
3,675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.727388
4,829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.727388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37722,wack,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,3.811097
37723,wane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.727388
37724,weed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,2.647946
37725,widening,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.727388


In [23]:
from math import sqrt

def cosine_similarity(v1, v2):
    dot_product = 0
    v1i_sqr = 0
    v2i_sqr = 0
    
    for i in range(len(v1)):
        dot_product += v1[i] * v2[i]
    
    for i in range(len(v1)):
        v1i_sqr += v1[i] ** 2
        v2i_sqr += v2[i] ** 2
    
    v1i_sqr = sqrt(v1i_sqr)
    v2i_sqr = sqrt(v2i_sqr)
    
    return dot_product / (v1i_sqr * v2i_sqr)
    

In [24]:
import numpy as np
from scipy import spatial
from pyspark.mllib.linalg import Vectors

#@ TODO -  run with full DF
column_names = tfidf_table.schema.names[1:10]
vectors_list = []
for column_name in column_names:
    vectors_list.append(Vectors.dense(tfidf_table.select(column_name).rdd.map(lambda x: x[0]).collect()))

In [25]:
def get_max_cos_similarity_for_every_doc():
    max_similarity_per_doc = []
    for i,v1 in enumerate(vectors_list):
        top_5_list = []
        for j,v2 in enumerate(vectors_list):
            if i==j: continue
            cos = cosine_similarity(v1, v2)
            if len(top_5_list) < 5 or any([cos > tup[0] for tup in top_5_list]):
                top_5_list.append((cos, i, j))
            if len(top_5_list) > 5: top_5_list.remove(min(top_5_list))
        max_similarity_per_doc.append(top_5_list)
    return max_similarity_per_doc

max_similarity_per_doc = get_max_cos_similarity_for_every_doc()

In [26]:
all_highest_similarities = []
for top5_of_doc in max_similarity_per_doc:
        all_highest_similarities += top5_of_doc
all_highest_similarities

[(0.012909531210438583, 0, 1),
 (0.013232893527129698, 0, 2),
 (0.011892218753428222, 0, 3),
 (0.012811577917237289, 0, 5),
 (0.024395276171523355, 0, 7),
 (0.012909531210438583, 1, 0),
 (0.016409018482637344, 1, 2),
 (0.01331722952838461, 1, 5),
 (0.03158123803499432, 1, 6),
 (0.01098323509416561, 1, 7),
 (0.013232893527129698, 2, 0),
 (0.016409018482637344, 2, 1),
 (0.019073137521866696, 2, 4),
 (0.01708524057919423, 2, 5),
 (0.0436118398378908, 2, 7),
 (0.011892218753428222, 3, 0),
 (0.007028978048106481, 3, 1),
 (0.007670549907149207, 3, 4),
 (0.006700769426539351, 3, 5),
 (0.012339258213451908, 3, 7),
 (0.010658643751709539, 4, 0),
 (0.019073137521866696, 4, 2),
 (0.007670549907149207, 4, 3),
 (0.009769455401242185, 4, 5),
 (0.02078286885185777, 4, 7),
 (0.012811577917237289, 5, 0),
 (0.01331722952838461, 5, 1),
 (0.01708524057919423, 5, 2),
 (0.010503372659554424, 5, 6),
 (0.01750284788070137, 5, 7),
 (0.007133098123776154, 6, 0),
 (0.03158123803499432, 6, 1),
 (0.010503372659554

In [27]:
def find_top_5():
    sorted_similarities = sorted(all_highest_similarities, key=lambda x: x[0], reverse=True)
    return sorted_similarities[:5]
print(find_top_5())

[(0.0436118398378908, 2, 7), (0.0436118398378908, 7, 2), (0.03158123803499432, 1, 6), (0.03158123803499432, 6, 1), (0.024395276171523355, 0, 7)]


# Since this method generates cosine similarity for every column vector N times, it takes too long.....

In [28]:
# import pyspark.sql.functions as func

# def cosine_similarity_alternative(df, col1, col2):
#     df_cosine = df.select(func.sum(df[col1] * df[col2]).alias('dot'), 
#                           func.sqrt(func.sum(df[col1]**2)).alias('norm1'), 
#                           func.sqrt(func.sum(df[col2] **2)).alias('norm2'))
#     d = df_cosine.rdd.collect()[0].asDict()
#     return d['dot']/(d['norm1'] * d['norm2'])

# results = []
# for i in range(1, 15):
#     for j in range(1, 15):
#         if i == j: continue
#         results.append(cosine_similarity_alternative(tfidf_table, str(i), str(j)))
# results
        
        

# Section 4

In [139]:
def search(search_term: str, k_results = 10):
    results = []
    search_vector = prepare_for_search(search_term)
    for i, v2 in enumerate(vectors_list):
        results.append((i, cosine_similarity(search_vector, v2)))
    results = sorted(results, key=lambda x: x[1], reverse=True)
    return results[:k_results]
    
def prepare_for_search(search_term: str):  
    cleaned = clean(search_term)
    return vectorize(cleaned)
    
def clean(search_term):    
    term = search_term.split(" ")
    term_df = sc.parallelize(term).map(lambda x: [x]).toDF(["text"])
    tokenized_search = tokenize("text", "tokens", ["tokens"], False, term_df)
    cleared_term = remove_stop_words("tokens", "clear_text", ["clear_text"], False, tokenized_search)
    cleared_term = cleared_term.rdd.map(lambda x: x[0]).filter(lambda x: len(x) != 0).flatMap(lambda list: list)
    return cleared_term.collect()

def vectorize(cleaned_term: list):
    result = tfidf_table.withColumn("score", F.when((tfidf_table.token.isin(cleaned_term)), F.col("idf")).otherwise(0))
    result = result.select("score")
    return Vectors.dense(result.rdd.map(lambda x: x[0]).collect())
    
    
    
s = "So tell me. How was Hawaii? - It was unbelievable. - Oh, yeah? - Well, what happened? - I met this guy. It was the best week of my life. It was just a little vacation romance. But he was so sweet. He took me to all these cool local places. We went scuba diving... - Snorkeling. - Mountain climbing. We went cliff diving. Well, we got a little drunk. - He gave me... - A back rub. We slow danced... ...in the rain. But it wasn't just about the sex. He pounded me like a mallard duck. It ended kind of weird, though. When I asked for his number, he said he's... - Married. - Gay. - Entering the priesthood. - He doesn't believe in phones. He just kind of ran away. You know, it was just a little fling, but... I won't forget my week... ...with Henry Roth. - Henry Roth. - Harry. Harry Paratesticles. - Henry Roth. - Henry Roth. - Henry Roth. - Henry Roth. Henry Roth, why didn't you tell me you were a secret agent? I prefer intelligence operative, and I couldn't tell you until I knew you. Well, can I call you when I land? You can call me, but I'll be in Peru. I said that a little loud. Come on, that's a 187, code blue. We got the wolf sleeping at night. He's slipping his arm in the drawer"    
search(s)

#@TODO - Run 10 examples

[(4, 0.007108698291731857),
 (1, 0.005959977492674367),
 (7, 0.0047730930825604315),
 (8, 0.004462832709302788),
 (2, 0.0041997589703851234),
 (5, 0.0032734892173835943),
 (0, 0.0028497647484616822),
 (3, 0.0012153752589736206),
 (6, 0.0007481628352198741)]

# Section 5 - Actors as categories

In [154]:
import pyspark.sql.functions as func

sanitized.select("id", "actor").orderBy(["actor"]).toPandas()

Unnamed: 0,id,actor
0,2,Adam Sandler
1,12,Adam Sandler
2,24,Adam Sandler
3,26,Adam Sandler
4,30,Adam Sandler
...,...,...
221,201,Arnold Schwarzenegger
222,207,Arnold Schwarzenegger
223,208,Arnold Schwarzenegger
224,221,Arnold Schwarzenegger


In [161]:
sanitized.select(func.countDistinct("actor")).show()

+---------------------+
|count(DISTINCT actor)|
+---------------------+
|                    7|
+---------------------+

