In [1]:
import pyspark
import math
import itertools
import os
from decimal import Decimal
from operator import add
from pyspark.sql import Row

sc = pyspark.SparkContext.getOrCreate()
sqlContext = pyspark.sql.SQLContext(sc)

# File Reading into DataFrame

In [2]:
scripts_directory = '/usr/data/movie_scripts/'
files_list = os.listdir(scripts_directory)
files_list_rdd = sc.parallelize(files_list)

In [3]:

def map_script_to_meta(filename):
    if "txt" not in filename:
        return []
    script = open(scripts_directory + filename, 'r', encoding='utf8').read()
    actor, movie = filename[:-4].split("_")
    return [Row(actor=actor, movie_name=movie, script=script)]

scripts_df = files_list_rdd.flatMap(map_script_to_meta).toDF()
scripts_df.show()

+---------------+--------------------+--------------------+
|          actor|          movie_name|              script|
+---------------+--------------------+--------------------+
|      Al Pacino|        The Humbling|Ten minutes to cu...|
|   Adam Sandler|          Mixed Nuts|* [ group singing...|
|      Al Pacino|       Donnie Brasco|You're not saying...|
|Anthony Hopkins|            Instinct|- Are you listeni...|
|  Anne Hathaway|  Brokeback Mountain|Shit. You pair of...|
|      Al Pacino|       People I Know|[Receiver rattles...|
|  Anne Hathaway|Alice Through the...|(THUNDER RUMBLING...|
|      Al Pacino|       Danny Collins|Wow, you're Danny...|
| Angelina Jolie|             Beowulf|Hrothgar! Hrothga...|
|      Amy Adams|     American Hustle|Play one of the b...|
| Angelina Jolie|       Kung Fu Panda|Legend tells of a...|
|   Adam Sandler|Hotel Transylvania 2|Welcome! Welcome!...|
| Angelina Jolie|         Pushing Tin|Continental 901, ...|
|      Amy Adams|             Pumpkin|Th

# Characters Sanitation and Tokenization

In [4]:
from pyspark.ml.feature import Tokenizer,  RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

tokenizer = Tokenizer(inputCol="script", outputCol="tokenized_script")
count_tokens = udf(lambda words: len(words), IntegerType())
tokenized = tokenizer.transform(scripts_df)

regexTokenizer = RegexTokenizer(inputCol="script", outputCol="tokenized_script", pattern="\\W")
regexTokenized = regexTokenizer.transform(scripts_df)

tokenized = regexTokenized.select("actor", "movie_name", "script", "tokenized_script").withColumn("tokens_count", count_tokens(col("tokenized_script")))
tokenized.show()

+---------------+--------------------+--------------------+--------------------+------------+
|          actor|          movie_name|              script|    tokenized_script|tokens_count|
+---------------+--------------------+--------------------+--------------------+------------+
|      Al Pacino|        The Humbling|Ten minutes to cu...|[ten, minutes, to...|       11778|
|   Adam Sandler|          Mixed Nuts|* [ group singing...|[group, singing, ...|       12118|
|      Al Pacino|       Donnie Brasco|You're not saying...|[you, re, not, sa...|       14465|
|Anthony Hopkins|            Instinct|- Are you listeni...|[are, you, listen...|        4083|
|  Anne Hathaway|  Brokeback Mountain|Shit. You pair of...|[shit, you, pair,...|        8189|
|      Al Pacino|       People I Know|[Receiver rattles...|[receiver, rattle...|       11378|
|  Anne Hathaway|Alice Through the...|(THUNDER RUMBLING...|[thunder, rumblin...|        7986|
|      Al Pacino|       Danny Collins|Wow, you're Danny...|[

# Stop-Word Sanitation

In [5]:
from pyspark.ml.feature import StopWordsRemover

stop_word_remover = StopWordsRemover(inputCol="tokenized_script", outputCol="sanitized_script")
sanitized = stop_word_remover.transform(tokenized)
sanitized = sanitized.select("actor", "movie_name", "script", "tokenized_script", "tokens_count", "sanitized_script").withColumn("sanitized_count", count_tokens(col("sanitized_script")))
sanitized.show()

+---------------+--------------------+--------------------+--------------------+------------+--------------------+---------------+
|          actor|          movie_name|              script|    tokenized_script|tokens_count|    sanitized_script|sanitized_count|
+---------------+--------------------+--------------------+--------------------+------------+--------------------+---------------+
|      Al Pacino|        The Humbling|Ten minutes to cu...|[ten, minutes, to...|       11778|[ten, minutes, cu...|           4956|
|   Adam Sandler|          Mixed Nuts|* [ group singing...|[group, singing, ...|       12118|[group, singing, ...|           6133|
|      Al Pacino|       Donnie Brasco|You're not saying...|[you, re, not, sa...|       14465|[re, saying, thin...|           6557|
|Anthony Hopkins|            Instinct|- Are you listeni...|[are, you, listen...|        4083|[listening, liste...|           1786|
|  Anne Hathaway|  Brokeback Mountain|Shit. You pair of...|[shit, you, pair,...|   

# Selecting Only What Really Matters
And adding an ID column

In [71]:
from pyspark.sql.functions import monotonically_increasing_id

sanitized = sanitized.select(monotonically_increasing_id().alias('id')+1, "actor", "movie_name", "sanitized_script")
sanitized.show()

+-------------------------------------------+---------------+--------------------+--------------------+
|(monotonically_increasing_id() AS `id` + 1)|          actor|          movie_name|    sanitized_script|
+-------------------------------------------+---------------+--------------------+--------------------+
|                                          1|      Al Pacino|        The Humbling|[ten, minutes, cu...|
|                                          2|   Adam Sandler|          Mixed Nuts|[group, singing, ...|
|                                          3|      Al Pacino|       Donnie Brasco|[re, saying, thin...|
|                                          4|Anthony Hopkins|            Instinct|[listening, liste...|
|                                          5|  Anne Hathaway|  Brokeback Mountain|[shit, pair, deuc...|
|                                          6|      Al Pacino|       People I Know|[receiver, rattle...|
|                                          7|  Anne Hathaway|Ali

# Creating An Inverted Index

In [73]:
def create_index (row):
    index = {}
    for token in row[3]:
        if row[0] not in index.get(token, []):
            if index.get(token):
                index[token].append(row[0])
            else:
                index[token] = [row[0]]
    return index

indexes_per_doc = sanitized.rdd.map(create_index)
print(f"outputs a list of dicts, of the following form: \n "
          "[{'tell': [0], 'hawaii': [0], 'unbelievable': [0], 'oh': [0], 'yeah': [0], 'well': [0]},"
              "'happened': [1], 'met': [1], 'guy': [1], 'best': [1], 'week': [1], 'life': [1]...")

outputs a list of dicts, of the following form: 
 [{'tell': [0], 'hawaii': [0], 'unbelievable': [0], 'oh': [0], 'yeah': [0], 'well': [0]},'happened': [1], 'met': [1], 'guy': [1], 'best': [1], 'week': [1], 'life': [1]...


In [74]:
flattened = indexes_per_doc.flatMap(lambda doc: (doc.items()))
flattened.take(10)

[('ten', [1]),
 ('minutes', [1]),
 ('curtain', [1]),
 ('world', [1]),
 ('stage', [1]),
 ('men', [1]),
 ('women', [1]),
 ('merely', [1]),
 ('players', [1]),
 ('exits', [1])]

In [75]:
inverted_index = flattened.reduceByKey(lambda a, b: a+b)
inverted_index.toDF(["token", "docs"]).show()

+----------+--------------------+
|     token|                docs|
+----------+--------------------+
|   curtain|[1, 8589934595, 8...|
|     stage|[1, 12, 22, 23, 2...|
|     women|[1, 2, 3, 6, 8, 9...|
|     plays|[1, 6, 7, 10, 13,...|
|       yes|[1, 2, 3, 4, 5, 6...|
|      wasn|[1, 2, 3, 4, 5, 6...|
|    saying|[1, 2, 3, 6, 7, 8...|
|      mean|[1, 2, 3, 4, 5, 6...|
|      ever|[1, 2, 3, 4, 5, 6...|
|   existed|[1, 10, 858993459...|
|committing|[1, 17179869216, ...|
|      know|[1, 2, 3, 4, 5, 6...|
|       let|[1, 2, 3, 4, 5, 6...|
|      love|[1, 2, 3, 4, 5, 6...|
|      like|[1, 2, 3, 4, 5, 6...|
|      open|[1, 2, 3, 4, 5, 6...|
|      didn|[1, 2, 3, 4, 5, 6...|
|     axler|                 [1]|
|   dressed|[1, 2, 7, 8, 10, ...|
|     three|[1, 2, 3, 4, 5, 6...|
+----------+--------------------+
only showing top 20 rows



# TF
### Term Frequency

In [10]:
all_tokens = inverted_index.toDF(["token", "docs"]).select("token")
all_tokens.show()

+----------+
|     token|
+----------+
|   curtain|
|     stage|
|     women|
|     plays|
|       yes|
|      wasn|
|    saying|
|      mean|
|      ever|
|   existed|
|committing|
|      know|
|       let|
|      love|
|      like|
|      open|
|      didn|
|     axler|
|   dressed|
|     three|
+----------+
only showing top 20 rows



In [11]:
def gather_tf(data):
    tf = {}
    for term in data.sanitized_script:
        tf[term] = data.sanitized_script.count(term)
    return (data.id, tf)

tf = sanitized.rdd.map(gather_tf)
tf.take(1)

[(0,
  {'ten': 4,
   'minutes': 8,
   'curtain': 1,
   'world': 5,
   'stage': 15,
   'men': 7,
   'women': 8,
   'merely': 2,
   'players': 4,
   'exits': 3,
   'entrances': 3,
   'one': 22,
   'man': 20,
   'time': 26,
   'plays': 3,
   'many': 4,
   'parts': 2,
   'acts': 2,
   'seven': 5,
   'ages': 4,
   'believe': 15,
   'real': 13,
   'honest': 5,
   'yes': 41,
   'wasn': 9,
   'well': 63,
   'good': 21,
   'try': 10,
   'better': 2,
   'affecting': 1,
   'really': 24,
   'saying': 17,
   'life': 21,
   'mean': 54,
   'anything': 10,
   'anyone': 2,
   'ever': 12,
   'existed': 2,
   'fooling': 1,
   'committing': 1,
   'instincts': 1,
   'supposed': 6,
   'jump': 2,
   'know': 162,
   'jig': 1,
   'say': 25,
   'words': 4,
   'shakespeare': 4,
   'rest': 4,
   'five': 5,
   'face': 6,
   'masks': 1,
   'tragedy': 2,
   'comedy': 1,
   'let': 16,
   'put': 6,
   'together': 3,
   'tonight': 4,
   'buddy': 1,
   'love': 15,
   'first': 7,
   'whining': 2,
   'school': 1,
   'boy'

In [12]:
all_tokens_list = all_tokens.rdd.flatMap(lambda x: x).collect()
all_tokens_list

['curtain',
 'stage',
 'women',
 'plays',
 'yes',
 'wasn',
 'saying',
 'mean',
 'ever',
 'existed',
 'committing',
 'know',
 'let',
 'love',
 'like',
 'open',
 'didn',
 'axler',
 'dressed',
 'three',
 'living',
 'heartily',
 'came',
 'hither',
 'eventful',
 'oblivion',
 'teeth',
 'everything',
 'genius',
 'subtle',
 'ready',
 'possible',
 '16',
 'family',
 'maybe',
 'always',
 'gun',
 'person',
 'must',
 'kind',
 'pretty',
 'nice',
 'hate',
 'nights',
 'everywhere',
 'fall',
 'gift',
 'recede',
 'lose',
 'ago',
 'start',
 'listening',
 'anymore',
 'happening',
 'even',
 'mind',
 'sure',
 'two',
 'free',
 'east',
 'league',
 'drove',
 'head',
 'cartoons',
 'probably',
 'make',
 'therapy',
 'says',
 'burning',
 'ask',
 'psychiatric',
 'd',
 'job',
 'willing',
 'pay',
 'sybil',
 'neighbors',
 'socially',
 'yummy',
 'instructions',
 'worry',
 'habit',
 'pegeen',
 '40',
 'space',
 'seen',
 'kills',
 'live',
 'westcott',
 'scenic',
 'precisely',
 'co',
 'played',
 'heard',
 '000',
 'away',
 

In [13]:
all_tokens_bc = sc.broadcast(all_tokens_list)

In [49]:
def map_to_huge_tuple(row):
    word_count_list = [('doc_id', [row[0]])]
    for token in all_tokens_bc.value:
        word_count_list.append((token, [row[1].get(token,0)]))
    return word_count_list


In [50]:
full_vocabulary_tf_per_doc = tf.map(map_to_huge_tuple)
full_vocabulary_tf_per_doc.take(2)

[[('doc_id', [0]),
  ('curtain', [1]),
  ('stage', [15]),
  ('women', [8]),
  ('plays', [3]),
  ('yes', [41]),
  ('wasn', [9]),
  ('saying', [17]),
  ('mean', [54]),
  ('ever', [12]),
  ('existed', [2]),
  ('committing', [1]),
  ('know', [162]),
  ('let', [16]),
  ('love', [15]),
  ('like', [68]),
  ('open', [3]),
  ('didn', [13]),
  ('axler', [16]),
  ('dressed', [3]),
  ('three', [6]),
  ('living', [3]),
  ('heartily', [1]),
  ('came', [3]),
  ('hither', [1]),
  ('eventful', [1]),
  ('oblivion', [1]),
  ('teeth', [1]),
  ('everything', [10]),
  ('genius', [4]),
  ('subtle', [1]),
  ('ready', [7]),
  ('possible', [3]),
  ('16', [4]),
  ('family', [4]),
  ('maybe', [14]),
  ('always', [9]),
  ('gun', [4]),
  ('person', [4]),
  ('must', [3]),
  ('kind', [9]),
  ('pretty', [3]),
  ('nice', [8]),
  ('hate', [1]),
  ('nights', [1]),
  ('everywhere', [1]),
  ('fall', [2]),
  ('gift', [1]),
  ('recede', [2]),
  ('lose', [1]),
  ('ago', [2]),
  ('start', [2]),
  ('listening', [1]),
  ('anymor

In [51]:
full_vocabulary_tf_per_doc = full_vocabulary_tf_per_doc.flatMap(lambda x: x)

In [59]:
doc_ids = full_vocabulary_tf_per_doc.filter(lambda x: x[0] == 'doc_id').reduceByKey(lambda a, b: a + b).collect()
doc_ids_list = doc_ids[0][1]
doc_ids_list = [str(i) for i in doc_ids_list]

In [31]:
tf_rdd = full_vocabulary_tf_per_doc.reduceByKey(lambda a, b: a + b).map(lambda x: (x[0], *x[1]))

In [63]:
tf_table = tf_rdd.toDF(['token'] + doc_ids_list)

In [64]:
tf_table.take(2)

[Row(token='curtain', 0=1, 1=0, 2=0, 3=0, 4=0, 5=0, 6=0, 7=0, 8=0, 9=0, 10=0, 11=0, 12=0, 13=0, 14=0, 15=0, 16=0, 17=0, 18=0, 19=0, 20=0, 21=0, 22=0, 23=0, 24=0, 25=0, 26=0, 27=0, 28=0, 29=0, 30=0, 31=0, 32=0, 33=0, 34=0, 35=0, 36=0, 8589934592=0, 8589934593=0, 8589934594=1, 8589934595=0, 8589934596=0, 8589934597=0, 8589934598=0, 8589934599=0, 8589934600=0, 8589934601=0, 8589934602=0, 8589934603=0, 8589934604=0, 8589934605=0, 8589934606=0, 8589934607=0, 8589934608=0, 8589934609=0, 8589934610=0, 8589934611=0, 8589934612=0, 8589934613=0, 8589934614=0, 8589934615=8, 8589934616=0, 8589934617=0, 8589934618=0, 8589934619=0, 8589934620=0, 8589934621=0, 8589934622=0, 8589934623=0, 8589934624=0, 8589934625=0, 8589934626=0, 8589934627=0, 8589934628=0, 17179869184=0, 17179869185=0, 17179869186=0, 17179869187=0, 17179869188=0, 17179869189=0, 17179869190=0, 17179869191=0, 17179869192=0, 17179869193=0, 17179869194=0, 17179869195=0, 17179869196=0, 17179869197=0, 17179869198=0, 17179869199=1, 17179869

# IDF

In [81]:
def counter(row):
    c = 0
    for i in row[1]:
        if i != 0:
            c += 1
    return row[0], c


inverted_index.map(counter).map(lambda x: (x[0], x[1], math.log(226/x[1]))).toDF(["token", "docs_count", "idf"]).show()

+----------+----------+--------------------+
|     token|docs_count|                 idf|
+----------+----------+--------------------+
|   curtain|        11|  3.0226397264739155|
|     stage|        57|  1.3774837314377357|
|     women|       125|  0.5922212619699848|
|     plays|        49|  1.5287147011616593|
|       yes|       220|0.026907452919924402|
|      wasn|       201| 0.11723009121321011|
|    saying|       185| 0.20017917419396103|
|      mean|       220|0.026907452919924402|
|      ever|       219|0.031463269455785106|
|   existed|        15|  2.7124847981700757|
|committing|        10|  3.1179499062782403|
|      know|       225|0.004434597067865775|
|       let|       226|                 0.0|
|      love|       217| 0.04063764573182626|
|      like|       224|0.008888947417245994|
|      open|       193| 0.15784481036740028|
|      didn|       218| 0.03603993648319687|
|     axler|         1|   5.420534999272286|
|   dressed|        65|  1.2461477293766487|
|     thre