In [268]:
import itertools
import math
import os
from collections import Counter
from decimal import Decimal
from operator import add

import pyspark
from pyspark.sql import Row
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = pyspark.SparkContext.getOrCreate()
# SQL entry point layered on the same context.
# NOTE(review): `sqlContext` is not referenced by name in the cells visible here —
# presumably it is what makes the DataFrame API (`rdd.toDF()`) usable; confirm before removing.
sqlContext = pyspark.sql.SQLContext(sc)

# File Reading into DataFrame

In [269]:
# Directory holding one "<actor>_<movie>.txt" script per file.
scripts_directory = '/usr/data/movie_scripts/'
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and anything derived from it, such as generated IDs) stable between runs.
files_list = sorted(os.listdir(scripts_directory))
# Distribute only the file names; the file contents are read later, per name.
files_list_rdd = sc.parallelize(files_list)

In [270]:
def map_script_to_meta(filename):
    """Convert one script file name into a list of metadata rows.

    Designed for use with ``flatMap``: non-script files yield an empty list and
    are therefore dropped, script files yield exactly one ``Row``.

    Args:
        filename: Base name of a file inside ``scripts_directory``, expected to
            look like ``"<actor>_<movie>.txt"``.

    Returns:
        ``[]`` when ``filename`` does not end in ``.txt``; otherwise a
        one-element list holding ``Row(actor, movie_name, script)``.

    Raises:
        ValueError: If a ``.txt`` file name contains no ``"_"`` separator.
    """
    extension = ".txt"
    # Match the real extension; a substring test would also accept names that
    # merely contain "txt" and then strip four arbitrary characters from them.
    if not filename.endswith(extension):
        return []

    # Split on the first underscore only, so a movie title that itself
    # contains "_" is kept intact instead of breaking the unpacking.
    actor, separator, movie = filename[:-len(extension)].partition("_")
    if not separator:
        raise ValueError(
            f"expected '<actor>_<movie>.txt', got {filename!r}"
        )

    # Context manager guarantees the handle is closed even if read() fails.
    with open(os.path.join(scripts_directory, filename), 'r', encoding='utf8') as script_file:
        script = script_file.read()
    return [Row(actor=actor, movie_name=movie, script=script)]

# Build the DataFrame from the RDD of file names created above; the bare name
# `rdd` is never defined in this notebook and fails on a fresh kernel.
scripts_df = files_list_rdd.flatMap(map_script_to_meta).toDF()
scripts_df.show()

+------------+--------------------+--------------------+
|       actor|          movie_name|              script|
+------------+--------------------+--------------------+
|Adam Sandler|      50 First Dates|So tell me. How w...|
|Adam Sandler|            Airheads|Oh, yeah! That wa...|
|Adam Sandler|    Anger Management|<font color=orang...|
|Adam Sandler|     Bedtime Stories|[man] I'm going t...|
|Adam Sandler|           Big Daddy|- Hello? - Sonny,...|
|Adam Sandler|       Billy Madison|~ Suntan lotion i...|
|Adam Sandler|             Blended|Yes. Uh-huh. I'm ...|
|Adam Sandler|         Bulletproof|I believe in Amer...|
|Adam Sandler|               Click|Advertise your pr...|
|Adam Sandler|           Coneheads|Do you copy? - Ro...|
|Adam Sandler|          Dirty Work|Hand over the mil...|
|Adam Sandler|  Eight Crazy Nights|Well, all right. ...|
|Adam Sandler|        Funny People|Hello, ladies. So...|
|Adam Sandler|         Grown Ups 2|<font color="#D90...|
|Adam Sandler|           Grown 

# Character Sanitization and Tokenization

In [271]:
from pyspark.ml.feature import Tokenizer,  RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Plain tokenizer, kept so the name stays defined; it is not used below.
tokenizer = Tokenizer(inputCol="script", outputCol="tokenized_script")
# Split on every non-word character so punctuation and markup are dropped.
# NOTE(review): the pattern is inferred from the recorded output of this cell
# (e.g. "[man] I'm going" -> [man, i, m, going]) — confirm it matches the original run.
regexTokenizer = RegexTokenizer(inputCol="script", outputCol="tokenized_script", pattern="\\W")
# Number of elements in an array column.
count_tokens = udf(lambda words: len(words), IntegerType())
regexTokenized = regexTokenizer.transform(scripts_df)
tokenized = regexTokenized.select("actor", "movie_name", "script", "tokenized_script").withColumn("tokens_count", count_tokens(col("tokenized_script")))
tokenized.show()

+------------+--------------------+--------------------+--------------------+------------+
|       actor|          movie_name|              script|    tokenized_script|tokens_count|
+------------+--------------------+--------------------+--------------------+------------+
|Adam Sandler|      50 First Dates|So tell me. How w...|[so, tell, me, ho...|        9853|
|Adam Sandler|            Airheads|Oh, yeah! That wa...|[oh, yeah, that, ...|       10230|
|Adam Sandler|    Anger Management|<font color=orang...|[font, color, ora...|       12020|
|Adam Sandler|     Bedtime Stories|[man] I'm going t...|[man, i, m, going...|       12265|
|Adam Sandler|           Big Daddy|- Hello? - Sonny,...|[hello, sonny, it...|       10934|
|Adam Sandler|       Billy Madison|~ Suntan lotion i...|[suntan, lotion, ...|        8081|
|Adam Sandler|             Blended|Yes. Uh-huh. I'm ...|[yes, uh, huh, i,...|       12809|
|Adam Sandler|         Bulletproof|I believe in Amer...|[i, believe, in, ...|        7295|

# Stop-Word Removal

In [272]:
from pyspark.ml.feature import StopWordsRemover

# Drops stop words from the token arrays and writes the result to a new column.
stop_word_remover = StopWordsRemover(inputCol="tokenized_script", outputCol="sanitized_script")

# Keep every column produced so far and add the size of the filtered token list.
sanitized = (
    stop_word_remover.transform(tokenized)
    .select(
        "actor",
        "movie_name",
        "script",
        "tokenized_script",
        "tokens_count",
        "sanitized_script",
    )
    .withColumn("sanitized_count", count_tokens(col("sanitized_script")))
)
sanitized.show()

+------------+--------------------+--------------------+--------------------+------------+--------------------+---------------+
|       actor|          movie_name|              script|    tokenized_script|tokens_count|    sanitized_script|sanitized_count|
+------------+--------------------+--------------------+--------------------+------------+--------------------+---------------+
|Adam Sandler|      50 First Dates|So tell me. How w...|[so, tell, me, ho...|        9853|[tell, hawaii, un...|           4650|
|Adam Sandler|            Airheads|Oh, yeah! That wa...|[oh, yeah, that, ...|       10230|[oh, yeah, pinhea...|           5013|
|Adam Sandler|    Anger Management|<font color=orang...|[font, color, ora...|       12020|[font, color, ora...|           5941|
|Adam Sandler|     Bedtime Stories|[man] I'm going t...|[man, i, m, going...|       12265|[man, m, going, t...|           6368|
|Adam Sandler|           Big Daddy|- Hello? - Sonny,...|[hello, sonny, it...|       10934|[hello, sonny,

# Selecting Only What Really Matters
And adding an ID column (the generated IDs are unique per document, but not guaranteed to be consecutive)

In [290]:
from pyspark.sql.functions import monotonically_increasing_id

# Column 0 becomes the document ID; the IDs are unique but not consecutive
# (the inverted-index output further down shows values such as 25769803824).
kept_columns = [
    monotonically_increasing_id().alias('id'),
    "actor",
    "movie_name",
    "sanitized_script",
]
sanitized = sanitized.select(*kept_columns)
sanitized.show()

+---+------------+--------------------+--------------------+
| id|       actor|          movie_name|    sanitized_script|
+---+------------+--------------------+--------------------+
|  0|Adam Sandler|      50 First Dates|[tell, hawaii, un...|
|  1|Adam Sandler|            Airheads|[oh, yeah, pinhea...|
|  2|Adam Sandler|    Anger Management|[font, color, ora...|
|  3|Adam Sandler|     Bedtime Stories|[man, m, going, t...|
|  4|Adam Sandler|           Big Daddy|[hello, sonny, da...|
|  5|Adam Sandler|       Billy Madison|[suntan, lotion, ...|
|  6|Adam Sandler|             Blended|[yes, uh, huh, m,...|
|  7|Adam Sandler|         Bulletproof|[believe, america...|
|  8|Adam Sandler|               Click|[advertise, produ...|
|  9|Adam Sandler|           Coneheads|[copy, roger, cap...|
| 10|Adam Sandler|          Dirty Work|[hand, milk, mone...|
| 11|Adam Sandler|  Eight Crazy Nights|[well, right, loo...|
| 12|Adam Sandler|        Funny People|[hello, ladies, l...|
| 13|Adam Sandler|      

# Creating An Inverted Index

In [313]:
def create_index(row):
    """Build the inverted-index fragment contributed by a single document.

    Args:
        row: Positional record whose element 0 is the document ID and whose
            element 3 is an iterable of tokens.

    Returns:
        A dict mapping every distinct token of the document to a fresh
        one-element list ``[doc_id]``, with keys in order of first occurrence.
    """
    doc_id = row[0]
    # Within one document a token can only ever map to [doc_id], so repeated
    # tokens simply overwrite their own entry; no membership test or append
    # branch is needed.
    return {token: [doc_id] for token in row[3]}

# One token -> [doc_id] dict per document.
indexes_per_doc = sanitized.rdd.map(create_index)

# Hand-written illustration of the result shape, not computed from the data.
# NOTE(review): the sample closes the first dict after 'well' and never opens
# a second one, so its brackets are unbalanced.
sample_entries = (
    "[{'tell': [0],",
    "'hawaii': [0],",
    "'unbelievable': [0],",
    "'oh': [0],",
    "'yeah': [0],",
    "'well': [0]},",
    "'happened': [1],",
    "'met': [1],",
    "'guy': [1],",
    "'best': [1],",
    "'week': [1],",
    "'life': [1],",
)
print("outputs a list of dicts, of the following form: \n " + "".join(sample_entries))

outputs a list of dicts, of the following form: 
 [{'tell': [0],'hawaii': [0],'unbelievable': [0],'oh': [0],'yeah': [0],'well': [0]},'happened': [1],'met': [1],'guy': [1],'best': [1],'week': [1],'life': [1],


In [306]:
def _index_entries(doc_index):
    """Return the (token, [doc_id]) pairs of one per-document index."""
    return doc_index.items()

# Flatten the per-document dicts into a single RDD of (token, [doc_id]) pairs.
flattened = indexes_per_doc.flatMap(_index_entries)
flattened.take(10)

[('tell', [0]),
 ('hawaii', [0]),
 ('unbelievable', [0]),
 ('oh', [0]),
 ('yeah', [0]),
 ('well', [0]),
 ('happened', [0]),
 ('met', [0]),
 ('guy', [0]),
 ('best', [0])]

In [308]:
# Merge the posting lists of each token by list concatenation
# (operator.add on two lists is the same as `a + b`).
inverted_index = flattened.reduceByKey(add)
inverted_index_df = inverted_index.toDF(["token", "docs"])
inverted_index_df.show()

+----------+--------------------+
|     token|                docs|
+----------+--------------------+
|      tell|[0, 1, 2, 3, 4, 5...|
|  happened|[0, 1, 2, 3, 4, 5...|
|      best|[0, 1, 2, 3, 4, 5...|
|snorkeling|                 [0]|
|       got|[0, 1, 2, 3, 4, 5...|
|    danced|[0, 11, 40, 17179...|
|      rain|[0, 1, 3, 4, 6, 1...|
|      wasn|[0, 2, 3, 4, 5, 6...|
|   pounded|    [0, 25769803824]|
|      like|[0, 1, 2, 3, 4, 5...|
|   mallard|                 [0]|
|      duck|[0, 3, 5, 8, 10, ...|
|   married|[0, 1, 4, 6, 7, 8...|
|   believe|[0, 1, 2, 3, 4, 5...|
|    phones|[0, 1, 3, 13, 14,...|
|      away|[0, 1, 2, 3, 4, 5...|
|     fling|[0, 17, 20, 31, 5...|
|     agent|[0, 14, 19, 22, 2...|
|    couldn|[0, 1, 2, 3, 4, 5...|
|  sleeping|[0, 2, 3, 4, 6, 7...|
+----------+--------------------+
only showing top 20 rows



# TF
### Term Frequency

In [192]:
def invert_doc(tokens_list):
    """Count how many times each token occurs in one document.

    Args:
        tokens_list: Iterable of hashable tokens, or ``None`` for a document
            without a token list.

    Returns:
        A plain ``dict`` mapping each token to its number of occurrences,
        with keys in order of first occurrence. Empty for ``None`` or an
        empty input.
    """
    # A missing token list is treated as an empty document instead of raising.
    if tokens_list is None:
        return {}
    # Counter does the tallying; convert back so callers still get a plain dict.
    return dict(Counter(tokens_list))

In [314]:
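# Example usage (left disabled): per-document term counts.
# After the ID-column select above, the token list is column 3 (`sanitized_script`);
# column 2 is `movie_name`, so indexing it would count the characters of the title.
# sanitized.rdd.map(lambda row: row[3]).map(invert_doc).take(2)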