In [145]:
import itertools
import math
import os
from decimal import Decimal
from operator import add

import pyspark
from pyspark.sql import Row
# Reuse the SparkContext already running in this kernel if there is one,
# otherwise start a new one, so re-running this cell does not fail.
sc = pyspark.SparkContext.getOrCreate()
# SQL entry point bound to that context.
# NOTE(review): presumably this is also what makes `.toDF()` available on
# RDDs in later cells — confirm for the Spark version in use.
sqlContext = pyspark.sql.SQLContext(sc)

# Reading Script Files into a DataFrame

In [158]:
# Directory that holds the movie script files.
# NOTE(review): file names appear to follow "<actor>_<movie>.txt", judging by
# how they are parsed in map_script_to_meta — confirm against the data set.
scripts_directory = '/usr/data/movie_scripts/'
# os.listdir() returns entries in arbitrary order; sort them so the row order
# of everything derived from this list is reproducible across runs.
files_list = sorted(os.listdir(scripts_directory))
# Distribute the file names (not the file contents) as an RDD.
files_list_rdd = sc.parallelize(files_list)

In [165]:
def map_script_to_meta(filename):
    """Turn one script file name into zero or one DataFrame rows.

    Designed for ``flatMap``: returning a list lets files that are not
    scripts be dropped by returning an empty list.

    Args:
        filename: Base name of a file inside ``scripts_directory``, expected
            to look like ``"<actor>_<movie>.txt"``.

    Returns:
        A one-element list holding ``Row(actor, movie_name, script)``, or an
        empty list when the file does not have a ``.txt`` extension or its
        name contains no ``_`` separator.

    Raises:
        OSError: If the script file cannot be opened.
        UnicodeDecodeError: If the script is not valid UTF-8.
    """
    # Test the real extension: a substring check would also accept names
    # such as "txt_notes.md" or "movie.txt.bak".
    stem, extension = os.path.splitext(filename)
    if extension != ".txt":
        return []

    # Split on the first underscore only, so a movie title that itself
    # contains "_" stays intact instead of breaking the unpacking.
    actor, separator, movie = stem.partition("_")
    if not separator:
        # Not an "<actor>_<movie>" name; skip it like any other non-script.
        return []

    # Context manager guarantees the handle is closed even if read() fails.
    with open(os.path.join(scripts_directory, filename), 'r', encoding='utf8') as script_file:
        script = script_file.read()
    return [Row(actor=actor, movie_name=movie, script=script)]

# Build one row per script file from the RDD of file names.
# `files_list_rdd` is the RDD created from the directory listing; a bare
# `rdd` is not defined anywhere in the visible notebook, so the previous
# version failed with a NameError on a fresh kernel.
scripts_df = files_list_rdd.flatMap(map_script_to_meta).toDF()
scripts_df.show()

+------------+--------------------+--------------------+
|       actor|          movie_name|              script|
+------------+--------------------+--------------------+
|Adam Sandler|      50 First Dates|So tell me. How w...|
|Adam Sandler|            Airheads|Oh, yeah! That wa...|
|Adam Sandler|    Anger Management|<font color=orang...|
|Adam Sandler|     Bedtime Stories|[man] I'm going t...|
|Adam Sandler|           Big Daddy|- Hello? - Sonny,...|
|Adam Sandler|       Billy Madison|~ Suntan lotion i...|
|Adam Sandler|             Blended|Yes. Uh-huh. I'm ...|
|Adam Sandler|         Bulletproof|I believe in Amer...|
|Adam Sandler|               Click|Advertise your pr...|
|Adam Sandler|           Coneheads|Do you copy? - Ro...|
|Adam Sandler|          Dirty Work|Hand over the mil...|
|Adam Sandler|  Eight Crazy Nights|Well, all right. ...|
|Adam Sandler|        Funny People|Hello, ladies. So...|
|Adam Sandler|         Grown Ups 2|<font color="#D90...|
|Adam Sandler|           Grown 

# Character Sanitization and Tokenization

In [182]:
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Plain whitespace tokenizer. It is not used by the pipeline below; the name
# is kept only so any other cell that references `tokenizer` keeps working.
tokenizer = Tokenizer(inputCol="script", outputCol="tokenized_script")

# Splits on runs of non-word characters, which strips punctuation and markup
# symbols and lowercases the tokens (RegexTokenizer defaults).
# NOTE(review): `regexTokenizer` was used here without ever being defined; the
# pattern is inferred from the displayed output ("[man] I'm going" ->
# [man, i, m, going]) — confirm it matches the original intent.
regexTokenizer = RegexTokenizer(inputCol="script", outputCol="tokenized_script", pattern="\\W+")

# Number of tokens in an array column; reused for later token columns.
count_tokens = udf(lambda words: len(words), IntegerType())

# The earlier whitespace-tokenized DataFrame was overwritten immediately and
# never displayed, so only the regex-tokenized result is computed.
regexTokenized = regexTokenizer.transform(scripts_df)
tokenized = (
    regexTokenized
    .select("actor", "movie_name", "script", "tokenized_script")
    .withColumn("tokens_count", count_tokens(col("tokenized_script")))
)
tokenized.show()

+------------+--------------------+--------------------+--------------------+------------+
|       actor|          movie_name|              script|    tokenized_script|tokens_count|
+------------+--------------------+--------------------+--------------------+------------+
|Adam Sandler|      50 First Dates|So tell me. How w...|[so, tell, me, ho...|        9853|
|Adam Sandler|            Airheads|Oh, yeah! That wa...|[oh, yeah, that, ...|       10230|
|Adam Sandler|    Anger Management|<font color=orang...|[font, color, ora...|       12020|
|Adam Sandler|     Bedtime Stories|[man] I'm going t...|[man, i, m, going...|       12265|
|Adam Sandler|           Big Daddy|- Hello? - Sonny,...|[hello, sonny, it...|       10934|
|Adam Sandler|       Billy Madison|~ Suntan lotion i...|[suntan, lotion, ...|        8081|
|Adam Sandler|             Blended|Yes. Uh-huh. I'm ...|[yes, uh, huh, i,...|       12809|
|Adam Sandler|         Bulletproof|I believe in Amer...|[i, believe, in, ...|        7295|

# Stop-Word Removal

In [183]:
from pyspark.ml.feature import StopWordsRemover

# Filter the tokens through StopWordsRemover's default word list, writing the
# surviving tokens to a new `sanitized_script` column.
stop_word_remover = StopWordsRemover(
    inputCol="tokenized_script",
    outputCol="sanitized_script",
)

# Keep all columns produced so far and add the token count after filtering,
# so it can be compared with `tokens_count`.
sanitized = (
    stop_word_remover
    .transform(tokenized)
    .select(
        "actor",
        "movie_name",
        "script",
        "tokenized_script",
        "tokens_count",
        "sanitized_script",
    )
    .withColumn("sanitized_count", count_tokens(col("sanitized_script")))
)
sanitized.show()

+------------+--------------------+--------------------+--------------------+------------+--------------------+---------------+
|       actor|          movie_name|              script|    tokenized_script|tokens_count|    sanitized_script|sanitized_count|
+------------+--------------------+--------------------+--------------------+------------+--------------------+---------------+
|Adam Sandler|      50 First Dates|So tell me. How w...|[so, tell, me, ho...|        9853|[tell, hawaii, un...|           4650|
|Adam Sandler|            Airheads|Oh, yeah! That wa...|[oh, yeah, that, ...|       10230|[oh, yeah, pinhea...|           5013|
|Adam Sandler|    Anger Management|<font color=orang...|[font, color, ora...|       12020|[font, color, ora...|           5941|
|Adam Sandler|     Bedtime Stories|[man] I'm going t...|[man, i, m, going...|       12265|[man, m, going, t...|           6368|
|Adam Sandler|           Big Daddy|- Hello? - Sonny,...|[hello, sonny, it...|       10934|[hello, sonny,

# Selecting Only What Really Matters

In [184]:
# Narrow the DataFrame to the identifying columns plus the cleaned tokens;
# every other column is dropped from here on.
sanitized = sanitized.select(["actor", "movie_name", "sanitized_script"])
sanitized.show()

+------------+--------------------+--------------------+
|       actor|          movie_name|    sanitized_script|
+------------+--------------------+--------------------+
|Adam Sandler|      50 First Dates|[tell, hawaii, un...|
|Adam Sandler|            Airheads|[oh, yeah, pinhea...|
|Adam Sandler|    Anger Management|[font, color, ora...|
|Adam Sandler|     Bedtime Stories|[man, m, going, t...|
|Adam Sandler|           Big Daddy|[hello, sonny, da...|
|Adam Sandler|       Billy Madison|[suntan, lotion, ...|
|Adam Sandler|             Blended|[yes, uh, huh, m,...|
|Adam Sandler|         Bulletproof|[believe, america...|
|Adam Sandler|               Click|[advertise, produ...|
|Adam Sandler|           Coneheads|[copy, roger, cap...|
|Adam Sandler|          Dirty Work|[hand, milk, mone...|
|Adam Sandler|  Eight Crazy Nights|[well, right, loo...|
|Adam Sandler|        Funny People|[hello, ladies, l...|
|Adam Sandler|         Grown Ups 2|[font, color, d90...|
|Adam Sandler|           Grown 