Set up HDFS and Google credentials

In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession


LOCAL_IP = "10.164.0.2"

spark = SparkSession \
    .builder \
    .appName("Test Etienne JOB") \
    .master("spark://10.164.0.2:7077") \
    .config("spark.executor.cores", 2) \
    .config("spark.cores.max", 14) \
    .config("spark.python.worker.memory", "6g") \
    .config("spark.executor.memory", "5g") \
    .config("spark.executorEnv.SPARK_LOCAL_IP", LOCAL_IP) \
    .getOrCreate()

sc = spark.sparkContext
sc

In [2]:
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="./imdb-e9e7ce7a779d.json"
os.environ["HDFSCLI_CONFIG"]="./.hdfscli.cfg"
os.environ["HADOOP_CONF_DIR"]="/opt/hadoop-3.1.0/etc/hadoop"
sc.environment["GOOGLE_APPLICATION_CREDENTIALS"]="/MovieScope-1bf4856cc738.json"

List filenames of reviews from HDFS and parallelize in preparation from processing

Parallelise the reviews and use Google NLP API to extract entities and related sentiment.

In [3]:
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

import pyspark.sql.functions as F
from pyspark.sql.window import Window as W
from pyspark.sql.types import *
from pyspark.sql import Row
import pyspark.sql.functions as functions
from pyspark.sql.functions import collect_list
from pyspark.sql.functions import collect_set
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

#from pyspark.mllib.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel, RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, HashingTF, IDF, IDFModel, StringIndexer, StringIndexerModel, IndexToString
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from functools import reduce
import re
import numpy as np
from scipy.stats import norm
from math import exp
import pickle
import pandas as pd
import base64

In [4]:
def collectEntities(x, y):
    # The first reduce call doesn't pass a list for x, so we need to check for that.
    if not isinstance(x, list):
        x=[x]
        

    xd = dict(x)
    #print(xd)
    
    if not isinstance(y, list):
        y = [y]
        
    for ye in y:
        if ye[0] in xd:
            try:
                xd[ye[0]] = (xd[ye[0]]+ye[1])/2
            except:
                Null
        else:
            xd[ye[0]] = ye[1]
    
    return [o for o in xd.items()]
        

In [5]:
orientation = "pos"
collection="reviews"
urlsCollection="train"

Load genre information from file (previously collected using IMDB API)

In [6]:


def decodeGenre(x):
    try: 
        g = pickle.loads(base64.b64decode(x[2:-1]), encoding="bytes") 
        if (len(g)==0):
            return ["NA"]
        else:
            return g
    except:
        return ["NA"]    
        

def loadGenres(urlsCollection, orientation):
    genres = pd.read_csv("Data/genres_"+urlsCollection+"_urls_"+orientation+".csv", sep="\t", index_col=0, usecols=[1, 2, 3])
    genres = genres.fillna(value="b''")
    genres["GENRE"] = genres["GENRE"].apply(decodeGenre) 

    schema = StructType([
        StructField("FILM_ID", IntegerType(), True),
        StructField("GENRE", ArrayType(StringType(), containsNull=True), True)])

    genres_df = spark.createDataFrame(genres, schema)

    from pyspark.sql.functions import monotonically_increasing_id

    # This will return a new DF with all the columns + id
    genres_df = genres_df.withColumn("ID_TEMP", monotonically_increasing_id())#.limit(10)

    genres_df = genres_df.withColumn("ID",F.row_number().over(W.orderBy("ID_TEMP"))).select(["FILM_ID", "GENRE", "ID"])#.limit(10)
    
    return genres_df


In [7]:
#from pyspark.mllib.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.sql.functions import udf

def sparse2dense(sp):
    return DenseVector(sp)




    
def separateGenres3(rec):
    print(rec)
    return [[genre, e, s] for (e, s) in rec.ENTITY_SENTIMENT for genre in rec.GENRE]

def prepareDataset(collection, orientation, urls):
    # Make sure we don't trigger Google Cloud API again
    entity_documents_info = spark.read.parquet("hdfs://spark-master:8020/user/lmrd/"+collection+"/"+orientation+"_doc_info.pq")
    print("emtity_documents_info")
    entity_documents_info.show(5)

    genres_df = loadGenres(urls, orientation)
    print("genres_df")
    genres_df.show(5)
    
    entity_documents_info = entity_documents_info.alias("df1").join(genres_df.alias("df2"), entity_documents_info.ID == genres_df.ID)#.select(["df1.*", "df2.FILM_ID", "df2.GENRE"])
    print("entity_documents_info")
    entity_documents_info.show(5)
    
    grouped_entities = entity_documents_info.rdd.flatMap(separateGenres3)
    grouped_entities.repartition(5)
    print("grouped_entities")
    print(grouped_entities.take(5))
    
    grouped_entities_df = spark.createDataFrame(data=grouped_entities, schema=["genre", "entity", "sentiment"])
    grouped_entities_df.cache()
    print("grouped_entites_df")
    grouped_entities_df.show()
    
    grouped_entity_words = grouped_entities_df.select(["genre", "entity"]).groupBy("genre").agg(collect_list("entity").alias("entities"))
    print("grouped_entity_words")
    grouped_entity_words.show(5)
    
    return grouped_entity_words

def extractTFIDFDataframeAndModel(collection, orientation, urls):

    grouped_entity_words = prepareDataset(collection, orientation, urls)
    
    # Create the dictionary
    countVec = CountVectorizer(inputCol="entities", outputCol="tf", minTF=0.01, minDF=1)
    #countVec = HashingTF(numFeatures=1024, inputCol="entities", outputCol="tf")
    idf = IDF(inputCol="tf", outputCol="tfidf")
    si = StringIndexer(inputCol="genre", outputCol="genreId")#, handleInvalid="keep")
    #nb = NaiveBayes(featuresCol="tfidf", labelCol="genreId", predictionCol="predictGenreId")
    rf = RandomForestClassifier(featuresCol="tfidf", labelCol="genreId", predictionCol="predictGenreId")
    #isModel = IndexToString(inputCol=nb.getPredictionCol(), outputCol="predictGenre")
    
#    grouped_entity_words = si.fit(grouped_entity_words).transform(grouped_entity_words)
    #pipeline = Pipeline(stages=[countVec, idf, si, nb])
    pipeline = Pipeline(stages=[countVec, idf, si, rf])
    
    
#    paramGrid = ParamGridBuilder() \
#        .addGrid(countVec.minTF, [1.0]) \
#        .addGrid(countVec.minDF, [1.0]) \
#        .build()
    
#    crossval = CrossValidator(estimator=pipeline,
#                  estimatorParamMaps=paramGrid,
#                  evaluator=MulticlassClassificationEvaluator(predictionCol="predictGenreId", labelCol="genreId", metricName="accuracy"),
#                  numFolds=3, parallelism=5)  # use 3+ folds in practice
    
#    cvModel = crossval.fit(grouped_entity_words)
    cvModel = pipeline.fit(grouped_entity_words)
    
    dft = cvModel.transform(grouped_entity_words)
    
    eval = MulticlassClassificationEvaluator(predictionCol="predictGenreId", labelCol="genreId", metricName="accuracy")
    print("f1-score: ", eval.evaluate(dft))
    
    
    return cvModel

cvModel = extractTFIDFDataframeAndModel(collection, orientation, urlsCollection)

#(tfidf, cvmodel, idf, siModel, isModel) = extractTFIDFDataframe(collection, orientation, urlsCollection)
#tfidf.show(5)


emtity_documents_info
+-----+--------------------+
|   ID|    ENTITY_SENTIMENT|
+-----+--------------------+
|10037|[[action, 0.01000...|
|10038|[[actors, 0.64000...|
|10039|[[achievement, 0....|
|10040|[[acting, 0.80999...|
| 1004|[[american, 0.0],...|
+-----+--------------------+
only showing top 5 rows

genres_df
+-------+-------------------+---+
|FILM_ID|              GENRE| ID|
+-------+-------------------+---+
| 453418|[Animation, Comedy]|  1|
| 453418|[Animation, Comedy]|  2|
| 453418|[Animation, Comedy]|  3|
|  64354|           [Comedy]|  4|
|  64354|           [Comedy]|  5|
+-------+-------------------+---+
only showing top 5 rows

entity_documents_info
+---+--------------------+-------+-------------------+---+
| ID|    ENTITY_SENTIMENT|FILM_ID|              GENRE| ID|
+---+--------------------+-------+-------------------+---+
|  1|[[adults, 0.0], [...| 453418|[Animation, Comedy]|  1|
|  2|[[adult comedy ca...| 453418|[Animation, Comedy]|  2|
|  3|[[british, 0.0], ...| 453418|

In [8]:
cvModel.avgMetrics


AttributeError: 'PipelineModel' object has no attribute 'avgMetrics'

In [7]:


def checkSentimentValue(x):
    try:
        f = float(x)
        
        return f
    
    except:
        print("Wrong sentiment value ", f)
        return 0
    
def extractEntitiesSetimentForReview(review_contents):
    # Instantiates a client
    client = language.LanguageServiceClient()
        
    document = types.Document(content = review_contents, 
                             type=enums.Document.Type.PLAIN_TEXT, language="en-US")
    tries=1
    
    while tries < 5:
        try:
            entities = client.analyze_entity_sentiment(document=document, encoding_type="UTF8")
            break
        except:
            f = open("/home/etienne/sparklog.txt", mode="a")

            f.write(""+str(entities)+"\n")
            f.close()
            time.sleep(1)
            
            tries +=1
            
    
    
    # Make sure we have no duplicate entities. If we do, average their sentiment.
    justLetters = re.compile("[^a-z ]")
    response = [o for o in zip([justLetters.sub("", entity.name.lower()) for entity in entities.entities], 
                               [checkSentimentValue(entity.sentiment.score) * checkSentimentValue(entity.sentiment.magnitude) 
                                    for entity in entities.entities])]
    
    
#    response = sorted(response, key=lambda x: x[0])
#    if (len(response)>1):
#        response = reduce(collectEntities, response)
    
            
    return response

In [None]:
def indexToLabel(cvModel, indexes):
    return [cvModel.bestModel.stages[2].labels[index] for index in indexes]

def indexToLabel2(cvModel, indexes):
    return [cvModel.stages[2].labels[index] for index in indexes]

indexToLabel2(cvModel, [0])
#pipeline = cvModel.bestModel.explainParams()  #getEstimator()
#siModel = pipeline.getStages()[2]

#nb = NaiveBayes(featuresCol="tfidf", labelCol="genreId", predictionCol="predictGenreId")

#nb_model = nb.fit(tfidf2)

#print(nb_model.pi)


In [34]:
# Evaluate the model
ds = prepareDataset("test_reviews", orientation, urlsCollection)

ds.show(5)

emtity_documents_info
+-----+--------------------+
|   ID|    ENTITY_SENTIMENT|
+-----+--------------------+
|10172|[[actor, 0.640000...|
|10173|[[academy award f...|
|10174|[[actor, 0.780000...|
|10175|[[attention, 0.0]...|
|10176|[[alex, 0.0], [br...|
+-----+--------------------+
only showing top 5 rows

genres_df
+-------+-------------------+---+
|FILM_ID|              GENRE| ID|
+-------+-------------------+---+
| 453418|[Animation, Comedy]|  1|
| 453418|[Animation, Comedy]|  2|
| 453418|[Animation, Comedy]|  3|
|  64354|           [Comedy]|  4|
|  64354|           [Comedy]|  5|
+-------+-------------------+---+
only showing top 5 rows

entity_documents_info
+---+--------------------+-------+-------------------+---+
| ID|    ENTITY_SENTIMENT|FILM_ID|              GENRE| ID|
+---+--------------------+-------+-------------------+---+
|  1|[[anyone, -0.0400...| 453418|[Animation, Comedy]|  1|
|  2|[[anyone, 0.0], [...| 453418|[Animation, Comedy]|  2|
|  3|[[aa couple, 0.0]...| 453418|

In [82]:
testpreds = cvModel.transform(ds)
#testpreds.take(1)[0].tf

eval = MulticlassClassificationEvaluator(predictionCol="predictGenreId", labelCol="genreId", metricName="accuracy")
print("f1-score: ", eval.evaluate(testpreds))

IllegalArgumentException: 'Field "genreId" does not exist.'

In [23]:


#revText=["I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."]
revText=["As others that have commented around the web... I'm a 130 pilot in the Coast Guard. Having said that, and being the skeptic I am, I went expecting the over-the-top cheese factors. There was some cheese, but all in all, not much.. and the film was pretty accurate.I watched the trailer again today. After seeing the film yesterday, I've realized the trailer gives the impression the movie is nothing but rescue after rescue action scenes. This isn't the case.The movie is truly more character/story driven than action. The inner struggles both Costner and Kutcher are dealing with.. Kutcher's is revealed further into than movie than Costner's is.Of course, there is a minor love story.. no surprise there. But for the most part, the movie tells the tale of two lives that come together, and after some time, help each other heal old wounds.As girlie as it sounds, Costner and, as much as I try not to like him, Kutcher do actually work quite well together and compliment each other very well in the movie.As critics have stated, you've seen it all before.. Top Gun, Officer and a Gentlemen, etc. But what movie hasn't been remade a million times.I can recall only one F word being spoken.. and can't really recall any other language.The movie is 2+ hours, and for some, may tend to get a little long towards the end.You'll laugh, you may cry, but I can honestly say, it was worth the $4 I paid.I hope you enjoy the movie",
         "I work at a movie theater and every Thursday night we have an employee screening of one movie that comes out the next day...Today it was The Guardian. I saw the trailers and the ads and never expected much from it, and in no way really did i anticipate seeing this movie. Well turns out this movie was a lot more than I would have thought. It was a great story first of all. Ashton Kutcher and Kevin Costner did amazing acting work in this film. Being a big fan of That 70's Show I always found it hard thinking of Kutcher as anyone but Kelso despite the great acting he did in The Butterfly Effect, but after seeing this movie I think I might be able to finally look at him as a serious actor.It was also a great tribute to the unsung heroes of the U.S. Coast Guard."]
revTextRdd = sc.parallelize(revText)

#entitiesForTest = revTextRdd.flatMap(extractEntitiesSetimentForReview)
entitiesForTest = revTextRdd.flatMap(extractEntitiesSetimentForReview)

print(entitiesForTest.collect())
schema1 = StructType(
                [StructField("entity", StringType(), False), 
                 StructField("sentiment", FloatType(), False)])

entitiesForTest_df = spark.createDataFrame(entitiesForTest, schema=schema1)

entitiesForTest_df.show()
entitiesForTest2_df = entitiesForTest_df.agg(collect_set('entity').alias('entities')).crossJoin(entitiesForTest_df.agg(functions.('sentiment')).alias("sentiment"))

entitiesForTest2_df.show()



[('action', 0.0), ('all', 0.0), ('case', 0.0), ('character', 0.0), ('cheese', 0.0), ('cheese factors', -0.010000000298023226), ('coast guard', 0.0), ('costner', -0.16000000476837162), ('course', 0.0), ('critics', -0.010000000298023226), ('each other', -0.01500000044703484), ('end', 0.0), ('f word', -0.010000000298023226), ('film', 0.0), ('gentlemen', 0.0), ('impression', 0.0), ('is', -0.010000000298023226), ('kutcher', -0.03000000163912775), ('lives', -0.03000000163912775), ('love story', 0.0), ('movie', -0.005000000149011613), ('movieas', 0.0), ('nothing', 0.0), ('officer', 0.0), ('others', 0.0), ('part', -0.040000001192092904), ('pilot', 0.0), ('rescue', 0.0), ('rescue action scenes', 0.0), ('skeptic', 0.0), ('some', 0.0), ('struggles', -0.4899999833106996), ('surprise', 0.0), ('tale', -0.010000000298023226), ('times', -0.040000001192092904), ('top gun', 0.0), ('trailer', 0.0), ('web', 0.0), ('woundsas', 0.0), ('acting', 0.8099999570846563), ('actorit', 0.6400000190734865), ('ads', 0

In [11]:
entitiesForTest3_df = cvModel.transform(entitiesForTest2_df)

entitiesForTest3_df.show()

indexToLabel2(cvModel, [int(entitiesForTest3_df.select("predictGenreId").take(1)[0].predictGenreId)])

scores = entitiesForTest3_df.select("rawPrediction").collect()[0].rawPrediction
print(scores)

probs = [1-(1.0*s)/np.sum(scores) for s in scores]
#probs = [exp(np.max(scores) - s) for s in scores]
print(probs)
print(np.argsort(scores))
top = np.argsort(scores)[:-4:-1]

print(indexToLabel2(cvModel, top))

+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|            entities|collect_set(sentiment)|                  tf|               tfidf|       rawPrediction|         probability|predictGenreId|
+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|[thinking, employ...|  [0.0, -0.03000000...|(76511,[0,1,3,5,1...|(76511,[0,1,3,5,1...|[-334.14323319175...|[5.02615740634632...|          12.0|
+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------+

[-334.143233191757,-337.65256154337135,-336.7078834755889,-337.4525348331828,-334.34186354813886,-333.46345417628805,-335.5362793877702,-337.0783122678819,-334.3714919319533,-316.41819890263974,-335.4348399731048,-327.8694864871247,-315.32321695586654,-334.86814846859784,-3

In [39]:


entitiesForTest3_df = cvmodel.transform(entitiesForTest2_df)

entitiesForTest3_df = entitiesForTest3_df.withColumn("features", udf_to_DenseVector("tf"))

entitiesForTest3_df = model.transform(entitiesForTest3_df)

entitiesForTest3_df = isModel.transform(entitiesForTest3_df)

print(isModel.getLabels())
scores = entitiesForTest3_df.select("rawPrediction").collect()[0].rawPrediction
print(scores)

probs = [1-(1.0*s)/np.sum(scores) for s in scores]
#probs = [exp(np.max(scores) - s) for s in scores]
print(probs)
print(np.argsort(scores))
entitiesForTest3_df.show()

['Animation', 'Talk-Show', 'Thriller', 'Adult', 'War', 'Horror', 'Documentary', 'NA', 'Biography', 'Comedy', 'Western', 'Fantasy', 'Romance', 'Family', 'Drama', 'Short', 'Sport', 'History', 'Film-Noir', 'Reality-TV', 'Music', 'Mystery', 'Musical', 'Sci-Fi', 'Game-Show', 'Adventure', 'Crime', 'Action', 'News']
[-537.7891618608271,-587.9934842823056,-518.4590082407616,-585.93812581499,-543.6674228395556,-538.8842749128711,-553.6583065705015,-582.2730094151456,-537.4807382920243,-514.2212682051768,-556.3789659243172,-523.747313855792,-513.583636157141,-530.9484400011238,-506.7379617621209,-558.7893495978734,-554.9934586069586,-546.6821482616766,-561.8042696801848,-584.53159521691,-552.8948283999592,-534.1307261361959,-544.3075497937868,-529.3509095570462,-586.2027202433444,-518.7623805869365,-523.9627650786558,-514.3693191457652,-588.0464268898968]
[0.9660284817794389, 0.9628571329036251, 0.9672495451858455, 0.9629869675164696, 0.9656571588445305, 0.9659593047568444, 0.9650260462956444, 0

In [8]:
import matplotlib.pyplot as plt

def solveNorms(m1, m2, std1, std2):
    a = 1/(2*std1**2) - 1/(2*std2**2)
    b = m2/(std2**2) - m1/(std1**2)
    c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2)-np.log(std2/std1)
    return np.roots([a, b, c])

#m1 = 2.5
#std1 = 0.5
#m2 = 5.0
#std2 = 0.5

def getNormOverlap(m1, m2, std1, std2):
    if ((std1==0) or (std2==0)):
        return 1-abs(m1-m2)    
    
    result = solveNorms(m1,m2,std1,std2)

#    print(result)
#    x = np.linspace(-5,9,10000)
#    plot1=plt.plot(x,norm.pdf(x,m1,std1))
#    plot2=plt.plot(x,norm.pdf(x,m2,std2))
#    plot3=plt.plot(result,norm.pdf(result,m1,std1),'o')
#
#    #Plots integrated area
#    r = result[0]
#    olap = plt.fill_between(x[x>r], 0, norm.pdf(x[x>r],m1,std1),alpha=0.3)
#    olap = plt.fill_between(x[x<r], 0, norm.pdf(x[x<r],m2,std2),alpha=0.3)
    # For single samples just take the difference between means

    lower=-10
    upper=10

    # 'lower' and 'upper' represent the lower and upper bounds of the space within which we are computing the overlap
    if(len(result)==0): # Completely non-overlapping 
        overlap = 0.0

    elif(len(result)==1): # One point of contact
        r = result[0]
        if(m1>m2):
            tm,ts=m2,std2
            m2,std2=m1,std1
            m1,std1=tm,ts
        if(r<lower): # point of contact is less than the lower boundary. order: r-l-u
            overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))
        elif(r<upper): # point of contact is more than the upper boundary. order: l-u-r
            overlap = (norm.cdf(r,m2,std2)-norm.cdf(lower,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r,m1,std1))
        else: # point of contact is within the upper and lower boundaries. order: l-r-u
            overlap = (norm.cdf(upper,m2,std2)-norm.cdf(lower,m2,std2))

    elif(len(result)==2): # Two points of contact
        r1 = result[0]
        r2 = result[1]
        if(r1>r2):
            temp=r2
            r2=r1
            r1=temp
        if(std1>std2):
            tm,ts=m2,std2
            m2,std2=m1,std1
            m1,std1=tm,ts
        if(r1<lower):
            if(r2<lower):           # order: r1-r2-l-u
                overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))
            elif(r2<upper):         # order: r1-l-r2-u
                overlap = (norm.cdf(r2,m2,std2)-norm.cdf(lower,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r2,m1,std1))
            else:                   # order: r1-l-u-r2
                overlap = (norm.cdf(upper,m2,std2)-norm.cdf(lower,m2,std2))
        elif(r1<upper): 
            if(r2<upper):         # order: l-r1-r2-u
                #print(norm.cdf(r1,m1,std1), "-", norm.cdf(lower,m1,std1), "+", norm.cdf(r2,m2,std2), "-", norm.cdf(r1,m2,std2), "+", norm.cdf(upper,m1,std1), "-", norm.cdf(r2,m1,std1))
                overlap = (norm.cdf(r1,m1,std1)-norm.cdf(lower,m1,std1))+(norm.cdf(r2,m2,std2)-norm.cdf(r1,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r2,m1,std1))
            else:                   # order: l-r1-u-r2
                overlap = (norm.cdf(r1,m1,std1)-norm.cdf(lower,m1,std1))+(norm.cdf(upper,m2,std2)-norm.cdf(r1,m2,std2))
        else:                       # l-u-r1-r2
            overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))

    return float(overlap)

In [9]:
getNormOverlap(1, 2, 1, 1)

0.6170750774519738

In [10]:
history_df = spark.read.parquet("hdfs://spark-master:8020/user/lmrd/reviews/History_pos_tfidf2.pq")

In [11]:
comedy_df = spark.read.parquet("hdfs://spark-master:8020/user/lmrd/reviews/Comedy_pos_tfidf2.pq")

In [14]:
horror_df = spark.read.parquet("hdfs://spark-master:8020/user/lmrd/reviews/Horror_pos_tfidf3.pq")
horror_neg_df = spark.read.parquet("hdfs://spark-master:8020/user/lmrd/reviews/Horror_neg_tfidf2.pq")

In [15]:
documentary_df = spark.read.parquet("hdfs://spark-master:8020/user/lmrd/reviews/Documentary_pos_tfidf2.pq")
documentary_df.show(5)

+-----------+-----------+------------------+----------+--------------------+-------------------+
|      genre|     entity|             tfidf|vocabIndex|            avg_sent|           std_sent|
+-----------+-----------+------------------+----------+--------------------+-------------------+
|Documentary|     strike|2.7405504149742894|      2811| -0.2283333494948844|0.29559143777024416|
|Documentary|    british| 2.734823351909319|       227|                 0.0|                0.0|
|Documentary|       soul| 2.734823351909319|       350|-0.06199999656528234|0.20214915288686097|
|Documentary|   reaction| 2.734823351909319|       449|-0.12933333485076826|0.24479765296589476|
|Documentary|personality| 2.734823351909319|       286|-0.07199999435494343| 0.2601733520212391|
+-----------+-----------+------------------+----------+--------------------+-------------------+
only showing top 5 rows



In [16]:
revText=["George and Kathy Lutz are looking for a place to anchor down and raise a family The Lutzs and their children Kathys from a previous marriage settle on an impossibly cheap large and beautiful shore house But 28 days later the macabre and scary happenings force them to leaveleaving all their earthly possession behind During the course of those 28 days the family goes through all kinds of hell a room full of flies demonic voices and a pig with glowing red eyes What kind of past does that house have that would make everything horribly wrong Based on a true story  Real or hoax you decide but The Amityville Horror has all the trappings of an excellent haunted house story Too bad that the filmmakers falter a bit with a lack of character development that also ends up stifling the actors in the film that would have helped out immensely A creepy music score the one rejected for The Exorcist and several good setpieces help out but the end of the second act kind of gets stale A good supernatural thriller 48 out of 60 found this helpful Was this review helpful Yes No  Report this  810 Yeah it has problems but I still love it Brandt Sponseller7 April 2005 George James Brolin and Kathleen Lutz Margot Kidder buy a dream house in Amityville New York for a dream price Unfortunately the price was low because just a year before the house was the location of the Ronald DeFeo Jr murdershe killed his entire family while they were sleeping As a priest Father Delaney Rod Steiger blesses the home he realizes with horror that something evil is lingering there The dream house is turning into a nightmare  Sometimes our affection for or aversion to an artwork that weve been exposed to a number of times over the years is inextricably enmeshed with our historical emotional experiences whether we admit this or not For example I strongly dislike soap operas or indeed any dramas that resemble soap operas This is probably due to the fact that for years my only exposure to soap operas was when I was home sick from school as a kid These were the days before cable television and home video In the middle of a weekday afternoon you either watched soap operas or you didnt watch television Subconsciously I associate soap operas with a feeling of illness  Likewise Jay Ansons Amityville Horror novel appeared when I was still a teen I loved it I can still remember reading it in one long sittingsomething I rarely didin the family car as we drove from Florida to Ohio to visit relatives I was excited when the film appeared and liked it a lot at the time  So although I can see many faults with Amityville Horror now I still have a deep affection for it that triggers my brain to go into an apologetic mode and defend the film I just cant bring myself to give it lower than an 8 out of 10 and even that seems low to me But I can easily see how audiences lacking a history with the film might dislike it It is relatively slow uneventful and meanderingwith a modern perspective the pacing and subtlety are reminiscent of some recent Asian horror At the same time maybe paradoxically scenery chewing has only rarely had a greater ally  Just a couple days ago MGM released newly remastered widescreen versions of Amityville 1 2 and 3 I havent seen the film look this good since seeing it in the theater in 1979 and it probably didnt even look this good then The first thing that struck me was how incredible much of the cinematography is Director Stuart Rosenberg had an amazing knack for finding intriguing angles for shots and imbuing them with beautiful colors  Unlike recent trends Rosenbergs colors are not narrowed down to a single scheme For example in some shots such as some of the interiors of the famed Amityville house we get fabulous combinations of pale greens and yellows In others such as many exterior shots near the house we get intense combinations of fall foliage colors There are also a number of beautiful shots of the famed eye window exterior of the house in differently tinted negative colors  Rosenberg evidences a great eye for placing his cast in the frame and shooting scenes to create depth and symbolism via objects that partially block or surround the frame He also has a knack for creating winding receding patterns of objects that enhance depth through perspective My affection for this aspect of the film has little nostalgic attachment as I didnt pay attention to such things as a kid I didnt start noticing them more until I started painting far into my adult years and the positive aspects of the cinematography were hardly discernible on the previous ridiculously bad pan  scan VHS release  Of course most people arent watching a film like this for the aesthetics of the visual composition This is one of the most famous haunted house films after all The horror is handled somewhat awkwardly occasionally absurdly but it still works well enough for me as understated as it is Im not referring to the acting just the horror objects Aspects such as the ubiquitous flies reminded me of similar motifs such as water in Hideo Nakatas horror films such as Ringu 1998 and Dark Water 2002 The beginning of the film showing the Defeo murders still has a lot of shock value despite its relative postTarantino tameness Most of the horror elements are more portentous but theyre regular and interesting enough to hold your attention as long as you dont mind subtlety  Subtlety however was the furthest thing from the casts minds Brolin Kidder and especially Steiger shout their lines more often than they speak them Overacting is not in their vocabularies Kidder comments on an accompanying documentary that the horror genre walks a fine line between intensity and camp That may or may not be true in general but in Amityville Horror camp is frequently broached For me it has a certain charm Im a fan of camp and so bad its good Amityvilles performances often attain both  The commentary on the new DVD is amusing given the 1970s publicity that the book and film depicted a true haunting and the subsequent thorough debunking by persons such as Stephen Kaplan Hans Holzer a parapsychologist who has been involved with the story since the early days and the author of a book upon which Amityville II was based provides the commentary He presents himself as an academic but he obviously seems to have little concern for objectivity or skepticism He not only still talks about the story as true he invents supernatural excuses for the DeFeo murders and then some barely mentioning detractors such as Kaplan  If you havent seen the film yet you should base your viewing decision on whether you have a taste for deliberately paced horror as well as a tolerance for extremely overthetop performances The film is historically important in the genre as well 77 out of 100 found this helpful Was this review helpful Yes No  Report this Bizarre Bad Badly Bizarre Bizarrely Fascinating curtis martin25 October 2004 Warning Spoilers 29 out of 35 found this helpful Was this review helpful Yes No  Report this      610 Nothing great only the eyelike windows were creepy n Brolins performance was good Fellashibby14 May 2017 Saw this on a VHS in the mid 80s Revisited it recently on a DVD To be honest i found the movie to be tedious n tame then Now i jus forwarded some boring scenes The film opens on a dark and stormy night as we hear gunshots and see flashes of light through the homes famous eyelike upstairs windows as an entire family is killed A new family moves in after a year n unsettling things begin to occur Ther are scenes where the walls drip blood Was it blood or tar i dont know Whose blood it was or where it came from was never explained Theres a hidden room in the house the dog always barks at In one scene James Brolin climbs the stairs above that room only to fall through them and into a pit of the same bloodtar Was that suppose to be comedic Also the scene involving Rod Steiger with the flies wasnt scary at all The movie was boring considering the length n nothing happens Somewhere around 0116 Josh Brolin breaks open a wall n his facial expressions n eyes r epic scene man What he sees that makes him so startled we never come to know n we dont get to see also The film is helped by an extremely creepy score composed by Lalo Schifrin n Brolin delivered a good performance Margot Kidder did a decent job 12 out of 13 found this helpful Was this review helpful Yes No  Report this  910 Dated but great seanahalpin17 February 2002 True the special effects arent so special these days True the girl with the braces brings tears of laughter rather than terror But nonetheless this movie remains a creepy gem from my young days Everyone misses the point that the real star of the movie is the house The building is both attractive and sinister  truly gothic in the importance of the setting Whether the story is true or not if you want a movie to snuggle on the couch in the dark eating popcorn feeling the thrill of a ghost story scare this is one for you 58 out of 76 found this helpful Was this review helpful Yes No  Report this  810 Get outindeed SmileysWorld26 April 2002 I was but a timid lad of 14 when taken to a drivein theater to see this incredibly effective horror filmThere is no better monster to create a film around than the Devil himselfWhen Rod Steigers characterFather Delaney is in the process of blessing the houseand was greeted by a resounding shout of Get OutI almost took it literallyit was that effectiveThe Devil is indeed one unwelcome houseguest that is very hard to kick outas you will see when you watch this filmJames Brolin and Margo Kidder head a young family who are the new inhabitants of a home where brutal murders had taken place years beforeSoonstrange happenings begin to haunt the familyas the house has trouble letting go of what had happened thereThis movie is definitely in my top 10 horror films that I have seenand if you enjoy being scared out of your witsthis film will do it for youGive it a look 47 out of 62 found this helpful Was this review helpful Yes No  Report this  810 OLD FASHIONED HORROR richard cavellero16 December 2004 Excited about the remake I decided to go out and just but the original Amityville Horror Being a huge horror buff I just had to and besides I had only seen some of its absurd sequels hearing mixed reviews from friends and critics from terrifying to hilarious I turned it on with my boyfriend at the time and prepared myself for something scary I must say that I was quite impressed And although slightly disappointed in some of the films scenes ultimately I must say this is one old fashioned scary flick I can hugely recognize the appeal it had in its its hey day With the exception of Texas chainsaw Massacre Evil Dead the Omen and some others I rarely see what people did in their horror classics nowadays Like The Exorcist pretty damn boring and funny in my opinion But getting back to this film It builds a creeping mood filled with fright inducing suspense The effects are simple but effective and the performances are somewhat over the top but necessarily wacky The films overall lasting appeal has little to do with the films apparent campiness it has more to do with the real terror inducing legend that inspired it Like the Chainsaw remake the new ones looks to amp up the horror and intensity which would be greatly welcomed Although a great horror classic Amityvilles finale is somewhat anti climactic and after a long and impressively scary buildup it fails to deliver the end goods But whoa some of the scenes from the imaginary friend Jody flying out the window to the visitor at the door to the voice in the house and just everything in the basement this film is all about delivering some authentic chills 810777"]

#revText=["Though it has somehow or other managed to escape all the standard reference books, this film is a real and unheralded discovery--a visually distinguished and absorbing Gothic thriller, halfway between Bava country and the Mexican gothics like The Witch's Mirror.Blancheville Monster where have you been hiding? Right off the bat, we are treated to a deliciously evocative visual opening--a wintry, bare tree ridden country road, awash with an icy looking rain, beyond which stands the distant castle on the mountain.Ann Radcliffe would be well pleased! And the film keeps on delivering: two attractive female leads, a mysterious and scarred man locked in a tower room, frequent thunder and lightning, and all the necessary accoutrements of remote castle Gothic, from flickering torches to doleful turns on the family harpsichord.The countryside in which the film is shot has the desolate beauty of a November day, and strongly suggests the Brittany in which the film is set, (though not shot). In this sense, some of the landscapes resemble those in Bresson's Diary of a Country Priest. Devotees of the genre are herewith advised to seek this out. It's far more deserving than some of its over-hyped relatives.",
#"The opening sequences show both titles (Horror and The Blancheville Monster) neatly after each other, as if the distributors couldn't really decide which of the two was better and/or more appropriate. They should have just called it The Blancheville Horror... Problem solved! Anyway, that was totally irrelevant. I'm more and more becoming a fan of director Alberto De Martino! He's been making good horror movies consistently from the early 60's until the mid 80's, yet he never received the respect and appreciation that other Italian directors did. This stylish and severely underrated picture came out alongside loads of other Gothic horror movies and, although not as brilliant as, say, Black Sunday or Kill Babyn Kill, it's a hugely atmospheric and powerfully compelling chill-tale with an overall decent script and convincing set pieces. Only a couple of days prior to her 21st birthday, a beautiful girl travels back to her wealthy father's castle, accompanied by her new lover and best friend. Upon arrival, her brother informs her about the tragic incident that supposedly killed her father and how he got madly obsessed with the legend of the Blancheville family curse. But the new arrivals notice that there are a lot more strange things going on. Why have all the servants been replaced? Why is the brother so nervous about the girl's upcoming birthday? And, most of all, who or what produces those creepy screams at night in the castle's darkest tower? The Blancheville Monster offers pretty much all the elements you're looking for in good Gothic horror, including eerie thunderstorms, scary black-caped monsters, ominous vaults and tight costumes that supply the female cast members with impressive cleavage. De Martino does a great directing job, spreading the suspense equally throughout the whole film and the finale albeit not too hard to predict  is formidably tense. Unlike any of Mario Bava's Gothic horror movies, The Blancheville Monster will not haunt your nightmares, but for fans of classic horror it's definitely worth seeing.17 out of 19 found this helpful. Somewhat slow, short on horror, but eerie.",
#"This pretty good Corman-styled quasi-Poe entry complete with Vincent Price lookalike, and red herrings, is a little too plodding for it's own good and needs some more frequent and stronger moments of horror to make it to the very good level. The same can be said of the Corman-Poe films and such, yet they had great indelible scare moments, fabulous art direction in color and Vincent Price to keep them interesting. This looks like a late night spook show staple from the 60s and 70s, and is fine as such, except I saw it without commercials. I kept thinking it would be better if I'd watched it on an old B&W 20 portable TV on a wire-rack TV stand with potato chips and soda, and a chair with lumpy cushion on a chilly and windy October night just to get into the period.",
#"I sought out this little gem after reading about it in Tim Lucas' bio of Mario Bava. Lucas believes the effects in this 1962 b/w strongly resemble Bava's technical flourishes. I sought out this title for this reason, but also because I collect films of the era containing the word 'monster' in the title. I also have a growing appreciation for Gothic Italian horror and mystery. After snapping it up on Amazon for about five bucks including postage, I am pleased to announce that it was well worth my time and effort.There is a lot to like about this movie, including an eerie score and loads of captivating locations and sets; the crumbling abbey is reminiscent of Universal's Dracula but much more realistic and effectively photographed. Moments of the film seem over the top, and overcooked, but it stands up well for its time overall. This is especially desirable to those of us who recall the old Shock Theatre days of television, and enjoy Italian Gothic horror of the 60s.",
#"I had no idea what to expect from this one, but it turned out to be Italy's response to the Roger Corman Edgar Allan Poe films, and it's actually pretty good.In late 19th century northern France, lovely Emily De Blancheville returns to her ancestral home from finishing school to find that her brother has sacked the entire staff and all the new servants are creepy. Worse yet, her father  whom she had believed to be killed in a fire  is actually alive but hideously burned and criminally insane, and locked up in the tower. Her brother explains that there is a curse on the De Blancheville line, and their father believes that the curse can only be broken if Emily dies before her 21st birthday, which is coming up so close that they've already got the castle bedecked with festive balloons. Well, to make 89 minutes short, the father escapes and pretty soon Emily is in for a bad time of it.What I liked about this film: It's produced by Llama Films, which has to make a person smile. The leading ladies are all lovely, and Emily's little peekaboo nightie is extremely flattering (yeah, yeah, I know, men are pigs. Oink, oink). The location and sets are amazing, with real castles and genuine ruins (it's so cold, even indoors, that you can constantly see the actors' breath). The monster's makeup seems to be some kind of Kharis mask, Italian style. The dialog is priceless, if a tad wordy (You will follow me, Emily. To your tomb. To your death. To die. To die. To die. To die). Have I mentioned it was produced by Llama Films? Oh, and the brother (Roderick, what else) is played by a guy who looks like Vincent Price. At least, in the context of this film. There are a lot of other shenanigans, with a doctor who is not what he appears, a housekeeper who is exactly what she appears, a premature burial, et al, but I don't like to give away too much of the plot. Sometimes, you have to just let the film run its course and try not to think about what it all means for months, or even years.",
#"The Blancheville Monster is a decent Italian Gothic entry. It's true that it is a little plodding and uneventful, however, it's occasionally eerie and stylish. The story is about a badly disfigured count who visits his daughter by night, attempting to induce her into committing suicide to revoke an old family curse. Frankly, the story is absolutely ridiculous, I mean couldn't the count just kill his daughter rather than embark on his moonlit walks with her to the family mausoleum? Well, yes, however, these spooky encounters do actually provide the film with its best Gothic imagery. These scenes, set in the dead of night, with the sleepwalking daughter being followed a few paces behind by the black-clad, monstrous count through the ruins to the family tomb, are very striking. So really, plot inconsistencies have to be weighed against this. Although I do have a little problem with the final confrontation where the daughter escapes from her entombed situation without even a hint of how this came to be. Nevertheless, these observations aside, The Blancheville Monster is not bad. The location is well used and there is some decent photography to accentuate this. If you are a fan of 60's Italian Gothic then it's really one to check out.",
#"A young, beautiful blonde, her best friend from college with her brother and an admirer of her blonde beauty reach home, the stately residence of the famed and fabled Blachevilles. Upon arriving there, we meet the brother Roderick, a scary looking housekeeper that seems to possess a station way above her ranks, and a doctor that oozes something not closely related to charm. It seems that all the old servants have been let go and that the patriarch - the father- has recently died - or not? I liked this film for several reasons. If nothing else it has atmosphere. The castle used is an impressive set. The black and white cinematography accentuates the rooms of the castle, a huge family tomb, grounds swirling with fog, and a lot more. We have a huge organ being grinded out in a few scenes in classic horror classic fashion(say that fast five times). The story and the monster are nothing fantastic at all, but the film works because the artful direction is able to build suspense. Now, if you are the type of viewer that needs a lot of action, then you really must stay away as this film is more talking and all red herrings(in what turns out to be a not all that clever mystery of who is the Blancheville monster). There is some rather preposterous story about an ancient curse on the family that will die off when the last female(?) descendant turns 21 - utter rubbish. But it is incidental when one looks at the way the film was filmed, and the pace slowly creeps up from slower to slow and then finally fast at the end in the revelation of what has lain cloaked the whole time. This was an Italian production with lots of Spanish influence(much of the cast is Spanish). The actors are all decent and able to make you believe in them at times. This is by no means a great horror film, but it, as other reviewers have noted, is in the same vein as a Roger Corman movie of the same period or a black and white Hammer film. This is the Euro version with no big stars but a decent story and lots of mood.",
#"A list of the neat elements of the film a.k.a.  Horror , a.k.a. The Blanchville Monster,...1. It's Italian, 2. black and white, 3. Horror sneakily appears as an occult film but really a psychological horror film, and 4. the ultra-creepy background music. I watched this movie when I was a 10-year old watching it on shock theater late Saturday night. There are a few boring parts and some parts that should never been put into the film, such as Rodrigues playing the keyboards. But I'm a little prejudiced on bragging on this movie because I've never seen an Italian horror film I didn't like ( Black Sunday, Black Sabbath, Suspiria, Inferno, etc. ).",
#"There are no vampires, witches or ghosts, but the horror in this genuinely creepy Italian grande guignol are impossible to deny. Better than many others of the same themes, this is still equally chilling, only missing the presence of Barbara Steele, even though there is a role in this that seems to have been written for her.Every element that makes films of this type so intriguing is there, and while it is said to have been influenced by the stories of Edgar Allan Poe, there is enough in it to make it seem fresh and filled with its own ideas. An old family curse is said to be out to kill the daughter of a recently deceased count. Allegedly, her surviving past the age of 21 will end their reign, so her father is rumored to still be alive and determined to kill her. While it was pretty obvious to me what was going on, it is fun watching everything unfold. The blonde girls are the heroines, while the beautiful brunette housekeeper is assumed to be evil. This is one worth re-watching, a rare quality of the many films in this genre which are often too laughable to believe.",
#"Slow moving Italian Gothic, heavily influenced by the Corman/Poe cycle",
#"Italian cinema has a long history of ripping off successful movies and this Italian/Spanish co-production is no exception. Cashing in on the success of Roger Corman's Edgar Allan Poe movies with Vincent Price, this movie – purporting to be from Poe himself – relies heavily on the plot ingredients and atmosphere found in the Corman flicks. The setting is an old dark castle, the plot involves genetic madness and disfiguration, and everything that goes on is steeped in mystery and suspense. There's even a supporting character, the doctor, who's been made up to look a lot like Vincent price! Sadly, as with most rip-offs, THE BLANCHEVILLE MONSTER is an inferior product and it lacks the genuine originality found in other Italian Gothic movies from the same period: CASTLE OF BLOOD, TERROR OF DR HICHCOCK, BLACK Sunday are just a few I could name. That's why you'll almost never hear this film mentioned in the same breath as the others. The main problem is that BLANCHEVILLE tries too hard, and the stodgy script doesn't help. When it tries to be scary, it ends up being boring, and there just isn't enough of the slim storyline to pad out a whole movie, even with the crew's best efforts.The film does boast some fine moments, and these are mainly down to director Albert De Martino, a mainstay of the genre for a good 20/30 years. Scenes of the heroine being pursued through a dead wood by a deformed killer are superbly creepy, and the whole buried alive aspect of the plot is handled effectively – it's just a shame it takes an hour and ten minutes to get there! The cast can't be faulted, either, with a very good turn from Gerard Tichy in the Vincent Price role – the sinister older brother who has dark secrets of his own. Okay, so Ombretta Colli isn't much of an actress, but she's pretty and in a visual film like this that counts for something. I have to say that I preferred Helga Line (HORROR EXPRESS), here appearing in an early role as a scheming villainess. With good black-and-white visuals and some great scare scenes, THE BLANCHEVILLE MONSTER could have been up there with the other films of this period. Instead, it's a merely adequate flick that might well be just too damned slow for modern tastes.",
#"This film and a lot of the cast were unfamiliar to me, and I'm a horror fanatic, so I assume that most of you never heard of it. It is reminiscent of other Gothic horror films, but I say that in the best of terms. The settings are cold and sterile which adds to the creepiness. It starts off a little slow with Emily returning from school, but it's kind of cool because everyone else is acting so weird and you only know as much as she does. Keeping the viewer in the dark as to what is going on, puts you into the lead character's position. Then, when you think the strangeness has been explained, suddenly the cast starts acting odd again. You are never put at ease for the heroine's safety and everything about the ending was pretty surprising. Suspenseful horror movies, and old Twilight Zone episodes, can be fun to watch when you don't know why everyone is acting so strange. You and perhaps the lead character are the only ones who aren't in on the joke or riddle or curse in the case of the Blancheville Monster."]
revTextRdd = sc.parallelize(revText)

#entitiesForTest = revTextRdd.flatMap(extractEntitiesSetimentForReview)
entitiesForTest = revTextRdd.flatMap(extractEntitiesSetimentForReview)

entitiesForTest.cache()

entitiesForTest2 = entitiesForTest.reduceByKey(lambda x,y: (x+y)/2, 5)

schema1 = StructType(
                [StructField("entity", StringType(), False), 
                 StructField("sentiment", FloatType(), False)])

entitiesForTest_df = spark.createDataFrame(entitiesForTest2, schema=schema1)

entitiesForTest_df.show()

entitiesForTest_df.registerTempTable("df")
entitiesForTest_df = spark.sql("select ltrim(rtrim(entity)) as entity, avg(sentiment) as avg_sent, stddev(sentiment) as std_sent from df group by entity")

entitiesForTest_df = entitiesForTest_df.withColumn("std_sent", functions.when(functions.isnan(entitiesForTest_df.std_sent)==True, 0).otherwise(entitiesForTest_df.std_sent))


#entitiesForTest_df = entitiesForTest_df.agg(collect_set('tentity').alias('entities')).crossJoin(entitiesForTest_df.agg(collect_set("avg_sent").alias("avg_sent"), collect_set("std_sent").alias("std_sent")))


#entitiesForTest2_df = entitiesForTest_df.groupBy("entity").agg(functions.avg("sentiment").alias("avg_sent"), functions.stddev("sentiment").alias("std_sent"))
#entitiesForTest2_df = entitiesForTest_df.agg(collect_set('entity').alias('entities'), collect_set("avg_sent"))

entitiesForTest_df.show()

+-------------------+------------+
|             entity|   sentiment|
+-------------------+------------+
|             report|       -0.05|
|             family|   -0.251875|
|       jr murdershe|-0.080000006|
|        shore house|         0.0|
|                one|       -0.09|
|haunted house story|-0.010000001|
|         everything|       -0.18|
|         happenings| -0.26000002|
|              place|       -0.09|
|           thriller| -0.16000001|
|               eyes|      -0.045|
|                bit|       -0.25|
|          trappings|-0.010000001|
|       the exorcist| -0.21000001|
|   stuart rosenberg|        1.45|
|          affection| -0.19500001|
|             people|         0.0|
|  brandt sponseller|       -0.09|
|             angles|        1.05|
|             scenes|      -0.155|
+-------------------+------------+
only showing top 20 rows

+--------------+--------------------+--------+
|        entity|            avg_sent|std_sent|
+--------------+--------------------+---

In [17]:
sim_udf = udf(getNormOverlap, FloatType())

In [18]:
def computeEntitySentimentSimilarity(trained_df, test_df):
    j = test_df.select(col("entity"), col("avg_sent").alias("tst_avg_sent"), 
                                  col("std_sent").alias('tst_std_sent')).join(trained_df, test_df.entity==trained_df.entity)
    j = j.withColumn("similarity", sim_udf(j.avg_sent, j.tst_avg_sent, j.std_sent, j.tst_std_sent)).orderBy("tfidf", ascending=False)

    return j

In [19]:
pos_sim_df = computeEntitySentimentSimilarity(horror_df, entitiesForTest_df).show(100)
neg_sim_df = computeEntitySentimentSimilarity(horror_neg_df, entitiesForTest_df).show(100)

pos_sim_df.withColumn("scaled_sim", pos_sim_df.similarity*pos_sim_df)

+-----------+--------------------+------------+------+-----------+------------------+----------+--------------------+-------------------+--------------------+----------+
|     entity|        tst_avg_sent|tst_std_sent| genre|     entity|             tfidf|vocabIndex|            avg_sent|           std_sent|           sent_hist|similarity|
+-----------+--------------------+------------+------+-----------+------------------+----------+--------------------+-------------------+--------------------+----------+
|  nightmare|-0.04000000283122063|         0.0|Horror|  nightmare|13.956971773672775|       594|-0.30222222360057965| 0.2948411625946721|[0.0,0.0,0.0,0.0,...|0.73777777|
|   haunting|                 0.0|         0.0|Horror|   haunting| 6.414035123119086|      4623|  0.4028571333203997| 0.6113022730514908|[0.0,0.0,0.0,0.0,...| 0.5971429|
|        job|  0.4899999797344208|         0.0|Horror|        job|3.8647768910276814|        32|  0.4757017326426872| 0.4329891134295431|[0.0,0.0,0.0,

AttributeError: 'NoneType' object has no attribute 'withColumn'

In [139]:
computeEntitySentimentSimilarity(history_df, entitiesForTest_df).show(100)

+-----------+-------------------+------------+-------+-----------+------------------+----------+-------------------+------------------+-----------+
|     entity|       tst_avg_sent|tst_std_sent|  genre|     entity|             tfidf|vocabIndex|           avg_sent|          std_sent| similarity|
+-----------+-------------------+------------+-------+-----------+------------------+----------+-------------------+------------------+-----------+
|      price|-0.1600000113248825|         0.0|History|      price|  2.70805020110221|      8369|-0.4049999713897705| 0.572756452300129| 0.24499996|
|        mgm| 0.1600000113248825|         0.0|History|        mgm|  2.70805020110221|      9711|-0.5400000214576721|0.5400000214576721| 0.70000005|
|composition|                0.0|         0.0|History|composition| 1.455287232606842|       448|0.35999998450279236|0.5977457375045399| 0.35999998|
|        job| 0.4899999797344208|         0.0|History|        job|0.1823215567939546|         0|0.44066664680528

In [107]:
computeEntitySentimentSimilarity(comedy_df, entitiesForTest_df).show(100)

+----------+--------------------+------------+------+----------+------------------+----------+--------------------+-------------------+----------+
|    entity|        tst_avg_sent|tst_std_sent| genre|    entity|             tfidf|vocabIndex|            avg_sent|           std_sent|similarity|
+----------+--------------------+------------+------+----------+------------------+----------+--------------------+-------------------+----------+
|  haunting|                 0.0|         1.0|Comedy|  haunting| 1.791759469228055|      1698|   0.809999942779541|  0.809999942779541| 0.6459616|
|skepticism|                 0.0|         1.0|Comedy|skepticism|1.6094379124341003|      1063|-0.48499997705221176| 0.4596193593023801| 0.5825295|
|     shout|               -0.25|         1.0|Comedy|     shout|1.0986122886681098|       135|   -0.64000004529953|   0.64000004529953|0.73166984|
| nightmare|-0.04000000283122063|         1.0|Comedy| nightmare|0.5679840376059393|        13| -0.4426086815074086|0.4

In [140]:
computeEntitySentimentSimilarity(documentary_df, entitiesForTest_df).show(100)

+--------------+--------------------+------------+-----------+--------------+------------------+----------+-------------------+-------------------+-----------+
|        entity|        tst_avg_sent|tst_std_sent|      genre|        entity|             tfidf|vocabIndex|           avg_sent|           std_sent| similarity|
+--------------+--------------------+------------+-----------+--------------+------------------+----------+-------------------+-------------------+-----------+
|          evil|  -0.809999942779541|         0.0|Documentary|          evil|  2.70805020110221|      8845|-0.3774999901652336| 0.3089093437387237| 0.43249995|
|cinematography| 0.18000000715255737|         0.0|Documentary|cinematography|1.6094379124341003|       920|  0.312727262604643| 0.5241581685669622| 0.13272725|
|       buildup|-0.04000000283122063|         0.0|Documentary|       buildup|1.6094379124341003|      1067|  0.809999942779541|  0.809999942779541| 0.84999996|
|     nightmare|-0.04000000283122063|   

In [142]:
comedy_df.show(100)

+------+--------------------+-----------------+----------+--------------------+-------------------+
| genre|              entity|            tfidf|vocabIndex|            avg_sent|           std_sent|
+------+--------------------+-----------------+----------+--------------------+-------------------+
|Comedy|              trogar|2.302585092994046|      5456|  -1.619999885559082|  1.619999885559082|
|Comedy|      kung fu master|2.302585092994046|      7326| -0.7200000286102295| 0.7200000286102295|
|Comedy|   la teta y la luna|2.302585092994046|      6889|  0.5100000500679016| 0.5100000500679016|
|Comedy|       house painter|2.302585092994046|      7645|    0.64000004529953|   0.64000004529953|
|Comedy|               tread|2.302585092994046|      5864|-0.36000001430511475|0.36000001430511475|
|Comedy|      celeste talbot|2.302585092994046|      5461| -0.5400000214576721| 0.5400000214576721|
|Comedy|  genius performance|2.302585092994046|      5855|                 0.5|                0.5|


In [113]:
np.std([2])

0.0

In [20]:
sc.stop()