Set up HDFS and Google credentials

In [1]:
sc


In [2]:
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="./imdb-e9e7ce7a779d.json"
os.environ["HDFSCLI_CONFIG"]="./.hdfscli.cfg"
os.environ["HADOOP_CONF_DIR"]="/opt/hadoop-3.1.0/etc/hadoop"
sc.environment["GOOGLE_APPLICATION_CREDENTIALS"]="/MovieScope-1bf4856cc738.json"

List filenames of reviews from HDFS and parallelize in preparation from processing

Parallelise the reviews and use Google NLP API to extract entities and related sentiment.

In [17]:
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

import pyspark.sql.functions as F
from pyspark.sql.window import Window as W
from pyspark.sql.types import *
from pyspark.sql import Row
import pyspark.sql.functions as functions
from pyspark.sql.functions import collect_list
from pyspark.sql.functions import collect_set
from pyspark.sql.functions import udf

#from pyspark.mllib.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel, RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, HashingTF, IDF, IDFModel, StringIndexer, StringIndexerModel, IndexToString
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from functools import reduce
import re
import numpy as np
from math import exp
import pickle
import pandas as pd
import base64

from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

In [4]:
def collectEntities(x, y):
    # The first reduce call doesn't pass a list for x, so we need to check for that.
    if not isinstance(x, list):
        x=[x]
        

    xd = dict(x)
    #print(xd)
    
    if not isinstance(y, list):
        y = [y]
        
    for ye in y:
        if ye[0] in xd:
            try:
                xd[ye[0]] = (xd[ye[0]]+ye[1])/2
            except:
                Null
        else:
            xd[ye[0]] = ye[1]
    
    return [o for o in xd.items()]
        

In [5]:
orientation = "pos"
collection="reviews"
urlsCollection="train"

Load genre information from file (previously collected using IMDB API)

In [6]:


def decodeGenre(x):
    try: 
        g = pickle.loads(base64.b64decode(x[2:-1]), encoding="bytes") 
        if (len(g)==0):
            return ["NA"]
        else:
            return g
    except:
        return ["NA"]    
        

def loadGenres(urlsCollection, orientation):
    genres = pd.read_csv("Data/genres_"+urlsCollection+"_urls_"+orientation+".csv", sep="\t", index_col=0, usecols=[1, 2, 3])
    genres = genres.fillna(value="b''")
    genres["GENRE"] = genres["GENRE"].apply(decodeGenre) 

    schema = StructType([
        StructField("FILM_ID", IntegerType(), True),
        StructField("GENRE", ArrayType(StringType(), containsNull=True), True)])

    genres_df = spark.createDataFrame(genres, schema)

    from pyspark.sql.functions import monotonically_increasing_id

    # This will return a new DF with all the columns + id
    genres_df = genres_df.withColumn("ID_TEMP", monotonically_increasing_id())#.limit(10)

    genres_df = genres_df.withColumn("ID",F.row_number().over(W.orderBy("ID_TEMP"))).select(["FILM_ID", "GENRE", "ID"])#.limit(10)
    
    return genres_df


In [10]:
#from pyspark.mllib.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.sql.functions import udf

def sparse2dense(sp):
    return DenseVector(sp)




    
def separateGenres3(rec):
    print(rec)
    return [[genre, e, s] for (e, s) in rec.ENTITY_SENTIMENT for genre in rec.GENRE]

def prepareDataset(collection, orientation, urls):
    # Make sure we don't trigger Google Cloud API again
    entity_documents_info = spark.read.parquet("hdfs://spark-master:8020/user/lmrd/"+collection+"/"+orientation+"_doc_info2.pq")
    print("emtity_documents_info")
    entity_documents_info.show(5)

    genres_df = loadGenres(urls, orientation)
    print("genres_df")
    genres_df.show(5)
    
    entity_documents_info = entity_documents_info.alias("df1").join(genres_df.alias("df2"), entity_documents_info.ID == genres_df.ID)#.select(["df1.*", "df2.FILM_ID", "df2.GENRE"])
    print("entity_documents_info")
    entity_documents_info.show(5)
    
    grouped_entities = entity_documents_info.rdd.flatMap(separateGenres3)
    grouped_entities.repartition(5)
    print("grouped_entities")
    print(grouped_entities.take(5))
    
    grouped_entities_df = spark.createDataFrame(data=grouped_entities, schema=["genre", "entity", "sentiment"])
    grouped_entities_df.cache()
    print("grouped_entites_df")
    grouped_entities_df.show()
    
    grouped_entity_words = grouped_entities_df.select(["genre", "entity"]).groupBy("genre").agg(collect_list("entity").alias("entities"))
    print("grouped_entity_words")
    grouped_entity_words.show(5)
    
    return grouped_entity_words

def extractTFIDFDataframeAndModel(collection, orientation, urls):

    grouped_entity_words = prepareDataset(collection, orientation, urls)
    
    # Create the dictionary
    countVec = CountVectorizer(inputCol="entities", outputCol="tf")
    #countVec = HashingTF(numFeatures=1024, inputCol="entities", outputCol="tf")
    idf = IDF(inputCol="tf", outputCol="tfidf")
    si = StringIndexer(inputCol="genre", outputCol="genreId")#, handleInvalid="keep")
    #nb = NaiveBayes(featuresCol="tfidf", labelCol="genreId", predictionCol="predictGenreId")
    rf = RandomForestClassifier(featuresCol="tfidf", labelCol="genreId", predictionCol="predictGenreId")
    #isModel = IndexToString(inputCol=nb.getPredictionCol(), outputCol="predictGenre")
    
#    grouped_entity_words = si.fit(grouped_entity_words).transform(grouped_entity_words)
    #pipeline = Pipeline(stages=[countVec, idf, si, nb])
    pipeline = Pipeline(stages=[countVec, idf, si, rf])
    
    
#    paramGrid = ParamGridBuilder() \
#        .addGrid(countVec.minTF, [1.0]) \
#        .addGrid(countVec.minDF, [1.0]) \
#        .build()
    
#    crossval = CrossValidator(estimator=pipeline,
#                  estimatorParamMaps=paramGrid,
#                  evaluator=MulticlassClassificationEvaluator(predictionCol="predictGenreId", labelCol="genreId", metricName="accuracy"),
#                  numFolds=3, parallelism=5)  # use 3+ folds in practice
    
#    cvModel = crossval.fit(grouped_entity_words)
    cvModel = pipeline.fit(grouped_entity_words)
    
    dft = cvModel.transform(grouped_entity_words)
    
    eval = MulticlassClassificationEvaluator(predictionCol="predictGenreId", labelCol="genreId", metricName="accuracy")
    print("f1-score: ", eval.evaluate(dft))
    
    
    return cvModel

cvModel = extractTFIDFDataframeAndModel(collection, orientation, urlsCollection)

#(tfidf, cvmodel, idf, siModel, isModel) = extractTFIDFDataframe(collection, orientation, urlsCollection)
#tfidf.show(5)


emtity_documents_info
+-----+--------------------+
|   ID|    ENTITY_SENTIMENT|
+-----+--------------------+
|10037|[[titanic, -1.800...|
|10038|[[rose, 0.0], [ja...|
|10039|[[titanic, 0.0], ...|
|10040|[[titanic, 1.88],...|
| 1004|[[masterpiece, 0....|
+-----+--------------------+
only showing top 5 rows

genres_df
+-------+-------------------+---+
|FILM_ID|              GENRE| ID|
+-------+-------------------+---+
| 453418|[Animation, Comedy]|  1|
| 453418|[Animation, Comedy]|  2|
| 453418|[Animation, Comedy]|  3|
|  64354|           [Comedy]|  4|
|  64354|           [Comedy]|  5|
+-------+-------------------+---+
only showing top 5 rows

entity_documents_info
+---+--------------------+-------+-------------------+---+
| ID|    ENTITY_SENTIMENT|FILM_ID|              GENRE| ID|
+---+--------------------+-------+-------------------+---+
|  1|[[bromwell high, ...| 453418|[Animation, Comedy]|  1|
|  2|[[format, 0.0], [...| 453418|[Animation, Comedy]|  2|
|  3|[[bromwell high, ...| 453418|

In [81]:
cvModel.avgMetrics


[0.0]

In [11]:


def checkSentimentValue(x):
    try:
        f = float(x)
        
        return f
    
    except:
        print("Wrong sentiment value ", f)
        return 0
    
def extractEntitiesSetimentForReview(review_contents):
    # Instantiates a client
    client = language.LanguageServiceClient()
        
    document = types.Document(content = review_contents, 
                             type=enums.Document.Type.PLAIN_TEXT, language="en-US")
    tries=1
    
    while tries < 5:
        try:
            entities = client.analyze_entity_sentiment(document=document, encoding_type="UTF8")
            break
        except:
            f = open("/home/etienne/sparklog.txt", mode="a")

            f.write(""+str(entities)+"\n")
            f.close()
            time.sleep(1)
            
            tries +=1
            
    
    
    # Make sure we have no duplicate entities. If we do, average their sentiment.
    justLetters = re.compile("[^a-z ]")
    response = [o for o in zip([lemmatizer(justLetters.sub("", entity.name.lower()), u"NOUN")[0] for entity in entities.entities], 
                               [checkSentimentValue(entity.sentiment.score) * checkSentimentValue(entity.sentiment.magnitude) 
                                    for entity in entities.entities])]
    
    
#    response = sorted(response, key=lambda x: x[0])
#    if (len(response)>1):
#        response = reduce(collectEntities, response)
    
            
    return response

In [12]:
def indexToLabel(cvModel, indexes):
    return [cvModel.bestModel.stages[2].labels[index] for index in indexes]

def indexToLabel2(cvModel, indexes):
    return [cvModel.stages[2].labels[index] for index in indexes]

indexToLabel2(cvModel, [0])
#pipeline = cvModel.bestModel.explainParams()  #getEstimator()
#siModel = pipeline.getStages()[2]

#nb = NaiveBayes(featuresCol="tfidf", labelCol="genreId", predictionCol="predictGenreId")

#nb_model = nb.fit(tfidf2)

#print(nb_model.pi)


['Animation']

In [21]:
# Evaluate the model
ds = prepareDataset("test_reviews", orientation, urlsCollection)

ds.show(5)

AnalysisException: 'Path does not exist: hdfs://spark-master:8020/user/lmrd/test_reviews/pos_doc_info2.pq;'

In [82]:
testpreds = cvModel.transform(ds)
#testpreds.take(1)[0].tf

eval = MulticlassClassificationEvaluator(predictionCol="predictGenreId", labelCol="genreId", metricName="accuracy")
print("f1-score: ", eval.evaluate(testpreds))

IllegalArgumentException: 'Field "genreId" does not exist.'

In [19]:


#revText=["I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."]
revText=["As others that have commented around the web... I'm a 130 pilot in the Coast Guard. Having said that, and being the skeptic I am, I went expecting the over-the-top cheese factors. There was some cheese, but all in all, not much.. and the film was pretty accurate.I watched the trailer again today. After seeing the film yesterday, I've realized the trailer gives the impression the movie is nothing but rescue after rescue action scenes. This isn't the case.The movie is truly more character/story driven than action. The inner struggles both Costner and Kutcher are dealing with.. Kutcher's is revealed further into than movie than Costner's is.Of course, there is a minor love story.. no surprise there. But for the most part, the movie tells the tale of two lives that come together, and after some time, help each other heal old wounds.As girlie as it sounds, Costner and, as much as I try not to like him, Kutcher do actually work quite well together and compliment each other very well in the movie.As critics have stated, you've seen it all before.. Top Gun, Officer and a Gentlemen, etc. But what movie hasn't been remade a million times.I can recall only one F word being spoken.. and can't really recall any other language.The movie is 2+ hours, and for some, may tend to get a little long towards the end.You'll laugh, you may cry, but I can honestly say, it was worth the $4 I paid.I hope you enjoy the movie",
         "I work at a movie theater and every Thursday night we have an employee screening of one movie that comes out the next day...Today it was The Guardian. I saw the trailers and the ads and never expected much from it, and in no way really did i anticipate seeing this movie. Well turns out this movie was a lot more than I would have thought. It was a great story first of all. Ashton Kutcher and Kevin Costner did amazing acting work in this film. Being a big fan of That 70's Show I always found it hard thinking of Kutcher as anyone but Kelso despite the great acting he did in The Butterfly Effect, but after seeing this movie I think I might be able to finally look at him as a serious actor.It was also a great tribute to the unsung heroes of the U.S. Coast Guard."]
revTextRdd = sc.parallelize(revText)

lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
sc.broadcast(lemmatizer)

#entitiesForTest = revTextRdd.flatMap(extractEntitiesSetimentForReview)
entitiesForTest = revTextRdd.flatMap(extractEntitiesSetimentForReview)

print(entitiesForTest.collect())
schema1 = StructType(
                [StructField("entity", StringType(), False), 
                 StructField("sentiment", FloatType(), False)])

entitiesForTest_df = spark.createDataFrame(entitiesForTest, schema=schema1)


entitiesForTest_df.registerTempTable("df")
grouped_entities_df2 = spark.sql("select ltrim(rtrim(entity)) as tentity, avg(sentiment) as avg_sent, stddev(sentiment) as std_sent from df group by tentity ")
#grouped_entities_df2.registerTempTable("grouped_entities_df")
#grouped_entities_df3 = spark.sql("select genre, tentity as entity, avg_sent, std_sent from grouped_entities_df where abs(avg_sent)>0.3 order by genre, entity, avg_sent desc")

grouped_entities_df3 = grouped_entities_df2.withColumn("std_sent", functions.when(functions.isnan(grouped_entities_df2.std_sent)==True, functions.abs(grouped_entities_df2.avg_sent)).otherwise(grouped_entities_df2.std_sent))


entitiesForTest2_df = grouped_entities_df3.agg(collect_set('tentity').alias('entities')).crossJoin(grouped_entities_df3.agg(collect_set("avg_sent").alias("avg_sent"), collect_set("std_sent").alias("std_sent")))




#entitiesForTest_df.show()
#entitiesForTest2_df = entitiesForTest_df.agg(collect_set('entity').alias('entities')).crossJoin(entitiesForTest_df.agg(collect_set('sentiment')).alias("sentiment"))

entitiesForTest2_df.show()



[('pilot', 0.0), ('nothing', 0.0), ('movie', 0.0), ('web', 0.0), ('other', 0.0), ('costner', -0.16000000476837162), ('cheese factor', -0.010000000298023226), ('skeptic', 0.0), ('life', -0.03000000163912775), ('trailer', 0.0), ('coast guard', 0.0), ('trailer', 0.0), ('cheese', 0.0), ('film', 0.0), ('all', 0.0), ('film', 0.0), ('love story', 0.0), ('case', 0.0), ('each other', -0.040000001192092904), ('rescue', 0.0), ('action', 0.0), ('movie', -0.010000000298023226), ('kutcher', -0.03000000163912775), ('impression', 0.0), ('course', 0.0), ('is', -0.010000000298023226), ('surprise', 0.0), ('part', -0.040000001192092904), ('rescue action scene', 0.0), ('character', 0.0), ('struggle', -0.4899999833106996), ('some', 0.0), ('time', -0.040000001192092904), ('f word', -0.010000000298023226), ('tale', -0.010000000298023226), ('woundsa', 0.0), ('end', 0.0), ('each other', 0.010000000298023226), ('critic', -0.010000000298023226), ('officer', 0.0), ('moviea', 0.0), ('gentleman', 0.0), ('top gun', 0

In [20]:
entitiesForTest3_df = cvModel.transform(entitiesForTest2_df)

entitiesForTest3_df.show()

indexToLabel2(cvModel, [int(entitiesForTest3_df.select("predictGenreId").take(1)[0].predictGenreId)])

scores = entitiesForTest3_df.select("rawPrediction").collect()[0].rawPrediction
print(scores)

probs = [1-(1.0*s)/np.sum(scores) for s in scores]
#probs = [exp(np.max(scores) - s) for s in scores]
print(probs)
print(np.argsort(scores))
top = np.argsort(scores)[:-4:-1]

print(indexToLabel2(cvModel, top))

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|            entities|            avg_sent|            std_sent|                  tf|               tfidf|       rawPrediction|         probability|predictGenreId|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|[thinking, employ...|[0.0, -0.01500000...|[0.0, 0.160000011...|(70083,[0,1,2,4,6...|(70083,[0,1,2,4,6...|[0.15818713450292...|[0.00790935672514...|          28.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+

[0.158187134502924,3.1185045948203842,0.23472222222222222,3.0694444444444446,0.3601712614870509,0.17485380116959065,0.23253968253968255,0.4263888888888889,0.522671261487051,0.1269

In [39]:


entitiesForTest3_df = cvmodel.transform(entitiesForTest2_df)

entitiesForTest3_df = entitiesForTest3_df.withColumn("features", udf_to_DenseVector("tf"))

entitiesForTest3_df = model.transform(entitiesForTest3_df)

entitiesForTest3_df = isModel.transform(entitiesForTest3_df)

print(isModel.getLabels())
scores = entitiesForTest3_df.select("rawPrediction").collect()[0].rawPrediction
print(scores)

probs = [1-(1.0*s)/np.sum(scores) for s in scores]
#probs = [exp(np.max(scores) - s) for s in scores]
print(probs)
print(np.argsort(scores))
entitiesForTest3_df.show()

['Animation', 'Talk-Show', 'Thriller', 'Adult', 'War', 'Horror', 'Documentary', 'NA', 'Biography', 'Comedy', 'Western', 'Fantasy', 'Romance', 'Family', 'Drama', 'Short', 'Sport', 'History', 'Film-Noir', 'Reality-TV', 'Music', 'Mystery', 'Musical', 'Sci-Fi', 'Game-Show', 'Adventure', 'Crime', 'Action', 'News']
[-537.7891618608271,-587.9934842823056,-518.4590082407616,-585.93812581499,-543.6674228395556,-538.8842749128711,-553.6583065705015,-582.2730094151456,-537.4807382920243,-514.2212682051768,-556.3789659243172,-523.747313855792,-513.583636157141,-530.9484400011238,-506.7379617621209,-558.7893495978734,-554.9934586069586,-546.6821482616766,-561.8042696801848,-584.53159521691,-552.8948283999592,-534.1307261361959,-544.3075497937868,-529.3509095570462,-586.2027202433444,-518.7623805869365,-523.9627650786558,-514.3693191457652,-588.0464268898968]
[0.9660284817794389, 0.9628571329036251, 0.9672495451858455, 0.9629869675164696, 0.9656571588445305, 0.9659593047568444, 0.9650260462956444, 0