Set up HDFS and Google credentials

In [47]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession


# IP address exported to the executors as SPARK_LOCAL_IP (see the config below).
LOCAL_IP = "10.164.0.2"

# Build (or reuse) the session against the standalone master, using at most
# 14 cores in total and 2 cores per executor.
spark = (
    SparkSession.builder
    .appName("Test Etienne JOB")
    .master("spark://10.164.0.2:7077")
    .config("spark.executor.cores", 2)
    .config("spark.cores.max", 14)
    .config("spark.executorEnv.SPARK_LOCAL_IP", LOCAL_IP)
    # Memory settings that are currently disabled:
    # .config("spark.python.worker.memory", "6g")
    # .config("spark.executor.memory", "5g")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and display it as the cell output.
sc = spark.sparkContext
sc

In [48]:
import os

# Environment of the current (notebook) process: Google credentials file,
# hdfscli configuration file and Hadoop configuration directory.
os.environ.update({
    "GOOGLE_APPLICATION_CREDENTIALS": "./imdb-e9e7ce7a779d.json",
    "HDFSCLI_CONFIG": "./.hdfscli.cfg",
    "HADOOP_CONF_DIR": "/opt/hadoop-3.1.0/etc/hadoop",
})

# Separate credentials path stored on the SparkContext's environment dict.
# NOTE(review): presumably this is the path as seen by the Python workers, which
# is why it differs from the driver-side value above -- confirm.
sc.environment["GOOGLE_APPLICATION_CREDENTIALS"] = "/MovieScope-1bf4856cc738.json"

In [49]:
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from functools import reduce

from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
from pyspark.sql import functions
from pyspark.sql.functions import udf
from pyspark.ml.linalg import ArrayType, VectorUDT, Vectors

import re
import time
import numpy as np

from pyspark.sql.types import *

In [8]:
# Run configuration: these three strings are spliced into the HDFS paths that
# the notebook reads from and writes to.
orientation = "pos"
collection = "reviews"
urlsCollection = "train"

In [36]:
from pyspark.sql import Row
from pyspark.sql.functions import collect_list

# Location of the grouped-entities parquet dataset for the configured
# collection / split / orientation.
grouped_entities_path = (
    "hdfs://spark-master:8020/user/lmrd/" + collection + "/"
    + urlsCollection + "_" + orientation + "_grouped_entities2.pq"
)

# DataFrames are immutable: repartition() returns a *new* DataFrame, so its
# result must be bound to a name -- calling it as a bare statement has no effect
# on the frame used by the following cells.
grouped_entities_df = spark.read.parquet(grouped_entities_path).repartition(200)

# Last expression of the cell: displays the schema of the loaded frame.
grouped_entities_df

DataFrame[genre: string, entity: string, sentiment: double]

In [43]:
def binVector(x, bins=20, value_range=(-5, 5)):
    """Histogram a sequence of numbers into a dense vector of bin counts.

    Args:
        x: Sequence of numeric values (here: the sentiment scores collected
            for one genre/entity pair).
        bins: Number of equal-width bins. Defaults to 20.
        value_range: ``(low, high)`` bounds of the histogram. Values outside
            this interval are ignored by ``np.histogram``. Defaults to (-5, 5).

    Returns:
        A dense ML vector with ``bins`` entries holding the count of values
        that fell into each bin.
    """
    counts, _edges = np.histogram(x, bins=bins, range=value_range)
    return Vectors.dense(counts)

# Wrap binVector as a Spark UDF that returns an ML vector column.
histo_udf = functions.udf(binVector, VectorUDT())

# Expose the raw (genre, entity, sentiment) rows to Spark SQL.
# createOrReplaceTempView replaces the deprecated registerTempTable.
grouped_entities_df.createOrReplaceTempView("grouped_entities_df")
print(grouped_entities_df.count())

# One row per genre: the list of (trimmed) entity strings and how many there are.
grouped_entity_words = spark.sql("select genre, collect_list(ltrim(rtrim(entity))) as entities, count(entity) as entity_count from grouped_entities_df group by genre ")
print(grouped_entity_words.count())

# One row per (genre, trimmed entity): all sentiment values plus their mean and
# standard deviation.
grouped_entities_df2 = spark.sql("select genre as genre, ltrim(rtrim(entity)) as tentity, collect_list(sentiment) as sents, avg(sentiment) as avg_sent, stddev(sentiment) as std_sent from grouped_entities_df group by genre, tentity ")
print(grouped_entities_df2.count())

# Register the aggregate under its own view name so that the
# "grouped_entities_df" view keeps pointing at the raw rows (the cell stays
# re-runnable and other queries against that view are not silently redirected).
grouped_entities_df2.createOrReplaceTempView("grouped_entities_df2")

# Keep only entities whose mean sentiment is clearly non-neutral (|avg| > 0.3).
grouped_entities_df3 = spark.sql("select genre, tentity as entity, sents, avg_sent, std_sent, size(sents) as num_sents from grouped_entities_df2 where abs(avg_sent)>0.3")
print(grouped_entities_df3.count())

# The sample standard deviation of a single value is undefined: depending on
# the Spark version it is reported as NaN or as NULL. Treat both as 0.
std_sent_col = functions.col("std_sent")
std_sent_undefined = functions.isnan(std_sent_col) | std_sent_col.isNull()
grouped_entities_df3 = grouped_entities_df3.withColumn(
    "std_sent",
    functions.when(std_sent_undefined, 0.0).otherwise(std_sent_col),
).orderBy(["genre", "entity"], ascending=False)

# Histogram of the individual sentiment values of each (genre, entity) pair.
grouped_entities_df3 = grouped_entities_df3.withColumn("sent_hist", histo_udf("sents"))

#grouped_sentiment = grouped_entities_df3.select(["genre", "avg_sent", "std_sent"]).groupBy("genre").agg(collect_list("avg_sent").alias("avg_sent"), collect_list("std_sent").alias("std_sent"))
#grouped_entity_words.show()
#grouped_sentiment.show()

1540580
29
301025
30290


In [39]:
# Preview, in order: the raw rows sorted by descending sentiment, the filtered
# per-entity statistics, and the per-genre entity lists.
framesToPreview = (
    grouped_entities_df.orderBy("sentiment", ascending=False),
    grouped_entities_df3,
    grouped_entity_words,
)
for previewFrame in framesToPreview:
    previewFrame.show()

+-----------+-----------+------------------+
|      genre|     entity|         sentiment|
+-----------+-----------+------------------+
|    Fantasy|combination|3.4200000762939453|
|    Romance|combination|3.4200000762939453|
|     Action|combination|3.4200000762939453|
|     Comedy|combination|3.4200000762939453|
|    Romance|masterpiece| 3.419999837875366|
|      Drama|masterpiece| 3.419999837875366|
|     Comedy|masterpiece| 3.419999837875366|
|Documentary|       film|3.3299999237060547|
|    Romance|      movie| 3.239999771118164|
|        War|      movie| 3.239999771118164|
|     Action|       film| 3.239999771118164|
|   Thriller|       film| 3.239999771118164|
|     Comedy|      movie| 3.239999771118164|
|     Horror|       film| 3.239999771118164|
|    Fantasy|      movie| 3.239999771118164|
|    Musical|      movie| 3.239999771118164|
|  Adventure|      movie|2.7300000190734863|
|      Drama|      movie|2.7300000190734863|
|     Comedy|      movie|2.7300000190734863|
|     Fami

In [40]:
from pyspark.ml.feature import CountVectorizer, IDF

# Treat each genre as one "document" whose terms are the strings in its
# `entities` array (grouped_entity_words carries no sentiment values).

# Term frequencies: fit the vocabulary on the per-genre entity lists, then
# count the occurrences of every vocabulary term per genre.
countVec = CountVectorizer(
    inputCol="entities",
    outputCol="tf",
)
cvmodel = countVec.fit(grouped_entity_words)

tf = cvmodel.transform(grouped_entity_words)
tf.show()

# `tf` is read twice below -- once to fit the IDF model and once to apply it --
# so mark it for caching first.
tf.cache()
idfEstimator = IDF(
    inputCol="tf",
    outputCol="tfidf",
)
idf = idfEstimator.fit(tf)
tfidf = idf.transform(tf)
tfidf.show()

# IDF accepts a minDocFreq argument to ignore terms that occur in fewer than
# that many documents (their IDF is then set to 0). Not used here.

# Number of rows (one per genre) in the TF-IDF frame.
tfidf.count()

+-----------+--------------------+------------+--------------------+
|      genre|            entities|entity_count|                  tf|
+-----------+--------------------+------------+--------------------+
|      Crime|[giallo, element,...|       93862|(69858,[0,1,2,3,4...|
|    Romance|[idea, film, cred...|      134891|(69858,[0,1,2,3,4...|
|   Thriller|[giallo, element,...|      119420|(69858,[0,1,2,3,4...|
|  Adventure|[biopic, ned kell...|       67813|(69858,[0,1,2,3,4...|
|         NA|[lydia reed, the ...|        1101|(69858,[0,1,2,3,4...|
|      Drama|[idea, film, cred...|      308494|(69858,[0,1,2,3,4...|
|        War|[agust villaronga...|       28057|(69858,[0,1,2,3,4...|
|Documentary|[tobias schneebau...|       22679|(69858,[0,1,2,3,4...|
| Reality-TV|[couple, island p...|        1193|(69858,[1,2,3,4,5...|
|     Family|[fairy tale, sing...|       50334|(69858,[0,1,2,3,4...|
|    Fantasy|[fairy tale, sing...|       67615|(69858,[0,1,2,3,4...|
|  Game-Show|[elementary schoo...|

29

In [42]:
from pyspark.sql import Row
from pyspark.sql.functions import explode
import numpy as np

tfidf = tfidf.repartition(100)
vocab = tfidf.select(["genre", "tfidf"])

# genre name -> TF-IDF vector of that genre (indexed by vocabulary position).
genreVocabs = {genreRow.genre: genreRow.tfidf for genreRow in vocab.collect()}

globalVocab = list(cvmodel.vocabulary)

# term -> vocabulary position. A dict gives O(1) lookups instead of scanning
# the whole vocabulary list with list.index() for every row; setdefault keeps
# the first position of a term, which is what list.index() would return.
globalVocabIndex = {}
for vocabPosition, vocabTerm in enumerate(globalVocab):
    globalVocabIndex.setdefault(vocabTerm, vocabPosition)

# Keep the Broadcast handles: sc.broadcast() only helps when tasks read the
# data through `.value`. Discarding the return value means the plain Python
# objects are captured by the closure instead.
globalVocabIndexBc = sc.broadcast(globalVocabIndex)
genreVocabsBc = sc.broadcast(genreVocabs)

def remapEntitiesByTfidf(row):
    """Attach the TF-IDF weight and vocabulary index to a (genre, entity) row.

    Args:
        row: Row with ``genre`` and ``entity`` fields.

    Returns:
        Row with fields ``genre``, ``entity``, ``tfidf`` (float) and
        ``vocabIndex`` (int).

    Raises:
        KeyError: if the genre or the entity is not present in the broadcast
            lookup tables.
    """
    tfIndex = globalVocabIndexBc.value[row.entity]
    tfidfValue = genreVocabsBc.value[row.genre][tfIndex]

    return Row(genre=row.genre, entity=row.entity, tfidf=float(tfidfValue), vocabIndex=int(tfIndex))

# genre name -> DataFrame of that genre's entities with TF-IDF and sentiment stats.
genreCorpora = dict()

for genre in genreVocabs.keys():

    # Distinct entities that occur in this genre.
    genreEntities = tfidf.where(tfidf.genre == genre).select("genre", explode("entities").alias("entity")).distinct()

    print(genreEntities.count())

    # Let Spark take the column names from the Row fields. Passing a positional
    # list of names depends on the order in which Row stores keyword fields,
    # which differs between Spark versions and can swap `genre` and `entity`.
    entitiesByTfidf = spark.createDataFrame(genreEntities.rdd.map(remapEntitiesByTfidf))
    # A DataFrame is not callable; print its schema instead of calling it.
    entitiesByTfidf.printSchema()

    # Add the sentiment statistics; the inner join drops entities that did not
    # pass the |avg_sent| filter applied when building grouped_entities_df3.
    entitiesByTfidf = entitiesByTfidf.join(grouped_entities_df3, on=["genre", "entity"], how="inner")
    entitiesByTfidf.orderBy("tfidf", ascending=False).show()
    genreCorpora[genre] = entitiesByTfidf

    print(entitiesByTfidf.count())
    

+-------+----------------+------------------+----------+--------------------+--------------------+-------------------+---------+--------------------+
|  genre|          entity|             tfidf|vocabIndex|               sents|            avg_sent|           std_sent|num_sents|           sent_hist|
+-------+----------------+------------------+----------+--------------------+--------------------+-------------------+---------+--------------------+
|Fantasy|        malfique|16.119224164338117|      6191|[1.36000001430511...| 0.40500001329928637|  0.594906973451176|        8|[0.0,0.0,0.0,0.0,...|
|Fantasy|           marcu|14.552872326068421|      5032|[0.0, -1.10000002...| -0.4340000103227794| 0.4941479835549382|       10|[0.0,0.0,0.0,0.0,...|
|Fantasy|        lassalle|14.104321143795852|      6989|[-0.0100000007078...| -0.3128571461087891|0.32780655646770873|        7|[0.0,0.0,0.0,0.0,...|
|Fantasy|           beast|12.332476867384303|      1091|[-2.0799999237060...|-0.33370370066000354|0.

In [12]:
from pyspark.sql.functions import *

# Columns persisted for every genre, in output order.
exportColumns = ["genre", "entity", "tfidf", "vocabIndex", "avg_sent", "std_sent", "sent_hist"]

# Write one parquet dataset per genre, replacing any previous export.
for genreName, corpus in genreCorpora.items():
    print(genreName)
    df = corpus.select(*[col(columnName) for columnName in exportColumns])

    targetPath = "hdfs://spark-master:8020/user/lmrd/" + collection + "/" + genreName + "_" + orientation + "_tfidf3.pq"
    df.write.mode("overwrite").parquet(targetPath)

Fantasy
Game-Show
Talk-Show
Adventure
Horror
Short
Drama
Romance
Thriller
War
Musical
Music
Western
History
Documentary
Comedy
Family
Sci-Fi
NA
News
Animation
Biography
Adult
Crime
Mystery
Film-Noir
Action
Sport
Reality-TV


In [44]:
# Shut down the SparkContext now that all per-genre datasets have been written.
sc.stop()
