Set up HDFS and Google credentials

In [1]:
# Display the SparkContext. `sc` is not created anywhere in this notebook, so it
# is presumably injected by the PySpark kernel/shell — TODO confirm.
sc


In [2]:
import os

# Driver-side configuration: Google service-account key, hdfscli config file and
# the Hadoop configuration directory.
# NOTE(review): the key file name and install paths are hard-coded; consider
# moving them to a configuration cell or reading them from the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="./imdb-e9e7ce7a779d.json"
os.environ["HDFSCLI_CONFIG"]="./.hdfscli.cfg"
os.environ["HADOOP_CONF_DIR"]="/opt/hadoop-3.1.0/etc/hadoop"
# Environment entry registered on the SparkContext for the same credentials.
# NOTE(review): this path is absolute ("/imdb-...") while the driver path above
# is relative ("./imdb-...") — confirm the key really lives at the filesystem
# root on the workers.
sc.environment["GOOGLE_APPLICATION_CREDENTIALS"]="/imdb-e9e7ce7a779d.json"

List the review filenames from HDFS and parallelize them in preparation for processing

Parallelise the reviews and use Google NLP API to extract entities and related sentiment.

In [3]:
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from functools import reduce

def collectEntities(x, y):
    """Merge entity/sentiment pairs, averaging the sentiment of repeated entities.

    Intended as the folding function for ``functools.reduce`` over a sequence of
    ``(entity, sentiment)`` tuples, but either argument may also be a list of
    such tuples.

    Args:
        x: The accumulator: a single ``(entity, sentiment)`` tuple (what
            ``reduce`` passes on its first call when no initializer is given)
            or a list of such tuples.
        y: The next ``(entity, sentiment)`` tuple, or a list of them.

    Returns:
        A list of ``(entity, sentiment)`` tuples with one entry per distinct
        entity, in the iteration order of the intermediate dict.
    """
    # The first reduce call doesn't pass a list for x, so normalise both sides.
    if not isinstance(x, list):
        x = [x]
    if not isinstance(y, list):
        y = [y]

    merged = dict(x)

    for ye in y:
        entity, sentiment = ye[0], ye[1]
        if entity in merged:
            try:
                # Pairwise running average: with three or more occurrences the
                # later values weigh more than in a true arithmetic mean.
                merged[entity] = (merged[entity] + sentiment) / 2
            except TypeError:
                # Non-numeric sentiment: keep the value already stored. The
                # previous handler evaluated the undefined name `Null`, which
                # raised NameError instead of skipping the entry.
                pass
        else:
            merged[entity] = sentiment

    return list(merged.items())
        


In [4]:
# Read every file in the positive-reviews directory on HDFS; the preview in the
# next cell shows each element is a (file path, file contents) pair.
tf = sc.wholeTextFiles("hdfs://sp-master:8020/user/lmrd/reviews/pos")

In [7]:
# Sanity check: look at the first five (path, contents) pairs.
tf.take(5)

[('hdfs://sp-master:8020/user/lmrd/reviews/pos/0_9.txt',
  'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'),
 ('hdfs://sp-master:8020/user/lmrd/reviews/pos/10000_8.txt',
  'Homelessness (or Houselessness as George Carlin stated) has be

In [5]:
import re
from pyspark.sql.types import *

def checkSentimentValue(x):
    """Coerce a sentiment value to ``float``, falling back to 0 if that fails.

    Args:
        x: Any value accepted by ``float()`` (number or numeric string).

    Returns:
        ``float(x)``, or ``0`` when ``x`` cannot be converted. In that case a
        diagnostic line containing the offending value is printed.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Report the input itself: the previous version printed `f`, which is
        # unbound when float() fails, so the fallback crashed with a NameError
        # instead of returning 0.
        print("Wrong sentiment value ", x)
        return 0
    
def extractEntitiesSetiment2(fileObj):
    """Run Google Cloud entity-sentiment analysis on a single review.

    Args:
        fileObj: A ``(key, review_text)`` pair. The key is returned unchanged.

    Returns:
        ``(key, entities)`` where ``entities`` is always a list of
        ``(entity_name, weighted_sentiment)`` tuples. Entity names are
        lower-cased and stripped of everything except ``a-z`` and spaces;
        the weighted sentiment is ``score * magnitude``. Entities that end up
        with the same name are merged by ``collectEntities``.
    """
    # Instantiates a client
    # NOTE(review): a new client is built for every record; if this becomes a
    # bottleneck, consider creating one per partition (mapPartitions).
    client = language.LanguageServiceClient()

    review_contents = fileObj[1]

    document = types.Document(content=review_contents,
                              type=enums.Document.Type.PLAIN_TEXT)

    analysis = client.analyze_entity_sentiment(document=document, encoding_type="UTF8")

    # Make sure we have no duplicate entities. If we do, average their sentiment.
    justLetters = re.compile("[^a-z ]")
    response = [
        (justLetters.sub("", entity.name.lower()),
         checkSentimentValue(entity.sentiment.score) * checkSentimentValue(entity.sentiment.magnitude))
        for entity in analysis.entities
    ]
    response = sorted(response, key=lambda x: x[0])

    # Start from an empty list: without an initializer, reduce() raises
    # TypeError when no entities are found and returns a bare tuple (not a
    # list) when exactly one is found, giving the column inconsistent shapes.
    response = reduce(collectEntities, response, [])

    return (fileObj[0], response)

def extractOrdering(rec):
    """Replace a review's path with a numeric id parsed from its filename.

    The filename is expected to look like ``<digits>_<anything>.txt``, e.g.
    ``hdfs://localhost:9000/user/lmrd/reviews/pos/3467_7.txt`` yields id 3468.

    Args:
        rec: A ``(path, contents)`` pair.

    Returns:
        ``(int(<digits>) + 1, contents)``. The ``+ 1`` presumably aligns the id
        with the 1-based ``ID`` column built for the genre table — TODO confirm.

    Raises:
        ValueError: If the path does not match the expected filename pattern.
    """
    # Raw string so `\.` is a regex escape rather than an invalid string
    # escape; `+` (not `*`) so an empty id cannot reach int().
    filenameRegexp = r".*/([0-9]+)_.*\.txt$"
    r = re.search(filenameRegexp, rec[0])
    if r is None:
        raise ValueError("Unexpected review filename: %r" % (rec[0],))

    return (int(r.group(1)) + 1, rec[1])

# Key each review by the numeric id parsed from its filename.
filesRdd = tf.map(extractOrdering)

# Every record triggers a Google NLP request, so build the mapped RDD once and
# cache it. Previously the cached RDD was immediately overwritten and the
# DataFrame was built from a second, uncached map, so the cache was never used.
entity_documents_rdd = filesRdd.map(extractEntitiesSetiment2).cache()

# Column types are inferred; ENTITIY_SENTIMENT holds the (entity, sentiment)
# pairs returned by extractEntitiesSetiment2.
entity_documents_info = spark.createDataFrame(entity_documents_rdd, schema=["ID", "ENTITIY_SENTIMENT"])


In [None]:
#entity_documents_info = spark.createDataFrame(filesRdd.map(extractEntitiesSetiment2), schema=["ID", "ENTITIY_SENTIMENT"])

# Persist the per-document entity sentiment so the NLP calls need not be
# repeated. Uses the same NameNode host ("sp-master") as the wholeTextFiles
# read earlier in the notebook; this line previously said "spark-master".
entity_documents_info.write.parquet("hdfs://sp-master:8020/user/lmrd/reviews/pos_doc_info.pq", mode="overwrite")

In [10]:
# Preview the (id, review text) pairs produced by extractOrdering.
# NOTE(review): the recorded output below shows id 0 for 0_9.txt, so it was
# presumably captured before the "+1" offset was added — re-run to refresh.
filesRdd.take(5)
#entity_documents_info.show(5)
#entity_documents_info.take(5)

[(0,
  'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'),
 (10000,
  'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who d

Load genre information from file (previously collected using IMDB API)

In [9]:
import pickle
import pandas as pd
import base64
from functools import reduce
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W

def decodeGenre(x):
    """Decode one GENRE cell into a list of genre names.

    The cell is treated as the text form of a bytes literal (``"b'...'"``):
    ``x[2:-1]`` drops the ``b'`` prefix and the closing quote, and the rest is
    base64-decoded and unpickled.

    Args:
        x: The raw cell value.

    Returns:
        The unpickled list, or ``["NA"]`` when the list is empty or the value
        cannot be decoded.
    """
    try:
        # SECURITY: pickle.loads can execute arbitrary code embedded in the
        # payload. Only use this on genre files you generated yourself.
        g = pickle.loads(base64.b64decode(x[2:-1]), encoding="bytes")
        if len(g) == 0:
            return ["NA"]
        return g
    except Exception:
        # Best-effort decode: any malformed cell becomes "NA". `Exception`
        # (rather than a bare except) lets KeyboardInterrupt/SystemExit through.
        return ["NA"]
        
        
# Load the genre table (tab separated) and decode the pickled GENRE cells.
# Missing cells are replaced by the text of an empty bytes literal so that
# decodeGenre maps them to ["NA"].
genres = pd.read_csv("Data/genres_train_urls_pos.csv", sep="\t", index_col=0, usecols=[1, 2, 3])
genres = genres.fillna(value="b''")
genres["GENRE"] = genres["GENRE"].apply(decodeGenre)

schema = StructType([
    StructField("FILM_ID", IntegerType(), True),
    StructField("GENRE", ArrayType(StringType(), containsNull=True), True)])

genres_df = spark.createDataFrame(genres, schema)

from pyspark.sql.functions import monotonically_increasing_id

# monotonically_increasing_id() gives increasing but not necessarily
# consecutive values; row_number() over that ordering turns them into a
# consecutive 1-based ID used to join against the review ids.
genres_df = genres_df.withColumn("ID_TEMP", monotonically_increasing_id())

# NOTE: a window without partitionBy moves all rows into a single partition;
# acceptable only while the genre table stays small.
# The trailing `.select()` that used to follow this expression selected zero
# columns, which dropped ID/FILM_ID/GENRE and broke the later join on ID.
genres_df = genres_df.withColumn("ID", F.row_number().over(W.orderBy("ID_TEMP")))

genres_df.show(5)

+-------+-------------------+-------+---+
|FILM_ID|              GENRE|ID_TEMP| ID|
+-------+-------------------+-------+---+
| 453418|[Animation, Comedy]|      0|  1|
| 453418|[Animation, Comedy]|      1|  2|
| 453418|[Animation, Comedy]|      2|  3|
|  64354|           [Comedy]|      3|  4|
|  64354|           [Comedy]|      4|  5|
+-------+-------------------+-------+---+
only showing top 5 rows



In [None]:
# Attach FILM_ID and GENRE to each review by matching the review id with the
# genre table's row-number ID.
# NOTE(review): this rebinds `entity_documents_info` to the joined frame, so
# running the cell a second time joins the already-joined frame again; the
# result also carries an `ID` column from each side.
entity_documents_info = entity_documents_info.alias("df1").join(genres_df.alias("df2"), entity_documents_info.ID == genres_df.ID)#.select(["df1.*", "df2.FILM_ID", "df2.GENRE"])

entity_documents_info.show(5)

Zip the document-entity-sentiment rdd with the genre rdd.
There should be exactly the same number of reviews as records in the genres rdd.

Group documents by genre

In [25]:
def separateGenres(rec):
    """Emit one ``[genre, rec[0]]`` pair for every genre in ``rec[1][1]``.

    Args:
        rec: An indexable record whose ``rec[1][1]`` is an iterable of genres;
            ``rec[0]`` is carried through unchanged (presumably the document's
            entity list — verify against the caller).

    Returns:
        A list of two-element lists, one per genre.
    """
    # The leftover debug print(len(rec)) was removed: it ran once per record.
    return [[genre, rec[0]] for genre in rec[1][1]]

def separateGenres2(rec):
    """Cross every (entity, sentiment) pair in ``rec[0]`` with every genre in ``rec[1][1]``.

    Returns a list of ``[genre, entity, sentiment]`` lists, grouped by pair
    first and by genre second.
    """
    triples = []
    for pair in rec[0]:
        (e, s) = pair
        for genre in rec[1][1]:
            triples.append([genre, e, s])
    return triples

def separateGenres3(rec):
    """Explode one joined row into ``[genre, entity, sentiment]`` triples.

    Args:
        rec: A row exposing ``ENTITIY_SENTIMENT`` (iterable of
            ``(entity, sentiment)`` pairs) and ``GENRE`` (iterable of genres).

    Returns:
        One ``[genre, entity, sentiment]`` list for every combination of pair
        and genre, grouped by pair first and by genre second.
    """
    # The leftover debug print(rec) was removed: inside flatMap it dumped every
    # row to the executor logs.
    return [[genre, e, s] for (e, s) in rec.ENTITIY_SENTIMENT for genre in rec.GENRE]
    
#grouped_entities = entity_documents_info.flatMap(separateGenres).reduceByKey(collectEntities)
# Explode every joined review row into [genre, entity, sentiment] records.
grouped_entities = entity_documents_info.rdd.flatMap(separateGenres3)

# Back to a DataFrame with named columns; types are inferred from the data.
grouped_entities_df = spark.createDataFrame(data=grouped_entities, schema=["genre", "entity", "sentiment"])
#grouped_entities_df.show()
# Cached because several later cells aggregate and join on this frame.
grouped_entities_df.cache()

DataFrame[genre: string, entity: string, sentiment: double]

In [27]:
# Preview the exploded (genre, entity, sentiment) rows.
grouped_entities_df.show()

+---------+--------------+--------------------+
|    genre|        entity|           sentiment|
+---------+--------------+--------------------+
|Animation|        anyone|-0.04000000119209...|
|   Comedy|        anyone|-0.04000000119209...|
|Animation|ashton kutcher|                 0.0|
|   Comedy|ashton kutcher|                 0.0|
|Animation|   ben randall|0.040000001192092904|
|   Comedy|   ben randall|0.040000001192092904|
|Animation|     character|                 0.0|
|   Comedy|     character|                 0.0|
|Animation|        comedy|                 0.0|
|   Comedy|        comedy|                 0.0|
|Animation|      emotions|                 0.0|
|   Comedy|      emotions|                 0.0|
|Animation|       friends|                 0.0|
|   Comedy|       friends|                 0.0|
|Animation|          half|                 0.0|
|   Comedy|          half|                 0.0|
|Animation|  jake fischer|                 0.0|
|   Comedy|  jake fischer|              

In [28]:
from pyspark.sql import Row
from pyspark.sql.functions import collect_list

#def removeSentiment(x):
#    entities = list()
#    for xe in x:
#        entities.append(xe[0])
#        
#    return entities
#
#grouped_entity_words = grouped_entities.values().map(removeSentiment)

# One row per genre: all entity mentions for that genre collected into a list
# (duplicates are kept, which is what the term-frequency step counts).
grouped_entity_words = grouped_entities_df.select(["genre", "entity"]).groupBy("genre").agg(collect_list("entity").alias("entities"))
# Same grouping for the sentiment values.
# NOTE(review): the two lists are built by separate aggregations, so do not
# assume that position i in `entities` corresponds to position i in `sentiment`.
grouped_sentiment = grouped_entities_df.select(["genre", "sentiment"]).groupBy("genre").agg(collect_list("sentiment").alias("sentiment"))
#grouped_entity_words.show()
#grouped_sentiment.show()

In [31]:
# Preview the per-genre sentiment lists.
grouped_sentiment.show()

+---------+--------------------+
|    genre|           sentiment|
+---------+--------------------+
|  Romance|[0.0, 0.010000000...|
|    Drama|[0.0, 0.010000000...|
|Animation|[-0.0400000011920...|
|   Comedy|[-0.0400000011920...|
+---------+--------------------+



In [32]:
from pyspark.ml.feature import CountVectorizer, IDF

# Treat each genre as one "document" whose terms are its entity mentions, and
# compute TF-IDF over those documents.

# Term frequencies: CountVectorizer learns the entity vocabulary and emits a
# sparse count vector per genre.
countVec = CountVectorizer(inputCol="entities", outputCol="tf")
cvmodel = countVec.fit(grouped_entity_words)

# Named `entity_tf` rather than `tf`: `tf` is the wholeTextFiles RDD loaded at
# the top of the notebook, and rebinding it here broke re-running the earlier
# `tf.map(extractOrdering)` cell.
entity_tf = cvmodel.transform(grouped_entity_words)
entity_tf.show()

# IDF needs two passes over the term frequencies (one to fit the IDF vector,
# one to scale by it), so cache them first.
entity_tf.cache()
idf = IDF(inputCol="tf", outputCol="tfidf").fit(entity_tf)
tfidf = idf.transform(entity_tf)
tfidf.show()
# IDF can ignore terms that occur in fewer than a minimum number of documents
# (their IDF is set to 0) via the minDocFreq parameter, e.g.:
# idfIgnore = IDF(minDocFreq=2, inputCol="tf", outputCol="tfidf").fit(entity_tf)
# tfidfIgnore = idfIgnore.transform(entity_tf)

+---------+--------------------+--------------------+
|    genre|            entities|                  tf|
+---------+--------------------+--------------------+
|  Romance|[acting moments, ...|(216,[0,1,2,3,4,5...|
|    Drama|[acting moments, ...|(216,[0,1,2,3,4,5...|
|Animation|[anyone, ashton k...|(216,[0,1,2,3,5,6...|
|   Comedy|[anyone, ashton k...|(216,[0,1,2,3,5,6...|
+---------+--------------------+--------------------+

+---------+--------------------+--------------------+--------------------+
|    genre|            entities|                  tf|               tfidf|
+---------+--------------------+--------------------+--------------------+
|  Romance|[acting moments, ...|(216,[0,1,2,3,4,5...|(216,[0,1,2,3,4,5...|
|    Drama|[acting moments, ...|(216,[0,1,2,3,4,5...|(216,[0,1,2,3,4,5...|
|Animation|[anyone, ashton k...|(216,[0,1,2,3,5,6...|(216,[0,1,2,3,5,6...|
|   Comedy|[anyone, ashton k...|(216,[0,1,2,3,5,6...|(216,[0,1,2,3,5,6...|
+---------+--------------------+----------

In [33]:
from pyspark.sql import Row
from pyspark.sql.functions import explode
import numpy as np

vocab = tfidf.select(["genre", "tfidf"])

# genre name -> TF-IDF vector for that genre (one small row per genre, so
# collecting to the driver is fine).
genreVocabs = {genreRow.genre: genreRow.tfidf for genreRow in vocab.collect()}

globalVocab = list(cvmodel.vocabulary)
# entity -> index in the CountVectorizer vocabulary. A dict gives O(1) lookups;
# list.index() scanned the whole vocabulary for every row.
vocabPositions = {term: position for position, term in enumerate(globalVocab)}

# Keep the broadcast handles and read them through .value inside the mapper.
# The previous code discarded the return value of sc.broadcast(), so the
# broadcasts were never used and the closure shipped plain copies instead.
genreVocabsBc = sc.broadcast(genreVocabs)
vocabPositionsBc = sc.broadcast(vocabPositions)

def remapEntitiesByTfidf(row):
    """Look up the TF-IDF score and vocabulary index for one (genre, entity) row.

    Args:
        row: A row exposing ``genre`` and ``entity``.

    Returns:
        A ``Row`` with fields ``genre``, ``entity``, ``tfidf`` (float) and
        ``vocabIndex`` (int).
    """
    tfIndex = vocabPositionsBc.value[row.entity]
    tfidfScore = genreVocabsBc.value[row.genre][tfIndex]

    return Row(genre=row.genre, entity=row.entity, tfidf=float(tfidfScore), vocabIndex=int(tfIndex))

genreCorpora = dict()

for genre in genreVocabs.keys():
    # One row per entity mention of this genre.
    genreEntities = tfidf.where(tfidf.genre == genre).select("genre", explode("entities").alias("entity"))

    # Let Spark take the column names from the Row fields. Passing a positional
    # schema list only labelled the columns correctly while Row sorted its
    # keyword fields alphabetically.
    entitiesByTfidf = spark.createDataFrame(genreEntities.rdd.map(remapEntitiesByTfidf))

    # Attach sentiments, average them per entity and rank by TF-IDF.
    entitiesByTfidf = (
        entitiesByTfidf
        .join(grouped_entities_df, on=["genre", "entity"], how="inner")
        .groupBy(["genre", "entity", "tfidf", "vocabIndex"])
        .avg("sentiment")
        .sort("tfidf", ascending=False)
    )

    genreCorpora[genre] = entitiesByTfidf

genreCorpora["Comedy"].show()

+------+------------------+------------------+----------+--------------------+
| genre|            entity|             tfidf|vocabIndex|      avg(sentiment)|
+------+------------------+------------------+----------+--------------------+
|Comedy|            movies|1.8325814637483102|        74|-0.00250000007450...|
|Comedy|               one|1.0216512475319814|        20|                 0.0|
|Comedy|         character|1.0216512475319814|        24|                 0.0|
|Comedy|            anyone|1.0216512475319814|        16|-0.02000000059604...|
|Comedy|               any|1.0216512475319814|        26| -0.2449999916553498|
|Comedy|       performance|1.0216512475319814|        23|                 0.0|
|Comedy|          anything|1.0216512475319814|        25|                 0.0|
|Comedy|        characters|0.9162907318741551|       186|                 0.0|
|Comedy|           hatches|0.9162907318741551|       203|                 0.0|
|Comedy|              line|0.9162907318741551|      

In [55]:
# `entitiesByTfidf` is whatever the last iteration of the per-genre loop left
# behind.
# NOTE(review): the recorded output is a failed run ("'NoneType' object is not
# iterable" raised in a Python worker); its execution count is far from the
# neighbouring cells, so it presumably came from a different version of the
# code — re-run or remove this cell.
entitiesByTfidf.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 178 in stage 122.0 failed 1 times, most recent failure: Lost task 178.0 in stage 122.0 (TID 4756, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/opt/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
TypeError: 'NoneType' object is not iterable

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/opt/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
TypeError: 'NoneType' object is not iterable

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [249]:
# NOTE(review): stale cell. Its recorded output shows per-document lists of
# (entity, sentiment) pairs, whereas `grouped_entities` is now built from
# three-element [genre, entity, sentiment] records and `tfidf` is the output of
# IDF.transform on a DataFrame — this presumably predates the DataFrame
# rewrite. Re-run on a fresh kernel or delete.
grouped_words_tfidf = grouped_entities.values().zip(tfidf)

grouped_words_tfidf.collect()

[([('British', 0.0),
   ('Bromwell High', 0.03000000163912775),
   ('Keisha', 0.0),
   ('Latrina', 0.0),
   ('Natella', 0.0),
   ('South London Public School', 0.0),
   ('characters', 0.040000001192092904),
   ('correctness', 0.0),
   ('cross section', 0.0),
   ('episode', 0.0),
   ('escapades', 0.0),
   ('fun', 0.36000002861023006),
   ('laughter', 0.0),
   ('parody', -0.010000000298023226),
   ('protagonists', 0.0),
   ('show', -0.010000000298023226),
   ('shows', 0.7200000572204601),
   ('society', 0.040000001192092904),
   ('students', 0.0),
   ('subject', 0.020000000596046452),
   ('teachers', 0.0),
   ('term', 0.8099999570846563),
   ('want', 0.4899999833106996),
   ('window', 0.0),
   ('De Niro', 0.010000000298023226),
   ('Fonda', 0.010000000298023226),
   ('acting', 0.25),
   ('average', 0.010000000298023226),
   ('best.', 0.0),
   ('blue collar', 0.010000000298023226),
   ('cinematography', 0.0),
   ('coartship', 0.010000000298023226),
   ('drama', 0.16000000476837162),
   ('

In [243]:
from __future__ import print_function

# Print every term of every collected row followed by its TF-IDF score.
# NOTE(review): stale cell — `hashingTF` is not defined anywhere in the visible
# notebook (the current pipeline uses CountVectorizer), so this fails on a
# fresh kernel. Re-write against `cvmodel.vocabulary` or delete.
collectedTFDF = tfidf.collect()
print(len(collectedTFDF))
# 1-based counter used for the heading and (minus one) to index collectedTFDF.
i=1
for d in grouped_entity_words.collect():
    print("Document ", i, ":")
    
    # NOTE(review): `tfidfscores` is built but never read.
    tfidfscores = [collectedTFDF[i-1][hashingTF.indexOf(d[e])] for e in range(0, len(d))]
    
    for e in range(0, len(d)):
        print(d[e]+"("+str(collectedTFDF[i-1][hashingTF.indexOf(d[e])])+") ", end="")
        
    i+=1
    print(" ")
    

13
Document  1 :
British(1.540445040947149) Bromwell High(1.0296194171811581) Keisha(1.252762968495368) Latrina(1.252762968495368) Natella(1.252762968495368) South London Public School(1.540445040947149) characters(1.252762968495368) correctness(1.540445040947149) cross section(1.540445040947149) episode(1.0296194171811581) escapades(1.540445040947149) fun(1.540445040947149) laughter(1.540445040947149) parody(1.540445040947149) protagonists(1.540445040947149) show(1.540445040947149) shows(1.540445040947149) society(1.540445040947149) students(1.0296194171811581) subject(1.540445040947149) teachers(1.0296194171811581) term(1.540445040947149) want(1.540445040947149) window(1.540445040947149) De Niro(1.540445040947149) Fonda(1.540445040947149) acting(0.8472978603872037) average(1.540445040947149) best.(1.540445040947149) blue collar(1.540445040947149) cinematography(1.540445040947149) coartship(1.540445040947149) drama(1.0296194171811581) fans(1.540445040947149) film(0.44183275227903923) 

Bromwell High(1.0296194171811581) INSPECTOR(1.540445040947149) STUDENT(1.540445040947149) Teachers(1.540445040947149) adults(1.540445040947149) age(1.540445040947149) all(1.540445040947149) episode(1.0296194171811581) line(1.540445040947149) one(0.6931471805599453) pettiness(1.540445040947149) pity(1.540445040947149) pomp(1.540445040947149) programs(1.540445040947149) reality(1.540445040947149) satire(1.540445040947149) school(1.252762968495368) school life(1.540445040947149) schools(1.540445040947149) scramble(1.540445040947149) situation(1.540445040947149) student(1.540445040947149) students(1.0296194171811581) teachers(1.0296194171811581) teaching profession(1.540445040947149)  
Document  12 :
America.(1.0296194171811581) Broadway(1.0296194171811581) Congress(1.0296194171811581) Constitution(1.0296194171811581) Europe(1.0296194171811581) FUTZ(1.0296194171811581) HAIR(1.0296194171811581) Jesus Christ Superstar(1.0296194171811581) New York(1.0296194171811581) Off Off Broadway(1.029619

In [48]:
import pickle
import pandas as pd
import base64

def decodeGenre(x):
    """Decode one GENRE cell into a list of genre names (scratch variant).

    NOTE: this redefines the ``decodeGenre`` declared earlier in the notebook.
    That one returns ``["NA"]`` for empty or undecodable cells; this one
    returns the unpickled value as-is and ``[]`` on failure.

    The cell is treated as the text form of a bytes literal (``"b'...'"``):
    ``x[2:-1]`` drops the ``b'`` prefix and the closing quote, and the rest is
    base64-decoded and unpickled.
    """
    try:
        # SECURITY: pickle.loads can execute arbitrary code embedded in the
        # payload. Only use this on genre files you generated yourself.
        return pickle.loads(base64.b64decode(x[2:-1]), encoding="bytes")
    except Exception:
        # Best-effort decode; `Exception` (rather than a bare except) lets
        # KeyboardInterrupt/SystemExit through.
        return []
        
        
# Scratch check of the genre decoding on a local tab-separated file.
# NOTE(review): this rebinds `genres`, which the genre-loading cell earlier in
# the notebook also assigns; running cells out of order changes which table
# later cells see.
genres = pd.read_csv("id_pos_genre_train.csv", sep="\t", index_col=0, usecols=[1, 2, 3])
# Missing cells become the text of an empty bytes literal, which decodeGenre
# turns into an empty list.
genres = genres.fillna(value="b''")
genres["GENRE_t"] = genres["GENRE"].apply(decodeGenre) 
print(genres["GENRE_t"])

# url_df = pd.read_csv("id_pos_genre_train_prot2_stringdump.csv", delimiter="\t")
# #temp = url_df["GENRE"].apply(lambda x: print(x))#pickle.loads(x.encode()))#
# print(pickle.loads(url_df.iloc[0,3][2:-1].encode("utf-8")))
#f = open("id_pos_genre_train.csv", mode="rb")

#o=pickle.load(f)

#f.close()

#print(len(o))
#print(o)

#print(o.loc[o["ID"]=="0453418"])
##print(o["ID"]=="0453418")
#print(["Comedy" in l for l in o["GENRE"]])
#o.loc[["Comedy" in l for l in o["GENRE"]]]

# pickle.dumps(o.iloc[0, 2])

index
1                    [Animation, Comedy]
2                               [Comedy]
3                       [Drama, Romance]
4             [Fantasy, Musical, Family]
5     [Crime, Horror, Mystery, Thriller]
6                        [Comedy, Short]
7                         [Crime, Drama]
8            [Horror, Mystery, Thriller]
9                                     []
10                               [Short]
Name: GENRE_t, dtype: object


In [33]:
# NOTE(review): `url_df` is only assigned in commented-out code in the visible
# notebook, so this cell depends on leftover kernel state and fails on a fresh
# kernel.
url_df.head()

Unnamed: 0.1,Unnamed: 0,index,ID,GENRE
0,0,1,453418,b'\x80\x02]q\x00(X\t\x00\x00\x00Animationq\x01...
1,1,2,64354,b'\x80\x02]q\x00X\x06\x00\x00\x00Comedyq\x01a.'
2,2,3,100680,b'\x80\x02]q\x00(X\x05\x00\x00\x00Dramaq\x01X\...
3,3,4,177606,b'\x80\x02]q\x00(X\x07\x00\x00\x00Fantasyq\x01...
4,4,5,74223,b'\x80\x02]q\x00(X\x05\x00\x00\x00Crimeq\x01X\...


In [45]:
# NOTE(review): `o` is only assigned in commented-out code in the visible
# notebook (a pickle.load of the genre file), so this cell depends on leftover
# kernel state.
type(pickle.dumps(o.iloc[0, 2]))

bytes

In [228]:
# Manual check of collectEntities: duplicate keys "a" and "c" should each be
# collapsed to one entry holding the average of their two values.
test = [("a", 1), ("b", 2), ("c", 3), ("a", 4), ("c", 6)]

# Sort by key, mirroring what extractEntitiesSetiment2 does before reducing.
test_sorted = sorted(test, key=lambda x: x[0])
print(test_sorted)

# NOTE(review): the dict lines in the recorded output presumably come from the
# print(xd) debug statement that is now commented out in collectEntities.
print(reduce(collectEntities, test_sorted))
#entity_documents_info.collect()

[('a', 1), ('a', 4), ('b', 2), ('c', 3), ('c', 6)]
{'a': 1}
{'a': 2.5}
{'a': 2.5, 'b': 2}
{'a': 2.5, 'b': 2, 'c': 3}
[('a', 2.5), ('b', 2), ('c', 4.5)]
