Set up HDFS and Google credentials

In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession


LOCAL_IP = "10.164.0.2"

spark = SparkSession \
    .builder \
    .appName("Test Etienne JOB") \
    .master("spark://10.164.0.2:7077") \
    .config("spark.executor.cores", 2) \
    .config("spark.cores.max", 14) \
    .config("spark.python.worker.memory", "6g") \
    .config("spark.executor.memory", "5g") \
    .config("spark.executorEnv.SPARK_LOCAL_IP", LOCAL_IP) \
    .getOrCreate()

sc = spark.sparkContext
sc

In [2]:
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="./imdb-e9e7ce7a779d.json"
os.environ["HDFSCLI_CONFIG"]="./.hdfscli.cfg"
os.environ["HADOOP_CONF_DIR"]="/opt/hadoop-3.1.0/etc/hadoop"
sc.environment["GOOGLE_APPLICATION_CREDENTIALS"]="/MovieScope-1bf4856cc738.json"

List filenames of reviews from HDFS and parallelize in preparation from processing

In [3]:
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from functools import reduce

from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
from pyspark.sql import functions
import re
import time

from pyspark.sql.types import *

In [5]:
orientation = "pos"
collection="reviews"
urlsCollection="train"

In [7]:
# Make sure we don't trigger Google Cloud API again
entity_documents_info = spark.read.parquet("hdfs://spark-master:8020/user/lmrd/"+collection+"/"+orientation+"_doc_info2.pq")
entity_documents_info.show(5)
print(orientation)

+-----+--------------------+
|   ID|    ENTITY_SENTIMENT|
+-----+--------------------+
|10037|[[titanic, -1.800...|
|10038|[[rose, 0.0], [ja...|
|10039|[[titanic, 0.0], ...|
|10040|[[titanic, 1.88],...|
| 1004|[[masterpiece, 0....|
+-----+--------------------+
only showing top 5 rows

pos


Load genre information from file (previously collected using IMDB API)

In [8]:
import pickle
import pandas as pd
import base64
from functools import reduce
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W

def decodeGenre(x):
    try: 
        g = pickle.loads(base64.b64decode(x[2:-1]), encoding="bytes") 
        if (len(g)==0):
            return ["NA"]
        else:
            return g
    except:
        return ["NA"]    
        
        
genres = pd.read_csv("Data/genres_"+urlsCollection+"_urls_"+orientation+".csv", sep="\t", index_col=0, usecols=[1, 2, 3])
#print(genres.head())
genres = genres.fillna(value="b''")
genres["GENRE"] = genres["GENRE"].apply(decodeGenre) 

# Get list of unique genre values
#unique_genres = set(reduce(lambda x, y: x+y, genres["GENRE"].values))
#print(unique_genres)

#print(genres.head())
#print(genres[["ID", "GENRE"]])
#z = zip(genres["ID"], genres["GENRE"])


#genres_rdd = sc.parallelize([(int(k)-1, v[0], v[1]) for (k, v) in genres.iteritems()])

schema = StructType([
    StructField("FILM_ID", IntegerType(), True),
    StructField("GENRE", ArrayType(StringType(), containsNull=True), True)])

genres_df = spark.createDataFrame(genres, schema)

from pyspark.sql.functions import monotonically_increasing_id

# This will return a new DF with all the columns + id
genres_df = genres_df.withColumn("ID_TEMP", monotonically_increasing_id())#.limit(10)

genres_df = genres_df.withColumn("ID",F.row_number().over(W.orderBy("ID_TEMP"))).select(["FILM_ID", "GENRE", "ID"])#.limit(10)

#df1.withColumn("idx", F.row_number())
genres_df.show(5)
#genres_rdd.collect()

+-------+-------------------+---+
|FILM_ID|              GENRE| ID|
+-------+-------------------+---+
| 453418|[Animation, Comedy]|  1|
| 453418|[Animation, Comedy]|  2|
| 453418|[Animation, Comedy]|  3|
|  64354|           [Comedy]|  4|
|  64354|           [Comedy]|  5|
+-------+-------------------+---+
only showing top 5 rows



In [9]:
entity_documents_info = entity_documents_info.alias("df1").join(genres_df.alias("df2"), entity_documents_info.ID == genres_df.ID)#.select(["df1.*", "df2.FILM_ID", "df2.GENRE"])

entity_documents_info.show(5)

+---+--------------------+-------+-------------------+---+
| ID|    ENTITY_SENTIMENT|FILM_ID|              GENRE| ID|
+---+--------------------+-------+-------------------+---+
|  1|[[bromwell high, ...| 453418|[Animation, Comedy]|  1|
|  2|[[format, 0.0], [...| 453418|[Animation, Comedy]|  2|
|  3|[[bromwell high, ...| 453418|[Animation, Comedy]|  3|
|  4|[[world, 0.0], [s...|  64354|           [Comedy]|  4|
|  5|[[futz, 0.0], [pi...|  64354|           [Comedy]|  5|
+---+--------------------+-------+-------------------+---+
only showing top 5 rows



Group documents by genre

In [10]:
def separateGenres(rec):
    print(len(rec))
    return [[genre, rec[0]] for genre in rec[1][1]]

def separateGenres2(rec):
    return [[genre, e, s] for (e, s) in rec[0] for genre in rec[1][1]]

def separateGenres3(rec):
    print(rec)
    return [[genre, e, s] for (e, s) in rec.ENTITY_SENTIMENT for genre in rec.GENRE]
    
#grouped_entities = entity_documents_info.flatMap(separateGenres).reduceByKey(collectEntities)
grouped_entities = entity_documents_info.rdd.flatMap(separateGenres3)
grouped_entities.repartition(5)
grouped_entities_df = spark.createDataFrame(data=grouped_entities, schema=["genre", "entity", "sentiment"])
#grouped_entities_df.show()
grouped_entities_df.cache()



DataFrame[genre: string, entity: string, sentiment: double]

In [11]:
grouped_entities.take(50)

[['Animation', 'bromwell high', 0.0],
 ['Comedy', 'bromwell high', 0.0],
 ['Animation', 'teacher', 0.0],
 ['Comedy', 'teacher', 0.0],
 ['Animation', 'program', 0.0],
 ['Comedy', 'program', 0.0],
 ['Animation', 'school life', 0.0],
 ['Comedy', 'school life', 0.0],
 ['Animation', 'student', 0.0],
 ['Comedy', 'student', 0.0],
 ['Animation', 'episode', 0.0],
 ['Comedy', 'episode', 0.0],
 ['Animation', 'teaching profession', 0.0],
 ['Comedy', 'teaching profession', 0.0],
 ['Animation', 'teacher', -0.809999942779541],
 ['Comedy', 'teacher', -0.809999942779541],
 ['Animation', 'student', -0.04000000283122063],
 ['Comedy', 'student', -0.04000000283122063],
 ['Animation', 'student', -0.010000000707805157],
 ['Comedy', 'student', -0.010000000707805157],
 ['Animation', 'satire', 0.0],
 ['Comedy', 'satire', 0.0],
 ['Animation', 'school', -0.04000000283122063],
 ['Comedy', 'school', -0.04000000283122063],
 ['Animation', 'situation', 0.0],
 ['Comedy', 'situation', 0.0],
 ['Animation', 'pettines', -0

In [44]:
#grouped_entities_df.show(5)
grouped_entities_df.write.parquet("hdfs://spark-master:8020/user/lmrd/"+collection+"/"+urlsCollection+"_"+orientation+"_grouped_entities2.pq", mode="overwrite")

In [1]:
sc.stop()

NameError: name 'sc' is not defined