Set up HDFS and Google credentials

In [14]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession


# Value exported to every executor as the SPARK_LOCAL_IP environment variable.
# The master URL below spells out the same address as a literal.
LOCAL_IP = "10.164.0.2"

# Create (or reuse) the session on the standalone master: 2 cores per executor,
# at most 18 cores for the whole application.
spark = (
    SparkSession.builder
    .appName("Test Etienne JOB")
    .master("spark://10.164.0.2:7077")
    .config("spark.executor.cores", 2)
    .config("spark.cores.max", 18)
    .config("spark.python.worker.memory", "6g")
    .config("spark.executor.memory", "5g")
    .config("spark.executorEnv.SPARK_LOCAL_IP", LOCAL_IP)
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for the RDD API; the bare
# expression makes the notebook display its summary.
sc = spark.sparkContext
sc

In [2]:
import sys
from random import random
from operator import add


# Number of RDD partitions the samples are split into.
partitions = 200
# Total number of random samples: one million per partition.
n = partitions * 10 ** 6

def f(_):
    """Sample one random point in the square [-1, 1) x [-1, 1).

    The argument is ignored; it only exists so the function can be mapped
    over an RDD of dummy elements.

    Returns 1 when the point lies inside (or on) the unit circle, else 0.
    """
    px = 2 * random() - 1
    py = 2 * random() - 1
    inside = px ** 2 + py ** 2 <= 1
    return int(inside)

# Spread n dummy elements over the partitions, draw one random point per
# element and add up the hits; hits / n approximates pi / 4.
count = (
    spark.sparkContext
    .parallelize(range(1, n + 1), partitions)
    .map(f)
    .reduce(add)
)
print("Pi is roughly %f" % (4.0 * count / n))


Pi is roughly 3.140805


In [10]:
import os

# Driver-side environment: Google service-account key, hdfscli config and
# Hadoop configuration directory.
# NOTE(review): key-file names are hard-coded here; consider supplying them
# from outside the notebook.
os.environ.update({
    "GOOGLE_APPLICATION_CREDENTIALS": "./imdb-e9e7ce7a779d.json",
    "HDFSCLI_CONFIG": "./.hdfscli.cfg",
    "HADOOP_CONF_DIR": "/opt/hadoop-3.1.0/etc/hadoop",
})
# Environment handed to the Python workers. Note that it points at a different
# key file than the driver setting above.
sc.environment["GOOGLE_APPLICATION_CREDENTIALS"] = "/MovieScope-1bf4856cc738.json"

List filenames of reviews from HDFS and parallelize in preparation for processing

Parallelise the reviews and use Google NLP API to extract entities and related sentiment.

In [4]:
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from functools import reduce

from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
from pyspark.sql import functions
import re
import time

from pyspark.sql.types import *

In [5]:
def collectEntities(x, y):
    """Merge two collections of ``(entity, sentiment)`` pairs into one list.

    Written as the combining function for ``reduce``: each argument may be a
    single pair or a list of pairs.

    Args:
        x: accumulated pairs (a list), or a bare pair on the first reduce call.
        y: next pair, or list of pairs, to fold in.

    Returns:
        A list of ``(entity, sentiment)`` tuples with one entry per entity.
        When an entity is already present, its stored sentiment is replaced by
        the average of the stored and the incoming value (a running pairwise
        average, not the mean of all occurrences). If the two values cannot be
        averaged, the stored value is kept.
    """
    # The first reduce call doesn't pass a list for x, so we need to check for that.
    if not isinstance(x, list):
        x = [x]
    if not isinstance(y, list):
        y = [y]

    merged = dict(x)

    for pair in y:
        name, score = pair[0], pair[1]
        if name not in merged:
            merged[name] = score
            continue
        try:
            merged[name] = (merged[name] + score) / 2
        except TypeError:
            # Non-numeric sentiment: keep what is already stored. The original
            # handler evaluated the undefined name `Null` and raised NameError.
            pass

    return list(merged.items())
        

In [6]:
# Sub-directory of the collection to process; used to build the HDFS paths.
orientation = "pos"
# Directory under /user/lmrd on HDFS that holds the review files.
collection = "reviews"
# Not referenced by any cell visible in this notebook.
urlsCollection = "train"

In [6]:
# Read every review file as a (path, contents) pair and spread the pairs over
# 5 partitions.
# NOTE(review): this reads from host "sp-master" while the Parquet write cell
# targets "spark-master" - confirm both host names are intended.
tf = sc.wholeTextFiles(
    "hdfs://sp-master:8020/user/lmrd/"+collection+"/"+orientation
).repartition(5)

In [None]:
# Peek at the first five (path, contents) records to sanity-check the load.
tf.take(5)

In [7]:


def checkSentimentValue(x):
    """Coerce a sentiment score or magnitude to ``float``.

    Args:
        x: value to convert.

    Returns:
        ``float(x)``, or ``0`` (after printing a warning) when ``x`` cannot be
        converted.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Report the offending input. The original printed the local `f`, which
        # is unbound when float() fails, so the fallback itself raised.
        print("Wrong sentiment value ", x)
        return 0
    
def extractEntitiesSetiment2(fileObj, max_tries=4, log_path="/home/etienne/sparklog.txt"):
    """Run Google NLP entity-sentiment analysis on one review.

    Relies on the module-level ``lemmatizer`` and on ``checkSentimentValue``.

    Args:
        fileObj: ``(file_id, review_text)`` pair.
        max_tries: number of API attempts before giving up (the original loop
            made 4).
        log_path: file to which failed attempts are appended (file id and the
            error), one line each.

    Returns:
        ``(fid, response)`` where ``fid`` is ``int(fileObj[0])`` (0 if that
        conversion fails) and ``response`` is a list of
        ``(lemmatized_entity_name, score * magnitude)`` tuples. Duplicate
        entities are not merged. ``response`` is empty when every API attempt
        failed.
    """
    # Instantiates a client
    client = language.LanguageServiceClient()

    review_contents = fileObj[1]
    document = types.Document(content=review_contents,
                              type=enums.Document.Type.PLAIN_TEXT, language="en-US")

    # `entities` stays None if no attempt succeeds; the original left it
    # unbound and crashed with a NameError instead.
    entities = None
    for attempt in range(1, max_tries + 1):
        try:
            entities = client.analyze_entity_sentiment(document=document, encoding_type="UTF8")
            break
        except Exception as err:
            # Log which file failed and why. The original logged `entities`,
            # which is not assigned when the call raises.
            with open(log_path, mode="a") as log:
                log.write(""+str(fileObj[0])+"\n")
                log.write(repr(err)+"\n")
            # Back off before retrying; no point sleeping after the last try.
            if attempt < max_tries:
                time.sleep(1)

    # Entity names are lower-cased and stripped to letters and spaces before
    # being lemmatized as nouns.
    justLetters = re.compile("[^a-z ]")
    response = []
    if entities is not None:
        for entity in entities.entities:
            name = lemmatizer(justLetters.sub("", entity.name.lower()), u"NOUN")[0]
            # float() keeps the value a float even when both factors fell back
            # to the int 0, which a FloatType column would reject.
            sentiment = float(checkSentimentValue(entity.sentiment.score)
                              * checkSentimentValue(entity.sentiment.magnitude))
            response.append((name, sentiment))

    try:
        fid = int(fileObj[0])
    except (TypeError, ValueError):
        fid = 0

    return (fid, response)

def extractOrdering(rec):
    """Replace the path in a ``(path, contents)`` record with a numeric ID.

    The ID is the run of digits that starts the file name (before the ``_``),
    plus one. For example
    ``hdfs://localhost:9000/user/lmrd/reviews/pos/3467_7.txt`` yields 3468.
    NOTE(review): the +1 presumably turns 0-based file numbers into 1-based
    IDs - confirm.

    Args:
        rec: ``(path, contents)`` pair.

    Returns:
        ``(id, contents)``.

    Raises:
        ValueError: if the path does not look like ``.../<digits>_<...>.txt``.
    """
    # Raw string avoids the invalid "\." escape; "+" rejects an empty ID.
    filenameRegexp = r".*/([0-9]+)_.*\.txt$"
    r = re.search(filenameRegexp, rec[0])
    if r is None:
        raise ValueError("Unexpected review file name: %r" % (rec[0],))

    return (int(r.group(1)) + 1, rec[1])


# Build the spaCy lemmatizer once on the driver; extractEntitiesSetiment2 reads
# it as a global.
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
# NOTE(review): the Broadcast handle returned here is discarded and the map
# function uses the global `lemmatizer` directly, so this call probably has no
# effect - confirm before removing.
sc.broadcast(lemmatizer)

# Turn (path, contents) into (numeric id, contents) and rebalance over 5 partitions.
filesRdd = tf.map(extractOrdering).repartition(5)

# One row per review: its ID plus an array of (ENTITY, SENTIMENT) structs.
schema1 = StructType([
    StructField("ID", IntegerType(), False),
    StructField("ENTITY_SENTIMENT", ArrayType(
            StructType([StructField("ENTITY", StringType(), False),
                        StructField("SENTIMENT", FloatType(), False)])), nullable=True)])

# Map once, cache the result, and build the DataFrame from that same RDD.
# The original cached one mapped RDD but created the DataFrame from a second,
# uncached `filesRdd.map(...)`, so the cache was never used.
entity_documents_rdd = filesRdd.map(extractEntitiesSetiment2)
entity_documents_rdd.cache()

entity_documents_info = spark.createDataFrame(entity_documents_rdd, schema1)

In [8]:
# Persist the (ID, ENTITY_SENTIMENT) DataFrame as Parquet, replacing any
# previous output at the same path.
# NOTE(review): this writes to host "spark-master" whereas the reviews are read
# from "sp-master" - confirm both host names are intended.
entity_documents_info.write.mode("overwrite").parquet(
    "hdfs://spark-master:8020/user/lmrd/"+collection+"/"+orientation+"_doc_info2.pq"
)

In [26]:
# Shut down the SparkContext now that the job is finished.
sc.stop()