In [20]:
from pyspark.sql import SparkSession
from operator import add
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import corr, length, col, when
import time
import matplotlib.pyplot as plt
from pyspark.sql.functions import avg

# Change spark.cores.max to set the total number of cores used. Keep spark.executor.cores at 1: the small VMs have only one core each, and asking for more causes a resource issue.

# Build (or reuse) the Spark session for the standalone cluster.
# Resource allocation is static: dynamic allocation and the external shuffle
# service are disabled, each set exactly once. No other
# spark.dynamicAllocation.* options are set, because those only apply when
# dynamic allocation is enabled.
spark_session = (
    SparkSession.builder
    .master("spark://de-i-19:7077")
    .appName("Project_Group19")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.shuffle.service.enabled", "false")
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .config("spark.executor.cores", 1)    # one core per executor
    .config("spark.cores.max", 4)         # total cores for this application
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

# RDD API
spark_context = spark_session.sparkContext

# Only warnings and errors are logged; switch to "INFO" for more verbose logs.
spark_context.setLogLevel("WARN")

In [21]:
from pyspark.sql import functions as F  

# Load Data

In [22]:
# Read the tab-separated reviews file from HDFS: the first row is the header
# and column types are inferred by Spark.
reviews_path = "hdfs://192.168.2.246:9000/user/ubuntu/amazon_reviews_us_Baby_v1_00.tsv"
data = (
    spark_session.read
    .options(sep="\t", header=True, inferSchema=True)
    .csv(reviews_path)
)
data.printSchema()



root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)



                                                                                

In [23]:
# Preview the first 20 rows of the loaded reviews
data.show(n=20)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|    9970739| R8EWA1OFT84NX|B00GSP5D94|     329991347|Summer Infant Swa...|            Baby|          5|            0|          0|   N|                Y|Great swaddled bl...|Loved these swadd...|2015-08-31 00:00:00|
|         US|   23538442|R2JWY4YRQD4FOP|B00YYDDZGU|     646108902|Pacifier Clip Gir...| 

In [None]:
# Word frequencies over all review bodies: split each body on single spaces,
# emit one row per token, count the tokens and list the most frequent first.
review_tokens = F.split(F.col('review_body'), ' ')
word_counts = (
    data
    .withColumn('word', F.explode(review_tokens))
    .groupBy('word')
    .count()
    .orderBy(F.col('count').desc())
)
word_counts.show()



In [None]:
from pyspark.sql.functions import col, explode, split, lower

# Words (and tokenisation leftovers) to exclude from the word-frequency table.
# Every entry is listed exactly once, in order of first appearance.
# Besides common English function words, the list contains '' (the token that
# consecutive spaces produce when splitting on a single space) and the raw
# tokens '/><br' and 'it.', which survive because the text is split on spaces
# only.
common_words = [
    'the', 'and', 'to', 'i', 'a', 'it', 'is', 'for', 'of', 'in', 'this', 'my', 'that', 'with', 'on', 'was', 'have', 'but', 'so',
    'we', 'not', 'are', 'you', 'as', 'very', 'they', 'when', 'be', 'one', 'just', 'would', 'at', 'great', "it's", 'can', 'like', 'our',
    '', 'had', 'these', 'if', 'up', 'she', 'he', 'or', 'out', 'use', 'has', 'all', 'her', 'get', 'from', '/><br', 'it.', 'them',
    'because', 'an', 'while',
    'by', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below',
    'down', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there',
    'where', 'why', 'how', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'no', 'nor',
    'only', 'own', 'same', 'than', 'too', 's', 't',
    'will', 'don', 'should', 'now', 'your', 'really', 'much', 'do', 'also', 'his',
]

# Lower-case review_body before tokenising so the stop-word filter is
# case-insensitive, then count the remaining words, most frequent first.
lowered_tokens = split(lower(col('review_body')), ' ')
filtered_data = (
    data
    .withColumn('word', explode(lowered_tokens))
    .where(~col('word').isin(common_words))
    .groupBy('word')
    .count()
    .orderBy(col('count').desc())
)

# Display the most frequent remaining words
filtered_data.show()


# Sentiment analysis
Reference: https://www.johnsnowlabs.com/sentiment-analysis-with-spark-nlp-without-machine-learning/

In [6]:
# Shut down the Spark session created earlier in this notebook; the next cell
# starts a new session through sparknlp.start().
# NOTE(review): `data` was loaded through this session — presumably it has to
# be reloaded before later cells can evaluate it; confirm.
spark_session.stop()

In [7]:
import sparknlp
# Start a Spark session through Spark NLP's start() helper.
# NOTE(review): `spark` is not referenced by any later cell visible in this
# notebook (later cells keep using `data`) — confirm which session they are
# meant to run on.
spark = sparknlp.start() # start spark session


In [13]:
import nltk
# Fetch the NLTK resources used by the sentiment cells below:
# - 'opinion_lexicon': the positive / negative word lists
# - 'punkt': tokenizer data (presumably what nltk.word_tokenize relies on)
nltk.download('opinion_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
from nltk.corpus import opinion_lexicon

# Positive and negative opinion words, stored as sets for membership tests
pos_list = {entry for entry in opinion_lexicon.positive()}
neg_list = {entry for entry in opinion_lexicon.negative()}

In [14]:
# Function to classify words
def classify_word(word):
    """Label a single word using the opinion-lexicon sets.

    Returns 'positive' when ``word`` is in ``pos_list``, otherwise 'negative'
    when it is in ``neg_list``, otherwise 'neutral'. The lookup is an exact
    match; no case folding is done here.
    """
    if word in pos_list:
        return 'positive'
    return 'negative' if word in neg_list else 'neutral'

# Sample review text used to demonstrate the word-level classifier
review_body = "The product is good and affordable, but the delivery was late and the packaging was damaged."

# Split the sample into tokens with NLTK's tokenizer
words = nltk.word_tokenize(review_body)

# Map every distinct token to its sentiment label (repeated tokens share one entry)
word_sentiments = dict(zip(words, map(classify_word, words)))

# Print one "token: label" line per entry
for word, sentiment in word_sentiments.items():
    print(word, sentiment, sep=": ")

The: neutral
product: neutral
is: neutral
good: positive
and: neutral
affordable: positive
,: neutral
but: neutral
the: neutral
delivery: neutral
was: neutral
late: neutral
packaging: neutral
damaged: negative
.: neutral


In [19]:
from nltk.corpus import opinion_lexicon
import nltk
from collections import Counter

# Make sure the opinion lexicon corpus is available locally
nltk.download('opinion_lexicon')

# Build the positive and negative word sets from the opinion lexicon
pos_list = {*opinion_lexicon.positive()}
neg_list = {*opinion_lexicon.negative()}

# Function to classify the sentiment of a word
def classify_word(word):
    """Return the sentiment label of one word.

    The label is 'positive' for members of ``pos_list``, 'negative' for
    members of ``neg_list`` (checked only if the word is not positive), and
    'neutral' for everything else.
    """
    label = 'neutral'
    if word in pos_list:
        label = 'positive'
    elif word in neg_list:
        label = 'negative'
    return label

# Function to classify the sentiment of a sentence
def classify_sentence(sentence):
    """Return the most frequent word-level sentiment label in ``sentence``.

    The text is lower-cased, tokenized with ``nltk.word_tokenize`` and every
    token is labelled by ``classify_word``; the label that occurs most often
    wins. On a tie, the label that appeared first in the text is returned.

    Args:
        sentence: Review text, or ``None`` for a missing review body.

    Returns:
        'positive', 'negative' or 'neutral'. Missing, empty or
        whitespace-only input yields 'neutral' instead of raising.

    NOTE(review): 'neutral' competes with the other two labels, so text made
    up mostly of non-lexicon words is labelled 'neutral' — confirm that this
    is the intended scoring.
    """
    # Null or blank review bodies have no tokens to vote with.
    if sentence is None or not sentence.strip():
        return 'neutral'

    words = nltk.word_tokenize(sentence.lower())
    if not words:
        return 'neutral'

    # Tally one label per token; Counter keeps first-seen order, which is
    # what max() falls back on when counts are tied.
    counts = Counter(classify_word(word) for word in words)
    return max(counts, key=counts.get)

# Wrap the Python classifier as a Spark UDF (string result by default).
# NOTE: Python UDFs run in Python worker processes, which must use the same
# Python minor version as the driver (see PYSPARK_PYTHON / PYSPARK_DRIVER_PYTHON).
sentiment_udf = F.udf(classify_sentence)

# Add one sentiment label per review, computed from the 'review_body' column of `data`
df_with_sentiment = data.withColumn(
    'sentence_sentiment',
    sentiment_udf(data.review_body),
)

# Display the first rows without truncating long column values
df_with_sentiment.show(truncate=False)


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[Stage 3:>                                                          (0 + 1) / 1]

24/03/05 23:36:30 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 9)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 540, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.8 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(Interruptible

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 540, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.8 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.
