In [1]:
import sparknlp
import logging
sparknlp.start()
import numpy as np

from sparknlp import *
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, udf
from pyspark.sql.types import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.regression import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from sklearn.ensemble import VotingClassifier
from pyspark.sql.types import DoubleType



In [2]:
spark = SparkSession.builder.appName('Spark-Sentiment').getOrCreate()
logger = spark.sparkContext._jvm.org.apache.log4j
logger.LogManager.getLogger("org.apache.spark.scheduler").setLevel(logger.Level.ERROR)
logging.getLogger("py4j").setLevel(logging.ERROR)
spark.sparkContext.setLogLevel("ERROR")


In [3]:
spark.conf.set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
spark.conf.set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", "path/to/your/credentials.json")

In [4]:
model_path = 'gs://msca-bdp-student-gcs/Group4_Project_Data/models/LDA_model'

In [5]:
from pyspark.ml.clustering import LocalLDAModel

# Load trained LDA model
lda_model = LocalLDAModel.load(model_path)

                                                                                

In [6]:
# Read the CSV file from GCS
df2 = spark.read.format("csv").option("header", "true").\
                option("delimiter", "\t").load("gs://msca-bdp-student-gcs/Group4_Project_Data/amazon_reviews_us_Digital_Music_Purchase_v1_00.tsv")
df2 = df2.dropna()

##place to use k-means
df2 = df2.withColumn("star_rating",df2.star_rating.cast('int'))
df2 = df2.withColumn('sentiment', when(col('star_rating') <= 3, 'negative').otherwise('positive'))

In [7]:
#pipeline
tokenizer=Tokenizer(inputCol="review_body", outputCol="review_body_words")
remover = StopWordsRemover(inputCol="review_body_words", outputCol="review_body_words_filtered")
hashingTF = HashingTF(inputCol="review_body_words_filtered", outputCol="hashingTF_features")
idf = IDF(inputCol="hashingTF_features", outputCol="idf_features")
labelIndexer = StringIndexer(inputCol="sentiment", outputCol="sentiment_label")

pipeline = Pipeline(stages=[tokenizer,remover,hashingTF,idf,labelIndexer])

In [8]:
countVectorizer = CountVectorizer(inputCol="review_body_words_filtered", outputCol="raw_features")

In [9]:
preprocessed_df_2=pipeline.fit(df2).transform(df2)

                                                                                

In [10]:
cv_model_2=countVectorizer.fit(preprocessed_df_2)

                                                                                

In [11]:
data2=cv_model_2.transform(preprocessed_df_2)
data2=data2.withColumnRenamed("raw_features","features")

In [12]:
predictions=lda_model.transform(data2)

In [13]:
topics=lda_model.describeTopics(maxTermsPerTopic=3)

In [14]:
# extract the top term indices for each topic
top_terms = topics.select("termIndices").collect()

In [15]:
# show the top terms for each predicted topic
for i, row in enumerate(predictions.select("review_body", "topicDistribution").collect()):
    if i>=10:
        break
    print("Review {}: {}".format(i, row.review_body))
    for j in range(len(top_terms)):
        print("Top terms for Topic {}: {}".format(j, [cv_model_2.vocabulary[int(row.topicDistribution.argmax())]]))

                                                                                

Review 0: Great  rendition. Great  song
Top terms for Topic 0: ['music']
Top terms for Topic 1: ['music']
Top terms for Topic 2: ['music']
Top terms for Topic 3: ['music']
Top terms for Topic 4: ['music']
Top terms for Topic 5: ['music']
Top terms for Topic 6: ['music']
Top terms for Topic 7: ['music']
Top terms for Topic 8: ['music']
Top terms for Topic 9: ['music']
Review 1: A good music to listen.
Top terms for Topic 0: ['songs']
Top terms for Topic 1: ['songs']
Top terms for Topic 2: ['songs']
Top terms for Topic 3: ['songs']
Top terms for Topic 4: ['songs']
Top terms for Topic 5: ['songs']
Top terms for Topic 6: ['songs']
Top terms for Topic 7: ['songs']
Top terms for Topic 8: ['songs']
Top terms for Topic 9: ['songs']
Review 2: Not as good as the original but I needed to hear the parts and they were more easily heard in this one.
Top terms for Topic 0: ['like']
Top terms for Topic 1: ['like']
Top terms for Topic 2: ['like']
Top terms for Topic 3: ['like']
Top terms for Topic 4: [