In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType

In [13]:
# Obtain the SparkSession used by every cell below; getOrCreate() hands back
# an already-active session instead of building a second one.
spark = (
    SparkSession.builder
    .appName('Wrangling Data')
    .getOrCreate()
)

In [14]:
import os
# Ask PySpark to pull the AWS SDK and hadoop-aws connector when it launches.
# NOTE(review): this runs after the SparkSession cell (In [13] precedes
# In [14]); the variable is presumably only read when the JVM is started, so
# it likely has no effect on the existing session — confirm and move this
# ahead of session creation if the S3 connector is actually needed.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join((
    '--packages',
    'com.amazonaws:aws-java-sdk-pom:1.10.34,org.apache.hadoop:hadoop-aws:2.7.2',
    'pyspark-shell',
))
from pyspark.sql import SQLContext
from pyspark import SparkContext
#sc = SparkContext()
#sqlContext = SQLContext(sc)
#filePath = "s3a://yourBucket/yourFile.parquet"
#df = sqlContext.read.parquet(filePath)

In [23]:
# Load the raw author CSV and expose it to Spark SQL as the `author` view.
# header=True makes Spark take column names from the first row; the queries
# below reference columns by name, which cannot resolve against the default
# _c0, _c1, ... names (assumes the file has a header row — TODO confirm).
# NOTE(review): 's3:///...' has an empty bucket component; confirm the intended
# bucket and scheme (the commented-out sample in the setup cell uses
# 's3a://bucket/key').
author_df = spark.read.csv('s3:///bloggers-data/author/authors.csv', header=True)
author_df.createOrReplaceTempView('author')

# Core author attributes.
author_table = spark.sql("""
select author_id,
        name,
        meibi,
        meibix
from author
""")
# Was `author_table_df`, a name that is never defined.
# TODO: supply the real output location — an empty path is not a valid target.
author_table.write.parquet('')

# Per-author word-count statistics. The source column names contain spaces and
# parentheses, so they must be backtick-quoted in SQL; they are aliased to
# plain identifiers because Parquet writers in older Spark releases reject
# column names containing those characters.
word_count_table = spark.sql("""
select author_id,
        name,
        `Average Number of Words in posts` as avg_words_per_post,
        `Average Number of Words in posts (without stopwords)` as avg_words_per_post_no_stopwords
from author
""")
# TODO: supply the real output location (must differ from the author table's).
word_count_table.write.parquet('')

In [None]:
# Load the raw post CSV and expose it to Spark SQL as the `post` view.
# header=True is needed so the named columns selected below resolve (assumes
# the file has a header row — TODO confirm).
# NOTE(review): 's3:///...' has an empty bucket component; confirm the intended
# bucket and scheme.
post_df = spark.read.csv('s3:///bloggers-data/author/post.csv', header=True)
# The view was registered as 'posts' while the query reads from `post`;
# use the singular name, matching the `author` and `comment` views.
post_df.createOrReplaceTempView('post')

post_table = spark.sql("""
select post_id,
        title,
        blogger_name,
        blogger_id,
        number_of_comments,
        content,
        url,
        date,
        number_of_retrieved_comments
from post
""")

# Was `post_table_df`, a name that is never defined.
# TODO: supply the real output location — an empty path is not a valid target.
post_table.write.parquet('')




In [None]:
# Load the raw comment CSV and expose it to Spark SQL as the `comment` view.
# header=True is needed so the named columns selected below resolve (assumes
# the file has a header row — TODO confirm).
# NOTE(review): 's3:///...' has an empty bucket component; confirm the intended
# bucket and scheme.
comment_df = spark.read.csv('s3:///bloggers-data/author/comment.csv', header=True)
comment_df.createOrReplaceTempView('comment')
# `aurthor` was a misspelling: the staging query later in the notebook reads
# this column as `c.author` (verify against the CSV header).
comment_table = spark.sql("""
select comment_id,
        post_id,
        content,
        author,
        date,
        vote
from comment
""")
# TODO: supply the real output location — an empty path is not a valid target.
comment_table.write.parquet('')

In [None]:
# Reload both sources so this cell does not depend on views registered by
# earlier cells. header=True is needed so the named columns used below resolve
# (assumes both files have a header row — TODO confirm).
# NOTE(review): 's3:///...' has an empty bucket component; confirm the intended
# bucket and scheme.
comment_df = spark.read.csv('s3:///bloggers-data/author/comment.csv', header=True)
author_df = spark.read.csv('s3:///bloggers-data/author/authors.csv', header=True)
comment_df.createOrReplaceTempView('comment')
author_df.createOrReplaceTempView('author')


# Attach the author's id to each comment by matching the comment's author
# string to the author's name. `sentiment` is no longer selected here: it is
# derived from the comment text further down, and neither view is shown to
# provide such a column.
review_staging_df = spark.sql("""
select a.author_id,
        c.author,
        c.post_id,
        c.comment_id,
        c.date,
        c.content
from comment c
join author a on (a.name=c.author)
""")


# Build the sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

def get_sentiment_analysis_score(sentence):
    """Return the VADER compound polarity score for ``sentence``.

    Args:
        sentence: Text to score, or None.

    Returns:
        The ``'compound'`` entry of ``analyser.polarity_scores(sentence)``,
        or None when ``sentence`` is None.
    """
    # A null value (e.g. an empty CSV field) cannot be scored; propagate it
    # instead of letting the analyzer raise.
    if sentence is None:
        return None
    score = analyser.polarity_scores(sentence)
    return score['compound']

def get_sentiment_analysis_result(score):
    """Map a numeric sentiment score to a categorical label.

    Args:
        score: Numeric score, or None.

    Returns:
        "POSITIVE" when ``score >= 0.05``, "NEGATIVE" when ``score <= -0.05``,
        "NEUTRAL" for anything in between, and None when ``score`` is None.
    """
    # A missing score has no label; comparing None with a float would raise.
    if score is None:
        return None
    if score >= 0.05:
        return "POSITIVE"
    elif score <= -0.05:
        return "NEGATIVE"
    else:
        return "NEUTRAL"
    
# Wrap the Python helpers as Spark UDFs. F, DoubleType and StringType are
# provided by the notebook's import cell.
get_sentiment_analysis_score_udf = F.udf(get_sentiment_analysis_score, DoubleType())
get_sentiment_analysis_result_udf = F.udf(get_sentiment_analysis_result, StringType())

# Score each staged comment, label it, and write the result as parquet.
# The scored column is `content` and the selected columns are the ones the
# staging query produces; the previous names (text, review_id, user_id,
# business_id, stars) do not exist in review_staging_df.
comment_review_table_df = (
    review_staging_df
    .withColumn("sa_score", get_sentiment_analysis_score_udf("content"))
    .withColumn("sentiment", get_sentiment_analysis_result_udf("sa_score"))
    .select(
        "comment_id",
        "author_id",
        "author",
        "post_id",
        "date",
        "content",
        "sentiment")
)
# TODO: supply the real output location — an empty path is not a valid target.
comment_review_table_df.write.parquet("")

# The trailing `comment_table.write.parquet('')` was dropped: that table is
# already written by the cell that builds it, and `comment_table` is not
# defined anywhere in this cell.