In [1]:
# Warnings were reviewed during development; for the final report, suppress
# the ones that do not affect the results (e.g. warnings raised by libraries).
import warnings
warnings.simplefilter('ignore')

In [2]:
#my own functions
%load_ext autoreload
%autoreload 2

from utils.py_functions import *
from utils.cleaning_functions import *

In [3]:
import re
import string
from functools import reduce

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, lit
from pyspark.sql.types import *
from wordcloud import WordCloud 

## **Create the Spark session: set the master (local mode) and the application name.**

In [4]:
# Spark configuration: application name and master URL.
# If a context from a previous run must be replaced, call sc.stop() first.
from pyspark import SparkConf
from pyspark.sql import SparkSession

APP_NAME = 'pyspark_python'
MASTER = 'local[*]'

# Build the configuration in one chained expression, then reuse (or create)
# the session and expose its SparkContext as `sc`.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [5]:
## **LOAD DATA**

In [6]:
# Explicit column layout for the review TSV files.
# The original definition had no `review_id` field even though later cells
# read that column, so applying it would have shifted every following column.
# NOTE(review): `review_id` is placed third, matching the published Amazon
# customer-reviews layout -- confirm against the header row of the files.
schema = StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_parent", IntegerType(), True),
    StructField("product_title", StringType(), True),
    StructField("product_category", StringType(), True),
    StructField("star_rating", IntegerType(), True),
    StructField("helpful_votes", IntegerType(), True),
    StructField("total_votes", IntegerType(), True),
    StructField("vine", StringType(), True),
    StructField("verified_purchase", StringType(), True),
    StructField("review_headline", StringType(), True),
    StructField("review_body", StringType(), True),
    StructField("review_date", StringType(), True),
])


def read_reviews_tsv(path, explicit_schema=None):
    """Read one tab-separated review file (with a header row) into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the TSV file.
    explicit_schema : StructType, optional
        Column types to apply.  When omitted (the default), Spark infers the
        types from the data, which is what this notebook has always done.

    Returns
    -------
    pyspark.sql.DataFrame
    """
    reader = (
        spark.read
        .format("csv")
        .option("delimiter", "\t")
        .option("header", "true")
    )
    if explicit_schema is None:
        # Default path: identical to the original cell (inferred column types).
        reader = reader.option("inferSchema", "true")
    else:
        reader = reader.schema(explicit_schema)
    return reader.load(path)


df_video_games = read_reviews_tsv('data/amazon_reviews_us_Digital_Video_Games_v1_00.tsv')
df_software = read_reviews_tsv('data/amazon_reviews_us_Software_v1_00.tsv')
df_digital_software = read_reviews_tsv('data/amazon_reviews_us_Digital_Software_v1_00.tsv')

## **MERGE DATA**

In [7]:
# Stack the three category extracts into a single DataFrame
# (digital software, then software, then video games).
df = df_digital_software.union(df_software).union(df_video_games)

## **ELIMINATE DUPLICATED DATA**

In [None]:
# POLARITY
# Keep one row per (product_title, product_category) pair, then attach the
# value polarity_txt returns for each remaining review body.
# NOTE(review): the original key list named 'product_title' twice, which is
# redundant; if a third key was intended, add it to dedup_keys.
dedup_keys = ['product_title', 'product_category']
polarity_rows = df.dropDuplicates(subset=dedup_keys).rdd.map(
    lambda x: (x["review_id"], x["product_category"], x["product_title"],
               x["review_body"], polarity_txt(x["review_body"])))
df_X = spark.createDataFrame(
    polarity_rows,
    schema=["review_id", "product_category", "product_title", "review_body", "review_body_pol"])

In [None]:
# Print a preview of df_X to eyeball the new review_body_pol column.
df_X.show()

In [None]:
# Remove duplicated rows and report the (rows, columns) shape before and after.
# The original printed the shape of `df` both times, so the effect of the
# de-duplication was never visible.  count() is evaluated by Spark, so the
# data no longer has to be collected to the driver with toPandas().
# NOTE(review): the original key list named 'product_title' twice, which is
# redundant; if a third key was intended, add it to dedup_keys.
dedup_keys = ['product_title', 'product_category']
print((df.count(), len(df.columns)))
df_not_dup = df.dropDuplicates(subset=dedup_keys)
print((df_not_dup.count(), len(df_not_dup.columns)))

## **COMPLETE MISSING VALUES**

In [9]:
# Fill missing product titles with the review text.
# DataFrames are immutable: withColumn() returns a NEW DataFrame, so the
# result has to be bound back to `df`, otherwise the fill is silently lost.
# NOTE(review): an earlier pandas draft of this cell filled the opposite
# direction (review_body <- product_title); confirm which one is intended.
df = df.withColumn("product_title", coalesce(df.product_title, df.review_body))

## **GENERATE NEW FEATURES**

In [10]:
# Feature extractors applied to a product title (a str); each returns a number.
# The ORDER matters: the resulting columns are labelled by position later on.
# All regular expressions are raw strings so that \d and \w reach the regex
# engine unchanged instead of being treated as (invalid) string escapes.
transform_functions = [
    len,                                               # number of characters
    lambda x: x.count(" "),                            # number of spaces
    lambda x: x.count("."),                            # number of full stops
    lambda x: x.count("!"),                            # number of exclamation marks
    lambda x: x.count("?"),                            # number of question marks
    lambda x: len(x) / (x.count(" ") + 1),             # characters per space-separated chunk
    lambda x: x.count(" ") / (x.count(".") + 1),       # spaces per dot-separated chunk
    lambda x: len(re.findall(r"CD|DVD", x)),           # mentions of CD / DVD
    # Ordinals such as 1st, 2nd, 3rd, 4th.  "sd" is kept so that anything the
    # previous pattern matched (e.g. "2sd") is still counted.
    lambda x: len(re.findall(r"\d+(?:st|nd|rd|th|sd)", x)),
    lambda x: len(re.findall(r"[A-Z]", x)),            # uppercase ASCII letters
    lambda x: len(re.findall(r"[0-9]", x)),            # digits
    lambda x: len(re.findall(r"\d{4}", x)),            # runs of four digits
    lambda x: len(re.findall(r"\d$", x)),              # ends with a digit
    lambda x: len(re.findall(r"^\d", x)),              # starts with a digit
    lambda x: len(re.findall(r"[\w]+-[\w]+", x)),      # words joined with "-"
    lambda x: len(re.findall(r"OLD VERSION|Old Version|old version", x)),  # "old version"
]

# Length-only variant of the list above.
transform_functions_len = [
    len,
]

In [11]:
# Only the two columns used below are brought to the driver; converting the
# whole review table with toPandas() is far more work than this step needs.
title_frame = df.select('review_id', 'product_title').toPandas()
df_num = title_frame[['product_title']]

# Apply every extractor to the titles first, then assemble the result with a
# single concat instead of re-copying the growing frame on every iteration.
title_features = [df_num['product_title'].apply(func) for func in transform_functions]
df_num_2 = pd.concat([title_frame[['review_id']]] + title_features, axis=1)

KeyboardInterrupt: 

In [None]:
# Label the feature columns: after 'review_id', the i-th name belongs to the
# i-th entry of transform_functions.
# NOTE(review): some labels do not describe their extractor -- 'title_words'
# holds the number of spaces, 'ratio_spaces_point' holds len / (spaces + 1)
# and 'ratio_len_points' holds spaces / (points + 1).  Confirm before
# renaming, since later code may rely on these names.
title_feature_names = [
    'title_len',
    'title_words',
    'title_points',
    'title_exc',
    'title_int',
    'ratio_spaces_point',
    'ratio_len_points',
    'title_cd',
    'title_th',
    'title_upper_letters',
    'title_numbers',
    'title_years',
    'end_number',
    'starts_number',
    'word_sep',
    'title_old_version',
]
df_num_2.columns = ['review_id'] + title_feature_names

## **CLEAN FEATURE: review_body**

## **Is it an informative feature or not?**

In [None]:
# Pair every review with the value polarity_txt returns for its body, then
# turn the resulting tuples back into a DataFrame with named columns.
scored_reviews = df.rdd.map(
    lambda row: (
        row["review_id"],
        row["product_category"],
        row["product_title"],
        row["review_body"],
        polarity_txt(row["review_body"]),
    )
)
df_X = spark.createDataFrame(
    scored_reviews,
    schema=["review_id", "product_category", "product_title", "review_body", "review_body_pol"],
)

In [None]:
# Print the first 5 rows of df_X as a sanity check.
df_X.show(5)

In [None]:
from pyspark.sql import functions as F

# Count the reviews that share each product title; the aggregate column keeps
# the default name Spark derives from the expression.
title_count = F.count('product_title')
df_product_title = df_X.groupBy('product_title').agg(title_count)

In [None]:
# Print the first 5 per-title counts as a sanity check.
df_product_title.show(5)

In [None]:
# Attach the per-title review count to every review (left join keeps all
# rows of df_X).  Joining on the column NAME yields a single product_title
# column; joining on an equality expression kept both copies, which makes any
# later reference to product_title ambiguous.
df2 = df_X.join(df_product_title, on='product_title', how='left')

In [None]:
# Print the first 5 rows of the joined DataFrame as a sanity check.
df2.show(5)

In [None]:
# (rows, columns) of df_X, computed by Spark instead of collecting the whole
# DataFrame to the driver with toPandas() just to read its shape.
(df_X.count(), len(df_X.columns))

In [None]:
def product_title_cleaning(df):
    """Clean the ``review_body`` text of every row of ``df``.

    Despite its name, this function rewrites ``review_body``; ``review_id``,
    ``product_category`` and ``product_title`` are passed through unchanged.

    The project helpers are applied to each review body in this order:

    1. ``fix_abbreviation`` -- expand contractions (I'm -> I am)
    2. ``tag_and_remove``   -- keep only the nouns of the text
    3. ``lemitizeWords``    -- lemmatization
    4. ``clean_text``       -- general text clean-up
    5. ``spell_correction`` -- spelling correction

    All five steps run inside one pass over the rows, so the data is converted
    back into a DataFrame once instead of after every step.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must provide the columns ``review_id``, ``product_category``,
        ``product_title`` and ``review_body``.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``review_id``, ``product_category``, ``product_title`` and the
        cleaned ``review_body``.  Built with the module-level ``spark`` session.
    """
    cleaning_steps = (
        fix_abbreviation,
        tag_and_remove,
        lemitizeWords,
        clean_text,
        spell_correction,
    )

    def _clean_row(row):
        # Thread the review body through every step, in order.
        body = row["review_body"]
        for step in cleaning_steps:
            body = step(body)
        return (row["review_id"], row["product_category"], row["product_title"], body)

    return spark.createDataFrame(
        df.rdd.map(_clean_row),
        schema=["review_id", "product_category", "product_title", "review_body"],
    )