In [None]:
# https://stackoverflow.com/questions/55094320/how-to-read-a-large-tsv-file-in-python-and-convert-it-to-csv

In [None]:
import os
import sys

# Set both PySpark interpreter environment variables to the Python
# executable that is running this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType,BinaryType

In [None]:
# sc.stop

In [None]:
# Create (or reuse) the SparkSession used by the rest of the notebook.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config('spark.executor.instances', 3)
    .appName("Amazon Data Analysis")
    .getOrCreate()
)

# SQLContext's first positional argument is a SparkContext, not a SparkSession,
# so pass the session's context explicitly and the session itself second.
sqlContext = SQLContext(spark.sparkContext, spark)


In [None]:
# Read the tab-separated reviews file: first row is the header, and column
# types are inferred from the data.
df = spark.read.csv(
    "amazon_reviews_us_Baby_v1_00.tsv",
    sep="\t",
    header=True,
    inferSchema=True,
)


In [None]:
# Expose the DataFrame to Spark SQL under the view name "df".
# createOrReplaceTempView is the DataFrame-level API and does not require the
# deprecated SQLContext wrapper.
df.createOrReplaceTempView("df")

In [None]:
# Preview the first five rows of the loaded DataFrame.
df.show(n=5)

In [None]:
# Number of columns (variables) in the DataFrame
num_cols = len(df.schema.names)
print(num_cols)

In [None]:
# Total number of rows in the DataFrame (displayed as the cell's output)
df.count()

In [None]:
# First five rows, restricted to the first eight columns.
df.select(*df.columns[:8]).show(n=5)

In [None]:
# First five rows, restricted to the columns from the ninth one onwards.
df.select(*df.columns[8:]).show(n=5)

In [None]:
# Remove repetitive/unnecessary columns. drop() ignores names that are not
# present, so re-running this cell is harmless.
df = df.drop('marketplace', 'vine')

# Re-register the SQL view so that queries against "df" see the trimmed
# DataFrame rather than the one registered before the columns were removed.
df.createOrReplaceTempView("df")


In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Per-column 0/1 indicator: 1 where the value is null, 0 otherwise.
null_flag_exprs = [
    F.col(name).isNull().cast('int').alias(name)
    for name in df.columns
]
null_counts = df.select(*null_flag_exprs)

# Add up the indicators column by column to get the null count per column.
null_sum_exprs = [F.sum(name).alias(name) for name in null_counts.columns]
total_null_counts = null_counts.agg(*null_sum_exprs)

# One-row table: number of nulls in each column.
total_null_counts.show()


In [None]:
# Row counts per distinct review_date; display only seven of the groups.
df.groupBy('review_date').count().show(n=7)

In [None]:
# Count the rows of the "df" view per year, where the year is taken as the
# first four characters of review_date, and order the result by year ascending.
sql_query = """
SELECT
    SUBSTRING(review_date, 1, 4) AS year,
    COUNT(*) AS count
FROM
    df
GROUP BY
    SUBSTRING(review_date, 1, 4)
ORDER BY
    year ASC
"""
result_df = spark.sql(sql_query)

# Display the per-year counts.
result_df.show()

In [None]:
# Print the DataFrame's schema (column names and their types).
df.printSchema()

In [None]:
# from lib.YearPlotter import YearPlotter
# k=5
# _title='TOBS for %s / %d'%(station,year)
# fig, ax = plt.subplots(figsize=_figsize);
# YP=YearPlotter()
# YP.plot(M[:k,:366].T,fig,ax,title=_title,labels=_labels)# ,labels=labels);
# ylabel('temp in centigrade');

In [None]:
# Select every row of the "df" view whose review_date falls in 2015.
# NOTE: this rebinds sql_query and result_df, which an earlier cell also assigns.
sql_query = """
    SELECT * 
    FROM df
    WHERE YEAR(review_date) = 2015
"""

# Execute the SQL query
result_df = spark.sql(sql_query)

# Show only the first row of the result DataFrame
result_df.show(1)


In [None]:
# Execute whatever query text sql_query currently holds. It is assigned in an
# earlier cell; presumably the 2015 filter, provided the cells run in order.
result_df = spark.sql(sql_query)

# Count the number of rows in the result DataFrame
result_count = result_df.count()

# Print the row count with its label
print("Count of review dates in 2015:", result_count)


In [None]:
import matplotlib.pyplot as plt

def plot_star_rating_distribution(df, df_name):
    """
    Plot the distribution of star ratings for a given DataFrame.

    Each Amazon category is expected to live in its own DataFrame
    (e.g. baby = df, toy = df1).

    Args:
    - df: The DataFrame containing the star ratings; it is grouped by its
      'star_rating' column.
    - df_name: Label for the DataFrame, appended to the plot title.

    Returns:
    - None
    """
    # Count the number of occurrences for each star rating
    star_rating_counts = df.groupBy('star_rating').count().orderBy('star_rating')

    # Collect the aggregated result exactly once: each toPandas() call makes
    # Spark evaluate the aggregation again.
    counts_pdf = star_rating_counts.toPandas()

    # Plot the counts using a bar plot
    plt.figure(figsize=(10, 6))
    plt.bar(counts_pdf['star_rating'], counts_pdf['count'])
    plt.title(f'Star Rating Distribution {df_name}')
    plt.xlabel('Star Rating')
    plt.ylabel('Number of Reviews')
    plt.show()


In [None]:
# Star-rating histogram for the Baby category.
plot_star_rating_distribution(df, df_name='Baby')


In [None]:
from pyspark.sql.functions import year, substring

def plot_purchase_counts(df, df_name, target_year):
    """
    Plot the monthly row counts for a given DataFrame and year.

    The rows are filtered on the year of 'review_date', grouped by the
    two-character month taken from 'review_date', and the per-month row
    counts are drawn as a bar chart labelled as purchases.

    Args:
    - df: The DataFrame containing the review data (needs a 'review_date' column).
    - df_name: Label for the DataFrame, used in the plot title.
    - target_year: The year for which the counts will be plotted.

    Returns:
    - None
    """
    # Filter the DataFrame to include only rows from the specified year
    df_year = df.filter(year(df['review_date']) == target_year)

    # Extract the month (characters 6-7 of the date text) from 'review_date'
    df_year = df_year.withColumn('review_month', substring(df_year['review_date'], 6, 2))

    # Count the number of rows for each month
    purchase_counts = df_year.groupby('review_month').count().orderBy('review_month')

    # Collect the aggregated result exactly once: each toPandas() call makes
    # Spark evaluate the filter and aggregation again.
    counts_pdf = purchase_counts.toPandas()

    # Plot the counts using a bar plot
    plt.figure(figsize=(10, 6))
    plt.bar(counts_pdf['review_month'], counts_pdf['count'])
    plt.title(f'Purchase Counts for {df_name} Year {target_year}')
    plt.xlabel('Month')
    plt.ylabel('Number of Purchases')
    plt.show()


In [None]:
# Monthly count plots for the Baby category, one figure per year of interest.
for review_year in (2015, 1999):
    plot_purchase_counts(df, "Baby", review_year)
