# TensorIOT Data Assignment 1

## Submitted by : Deepansh Singh
### Email - deepanshsingh8@gmail.com

Data Assignment 1

- Download the data files from http://jmcauley.ucsd.edu/data/amazon/links.html

- Install Apache Spark and any other necessary tools locally.

- Download a review file with a million reviews.

- Using a Jupyter notebook, create a program that reads the million reviews and reports the following:

- The item with the lowest rating.

- The item with the highest rating.

- The item with the longest review.

- Transform: change the date to MM-DD-YYYY format.

- Show a data frame operation that you learnt recently.

- Convert the whole file into a Parquet file after transforming it.

- Upload the code to GitHub and complete a README.md that anyone can understand.

- Send the GitHub link to HR.

### Method - 1 for reading Data

In [None]:
from pyspark.sql import SparkSession
import pandas as pd

# Start (or reuse) a local Spark session configured with 15g of driver memory
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .appName('AmazonReview')
    .getOrCreate()
)

# Location of the JSON review file
json_file_path = "Grocery_and_Gourmet_Food.json"

# Load the reviews into a Spark DataFrame, then materialise them in pandas
spark_df = spark.read.json(json_file_path)
pandas_df = spark_df.toPandas()

# Preview the leading rows of the pandas copy
print(pandas_df.head())

### Method - 2 for reading data

In [None]:
import pandas as pd
import gzip
import json

def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip file.

  Args:
    path: Location of a gzip-compressed file holding one JSON document
      per line.

  Yields:
    The Python object produced by `json.loads` for each line.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The `with` block closes the handle when the generator is exhausted,
  # closed early, or garbage-collected, instead of leaking it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Tolerate blank lines (e.g. a trailing newline at end of file).
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Load every record produced by `parse(path)` into a pandas DataFrame.

  Records are keyed by their 0-based position in the stream, and those
  keys become the DataFrame's index.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

df = getDF('Grocery_and_Gourmet_Food.json.gz')

## Exploratory analysis

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.drop(['vote', 'style', 'image'], axis = 1, inplace = True)

In [None]:
df['asin'].value_counts()

#  Item having the least rating

In [None]:
# Mean 'overall' score per 'asin', flattened back into a two-column frame
mean_overall_by_asin = df.groupby(['asin'])['overall'].mean()
Average_ratings = mean_overall_by_asin.reset_index()

In [None]:
# Row of the review located by idxmin() on the 'overall' column
lowest_overall_label = df['overall'].idxmin()
least_rated_item = df.loc[lowest_overall_label]
least_rated_item

In [None]:
# Lowest average rating first; ties are ordered by ascending product id
least_rated_items = (
    Average_ratings
    .sort_values(['overall', 'asin'], ascending=[True, True])
    .rename(columns={'asin': 'ProductID', 'overall': 'AverageRating'})
)

In [None]:
# Top 5 least rated items
least_rated_items[:5]

#  Item having the Highest rating

In [None]:
# Row of the review located by idxmax() on the 'overall' column
highest_overall_label = df['overall'].idxmax()
most_rated_item = df.loc[highest_overall_label]
most_rated_item

In [None]:
# Reuse the Average_ratings frame: highest average rating first,
# ties ordered by ascending product id
highest_rated_items = (
    Average_ratings
    .sort_values(['overall', 'asin'], ascending=[False, True])
    .rename(columns={'asin': 'ProductID', 'overall': 'AverageRating'})
)

In [None]:
# Top 5 highest rated items
highest_rated_items[:5]

# Item having longest Review

In [None]:
df['stringreview'] = df['reviewText'].astype(str)

In [None]:
df['stringreview'].astype(str)

In [None]:
# Row label of the review whose text, cast to str, has the most characters
review_char_counts = df['reviewText'].astype(str).apply(len)
index_longest_review = review_char_counts.idxmax()

In [None]:
#Item with details having longest review
item_with_longest_review = df.loc[index_longest_review]
item_with_longest_review

In [None]:
length = len(item_with_longest_review['stringreview'])
#Length of the review
length

# Converting the review timestamp (unixReviewTime) to MM-DD-YYYY format

In [None]:
# Read 'unixReviewTime' as seconds, then render each value as MM-DD-YYYY text
review_timestamps = pd.to_datetime(df['unixReviewTime'], unit='s')
df['reviewDate'] = review_timestamps.dt.strftime('%m-%d-%Y')

In [None]:
df.drop(['reviewTime', 'unixReviewTime', 'reviewText'], axis = 1, inplace = True)

In [None]:
df = df.rename(columns = {'stringreview':'Review'})

# A data frame operation I learnt recently

In [None]:
# Print the first 2000 reviews joined into one space-separated string
first_2000_reviews = df['Review'].tolist()[:2000]
print(' '.join(first_2000_reviews))

In [None]:
!pip install contractions
import contractions
def cont_to_exp(x):
    """Strip noise fragments from a review and expand its contractions.

    Values that are not exactly of type `str` are returned unchanged. For
    strings, backslashes, '...' runs, ' - ' separators and four-space runs
    are removed (in that order) before the text is handed to
    `contractions.fix`.
    """
    if type(x) is not str:
        return x
    cleaned = x
    for fragment in ('\\', '...', ' - ', '    '):
        cleaned = cleaned.replace(fragment, '')
    return contractions.fix(cleaned)

In [None]:
# Run the cleaning / contraction-expansion helper over every review
df['Review'] = df['Review'].apply(cont_to_exp)

In [None]:
# Print the first 1000 cleaned reviews joined into one space-separated string
first_1000_reviews = df['Review'].tolist()[:1000]
print(' '.join(first_1000_reviews))

In [None]:
# Feature Engineering
!pip install TextBlob
from textblob import TextBlob

## Uses prebuilt library to calculate polarity of sentiment
df['polarity'] = df['Review'].apply(lambda x: TextBlob(x).sentiment.polarity)
df.head()

In [None]:
## Number of characters in each review
df['Character_len'] = df['Review'].apply(len)

## Number of whitespace-separated words in each review
df['Narrative_len'] = df['Review'].apply(lambda text: len(text.split()))

In [None]:
df.head()

In [None]:
## Average word length is computed over whitespace-split words, so spaces are not counted
def avg_word_len(x):
    """Return the mean length of the whitespace-separated words in `x`.

    Returns None when `x` contains no words at all.
    """
    tokens = x.split()
    if not tokens:
        return None
    return sum(map(len, tokens)) / len(tokens)

In [None]:
# Mean word length for every review (the helper is passed directly, no lambda)
df['avg_word_len'] = df['Review'].apply(avg_word_len)
df.head()

# Distribution of Sentiment Polarity

In [None]:
!pip install cufflinks

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import cufflinks as cf
from plotly.offline import iplot
%matplotlib inline

In [None]:
py.offline.init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
df['polarity'].iplot(kind = 'hist', colors = 'red', bins = 20,title = 'Sentiment Polarity Distribution')

# Distribution of Narrative Length and Character Length

In [None]:
df['Character_len'].iplot(kind = "hist",bins = 50, colors = 'red', xTitle = "Character Length in Narrative", yTitle = "Count", title = "Narrative Text Character Distribution")

In [None]:
df['Narrative_len'].iplot(kind = "hist",bins = 50, colors = 'red', xTitle = "Narrative Length", yTitle = "Count", title = "Narrative Length Distribution")

In [None]:
df['avg_word_len'].iplot(kind = "hist", colors = 'red', xTitle = "Avg Word Length", bins = 50, yTitle = "Count", title = "Narrative Text Length Avg Word Distribution")

# Saving to a Parquet file
## Further bigram and trigram analysis is possible, but it is out of scope for this assignment

In [None]:
# Write the transformed frame to Parquet; index=False is passed to to_parquet
parquet_path = 'Groceries_and_Gourmet_Food.parquet'
df.to_parquet(parquet_path, index=False)

In [None]:
df