In [2]:
#Import neccessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Word Count Comparison 

### Objective: comparing the word count of the individual products with the word count of this category level
### Outline
1. filter columns
2. create total_word metrics
3. cut word bins with ranges
4. Aggregate across products
5. Merge product and category
6. Merge by joining it on category_id



In [None]:
# Filter column of interest
df= pd.read_csv("RSC_reviews_with_category.csv")
sales = pd.read_csv("Sales.csv")
df = df[['product','text','category_id2']]

#Let's create the word count column
df['totalwords'] = df['text'].str.split().str.len()

#Create word bins with appropriate ranges
df['word_bins'] = pd.cut(x=df['totalwords'], bins=[0, 5, 15, 25, 40, 65, 100, 200, 100000])
df['word_bins'] = pd.cut(x=df['totalwords'], bins=[0, 5, 15, 25, 40, 65, 100, 200, 100000], labels=['0 - 5 words', '6 - 15 words', '16 - 25 words', '26 - 40 words', '41 - 65 words', '66 - 100 words', '101 - 200 words','200+'])

# Create a dataframe to aggregate word bins across products & categories
# Normalize to get proportions
product_aggregation = pd.crosstab(df["product"], df["word_bins"], margins=True, normalize='index')
category_aggregation = pd.crosstab(df["category_id2"], df["word_bins"], margins=True, normalize='index')

# Merge the two features table together
product_aggregation = pd.merge(product_aggregation,sales, how = 'inner', left_on = "product", right_on="id")
product_aggregation = pd.merge(product_aggregation,category_aggregation, how = 'inner', left_on = "category_id2", right_on="category_id2")

#product_aggregation.to_csv("Word_count_features.csv")

In [None]:
product_aggregation

## 

## Rating Distribution / Review Count history

### Objective: find number of reviews and their ratings over 1/3/5 months interval
### Outline
1. Clean data transformation
2. create total_word metrics
3. cut word bins with ranges
4. Aggregate across products
5. Merge product and category
6. Merge by joining it on category_id






### 6 Features:
* (2) change in proportion/delta of 1 star reviews compared to last quarter
* (2) change in proportion/delta of 5 star reviews compared to last quarter
* (1) change in count of 1 star reviews compared to last quarter
* (1) change in number of reviews


In [None]:
df= pd.read_csv("RSC_reviews_with_category.csv")

df['date']=pd.to_datetime(df['date'])

df['year']=df['date'].dt.year
df['week']=df['date'].dt.day
df['month']=df['date'].dt.month

In [None]:
sub=df2[['product','stars','date','year','week','month']]


In [None]:

# 
t1 = df>> mask(df.week>=30,df.year==2019)
t2 = df>> mask(df.week>=18, df.week<30, df.year==2019)
t3 = df>> mask(df.week>=6, df.week<18, df.year==2019)



In [None]:
## 1 star, 5 star table
t1_5_star= t1 >> mask(t1.stars==5)
t2_5_star= t2 >> mask(t2.stars==5)
t3_5_star= t3 >> mask(t3.stars==5)

t1_1_star= t1 >> mask(t1.stars==1)
t2_1_star= t2 >> mask(t2.stars==1)
t3_1_star= t3 >> mask(t3.stars==1)

In [None]:
def get_size(df):
    return df.groupby(['product']).size().to_frame(name='count').reset_index()

In [None]:
m=get_size(t1)

In [None]:
(get_size(t1),
get_size(t2),
get_size(t3),
get_size(t1_1_star),
get_size(t2_1_star),
get_size(t3_1_star),
get_size(t1_5_star),
get_size(t2_5_star),
get_size(t3_5_star))


### Other methods:

In [4]:
from dfply import *
df= pd.read_csv("RSC_reviews_with_category.csv")

df['date']=pd.to_datetime(df['date'])

df['year']=df['date'].dt.year
df['week']=df['date'].dt.day
df['month']=df['date'].dt.month
df=df >> mask(df.week<30, df.week>=6)

#Create week bins with appropriate ranges
df['week_group'] = pd.cut(x=df['week'], bins=[0,6, 18, 30], labels=['Q1','Q2','Q3'])

In [5]:
sub=df[['product','stars','year','week_group']]
sub=sub >> mask(sub.year==2019)
sub_1=sub[sub.stars==1]
sub_5=sub[sub.stars==5]

In [9]:
sub_1.groupby(['week_group','product']).size()

week_group  product   
Q1          B000241NRI     1
            B00063446M     2
            B00068R98C     1
            B00074L4RW     2
            B00074L4UO     2
                          ..
Q3          B075RYQ35W    18
            B0762TKBL2     5
            B076XDFBFR     1
            B0775219GQ     1
            B078N83GS4     1
Length: 437, dtype: int64

In [10]:
## 1 star review || 5 star review count
count_five_star = pd.crosstab(sub_5["product"], sub_5["week_group"])
count_one_star = pd.crosstab(sub_1["product"], sub_1["week_group"])

In [11]:
## # review count
count_review= pd.crosstab(sub["product"], sub["week_group"])


In [19]:
count_review['Q2']-count_review['Q1']

product
B0000BYCM0     1
B0000DAPGK     3
B0001ZWZ9S     1
B00023N7TG    15
B00023ND8G     3
              ..
B078N3JVYV     0
B078N564WT     1
B078N83GS4     1
B0793FBDVL    -1
B0793GS8PF    -2
Length: 656, dtype: int64

In [23]:
count_five_star=count_five_star.rename(columns={'Q1':'Q1_count_5star','Q2':'Q2_count_5star','Q3':'Q3_count_5star'})
count_one_star=count_one_star.rename(columns={'Q1':'Q1_count_1star','Q2':'Q2_count_1star','Q3':'Q3_count_1star'})
count_review=count_review.rename(columns={'Q1':'Q1_count_review','Q2':'Q2_count_review','Q3':'Q3_count_review'})

In [24]:
#count_five_star['Q2']-count_five_star['Q1']
#count_five_star['Q3']-count_five_star['Q2']


KeyError: 'Q2'