## Documentation:


###  Objective:
Find variables that helps to explain product Salesrank in Amazon

### What things this notebook has done?
1. Change tabular data into item-wise dictionary

2. Metrics generation:
   1. Calculation: aggregation metrics (Average ratings)
   2. Date: time metrics
   3. Text: 
       average word count
      

3. Regression basedline model


### What's the dataset is about?
1. Avery's product reviews data from Amazon
2. Sales Rank of Avery products under different categories


### Run time


### Common bugs:


### Below are detailed code chunks 

## Part 1: Setup working environment & Dataset
* Load sales rank & review dataset for Avery product

In [1]:
debug=True
option=2

In [2]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

#!pip install quilt
#!quilt install ResidentMario/missingno_data
from quilt.data.ResidentMario import missingno_data

%matplotlib inline

In [3]:
ar1 = pd.read_csv('Avery_reviews.csv')
asr1 = pd.read_csv('SalesRank_Avery.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Options for modeling:
* Group 1: more variability
* Group 2: most reviews records

Must under missing_value check

In [4]:

if(option==1):
    # more reviews and highly variant sales rank
    common = ['B00006IBVB',
     'B001HA2H58',
     'B004HLZ1FM',
     'B00V2M9O98',
     'B0155U3EES',
     'B01FKQBAES',
     'B01HC4KIVW',
     'B07DFY9YRH']
elif(option==2):
    # products with most reviews
    common=['B00006IBVB', 
         'B0155U3EES', 
         'B001HA2H58', 
         'B00V2M9O98', 
         'B07DFY9YRH',
        'B004HLZ1FM', 
         'B01HC4KIVW']
else:
    pass


In [5]:

if(debug):
    asr=asr1[asr1['id'].isin(common)].set_index('id',drop=True)
    #asr1 = asr1.set_index('id',drop = True)
    ar=ar1[ar1['product'].isin(common)].set_index('product',drop=True)
else:
    asr = asr1.set_index('id',drop = True)
    # Same as above, create another dictionary for review data
    ar = ar1.set_index('product',drop = True)

### Transform into key-value pairs by each unique product ID
To make aggregation on review ratings, we need details on each product



In [6]:
# create a dictionary to store the information for each product. Each product represents one element of the dictionary.
asr_dict = {}
for i in asr.index.unique():
    asr_dict[i] = asr.loc[i, ['source','date', 'category_id1',
       'category_name1', 'category_rank1', 'category_id2', 'category_name2',
       'category_rank2', 'category_id3', 'category_name3', 'category_rank3',
       'category_id4', 'category_name4', 'category_rank4']]

In [7]:
item_list=list(asr_dict.keys())
item_list[1]

'B07DFY9YRH'


### Features Time based rank Features
* log_rank 
* delta_rank
* date

#### Salesrank dataset

In [8]:
for i in asr_dict:
    if type(asr_dict[i]) != pd.core.series.Series:
        asr_dict[i] = asr_dict[i].sort_values('date').reset_index()
        asr_dict[i].drop_duplicates(inplace = True)
        asr_dict[i]['date'] = pd.to_datetime(asr_dict[i]['date'])
        asr_dict[i]['log_rank'] = asr_dict[i]['category_rank1'].map(lambda x: math.log(x))
        asr_dict[i]['delta_rank'] = asr_dict[i]['log_rank'].diff()
        a = asr_dict[i].index[0]
        asr_dict[i].drop(a, inplace = True)
        asr_dict[i]['date'] = asr_dict[i]['date'].dt.strftime('%Y-%m-%d')
        asr_dict[i]['date'] = pd.to_datetime(asr_dict[i]['date'])


In [9]:
for i in asr_dict:
    if type(asr_dict[i]) != pd.core.series.Series:
        asr_dict[i] = asr_dict[i].reset_index(drop = True)

#### Review dataset


In [10]:
ar_dict = {}
for i in ar.index.unique():
    ar_dict[i] = ar.loc[i, ['source','date','author', 'verified', 'vine', 'stars', 'pvotes', 'tvotes',
       'title', 'text', 'image', 'video']]

In [11]:
for i in ar_dict:
    if type(ar_dict[i]) != pd.core.series.Series:
        ar_dict[i] = ar_dict[i].sort_values('date').reset_index()
        ar_dict[i].drop_duplicates(inplace = True)
        ar_dict[i]['date'] = pd.to_datetime(ar_dict[i]['date'])
        ar_dict[i]['date'] = ar_dict[i]['date'].dt.strftime('%Y-%m-%d')
        ar_dict[i]['date'] = pd.to_datetime(ar_dict[i]['date'])

In [12]:
for i in ar_dict:
    if type(ar_dict[i]) != pd.core.series.Series:
        ar_dict[i] = ar_dict[i].reset_index(drop = True)

### 2.1 Feature group: aggregated features


In [13]:
# Now that we have created two dictionaries, next we are going to do the aggregation
for i in asr_dict:
    if i in ar_dict:
        for j in asr_dict[i].index.unique():
            if type(ar_dict[i]) != pd.core.series.Series:
                asr_dict[i].loc[j, "stars_recent_10"] = ar_dict[i][ar_dict[i]['date'] <= asr_dict[i].loc[j, 'date']].tail(10)['stars'].mean()
    

In [14]:
for i in asr_dict:
    if i in ar_dict:
        for j in asr_dict[i].index.unique():
            if type(ar_dict[i]) != pd.core.series.Series:
                asr_dict[i].loc[j, 'stars_recent_oneweek'] = ar_dict[i][(ar_dict[i]['date'] <= asr_dict[i].loc[j, 'date']) & (ar_dict[i]['date'] >= (asr_dict[i].loc[j,'date'] - datetime.timedelta(days = 6)))]['stars'].mean()
                
                
                asr_dict[i].loc[j, 'stars_recent_onemonth'] = ar_dict[i][(ar_dict[i]['date'] <= 
                                                                          asr_dict[i].loc[j, 'date']) & (ar_dict[i]['date'] >= (asr_dict[i].loc[j,'date'] - datetime.timedelta(days = 29)))]['stars'].mean()
                
                
                asr_dict[i].loc[j, 'stars_avg'] = ar_dict[i][ar_dict[i]['date'] <= asr_dict[i].loc[j, 'date']]['stars'].mean()
                
                asr_dict[i].loc[j, 'review_acc'] = ar_dict[i][ar_dict[i]['date'] <= asr_dict[i].loc[j, 'date']]['date'].count()
                
                asr_dict[i].loc[j, 'reviewnum_oneweek'] = ar_dict[i][(ar_dict[i]['date'] <= 
                                                                          asr_dict[i].loc[j, 'date']) & (ar_dict[i]['date'] >= (asr_dict[i].loc[j,'date'] - datetime.timedelta(days = 6)))]['date'].count()
                
                
                asr_dict[i].loc[j, 'reviewnum_onemonth'] = ar_dict[i][(ar_dict[i]['date'] <= 
                                                                          asr_dict[i].loc[j, 'date']) & (ar_dict[i]['date'] >= (asr_dict[i].loc[j,'date'] - datetime.timedelta(days = 29)))]['date'].count()

### 2.2 Text related Features:
* average word count

In [15]:
# of words in the reviews
# Generate a new column: average word count of review that happen on the same sales rank day
# for i in df_review2.index.unique():
#     df_review2.loc[i,'word_count'] = df_review2.loc[i,'text'].count(' ') + 1
  
for i in asr_dict:
    try:
        if i in ar_dict:
            for j in asr_dict[i].index.unique():
                if type(ar_dict[i]) != pd.core.series.Series:
                    asr_dict[i].loc[j, 'avg_word_count'] = ar_dict[i][ar_dict[i]['date'] <= asr_dict[i].loc[j, 'date']]['text'].apply(lambda x: x.count(' ')+1).mean()
                    asr_dict[i].loc[j, 'avg_word_count_10'] = ar_dict[i][ar_dict[i]['date'] <= asr_dict[i].loc[j, 'date']].tail(10)['text'].apply(lambda x: x.count(' ')+1).mean()        
    except:
        continue                                                                             

### 2.3 Features: reviewer status
* all: count of varified_account reviews 
* past one week: count of varified_account reviews

In [16]:
for i in asr_dict:
    if i in ar_dict:
        for j in asr_dict[i].index.unique():
            if type(ar_dict[i]) != pd.core.series.Series:
                asr_dict[i].loc[j, 'verified_acc'] = (ar_dict[i][(ar_dict[i]['date'] <= asr_dict[i].loc[j, 'date']) & (ar_dict[i]['verified'] == True)]['date'].count()) / (ar_dict[i][ar_dict[i]['date'] <= asr_dict[i].loc[j, 'date']]['date'].count()) 
            if type(ar_dict[i]) != pd.core.series.Series:
                asr_dict[i].loc[j, 'verified_acc_oneweek'] = (ar_dict[i][(ar_dict[i]['date'] <= 
                                                                          asr_dict[i].loc[j, 'date']) & (ar_dict[i]['date'] >= (asr_dict[i].loc[j,'date'] - datetime.timedelta(days = 6))) & (ar_dict[i]['verified'] == True)]['date'].count()) / (ar_dict[i][(ar_dict[i]['date'] <= 
                                                                          asr_dict[i].loc[j, 'date']) & (ar_dict[i]['date'] >= (asr_dict[i].loc[j,'date'] - datetime.timedelta(days = 6)))]['date'].count()) 
       

  if __name__ == '__main__':


#### Remove problematic products with only 1 records

In [17]:
problem_products=[]
for i in asr_dict:
    if type(asr_dict[i]['category_rank1'])!= pd.core.series.Series:
        problem_products.append(i)
problem_products

[]

In [18]:
for item in problem_products:
    del asr_dict[item]

### 2.4 Features: log rank & delta rank

In [19]:
for i in asr_dict:
    asr_dict[i]['log_salesrank'] = asr_dict[i]['category_rank1'].apply(lambda x: math.log(x))
    asr_dict[i]['delta_salesrank'] = asr_dict[i]['log_salesrank'].diff()  

In [20]:
# Since we use the delta of log salesrank, the first row should be deleted
for i in asr_dict:
    asr_dict[i].drop(asr_dict[i].index[0],inplace = True)                    

#### We can tell many products have missing value so the calculated fields are missing

### 2.5 Data preparation for modeling
Models for different product groups
    * Large salesrank variation
    * Most reviews records

In [21]:
# To change the keys from product ids to numbers, preparing for the following loop.
new_dict_asr=dict((i,asr_dict[k]) for i,k in enumerate(sorted(asr_dict.keys())))

# pull out interested products to form a new dataframe

a = new_dict_asr[0]
for i in new_dict_asr:
    new_dict_asr[i]=new_dict_asr[i].reset_index()
    if new_dict_asr[i].loc[1,'id'] in common: 
        a = pd.concat([a,new_dict_asr[i]],ignore_index=True)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


In [22]:
# The first product is what we don't want, (not in common), but in the above loop, it's more convenient to include that, so we delete it now.
final_df = a[a['id'] != 'B00000JFNV']

# put our interested columns in the final dataframe.
final_df = final_df.reset_index(drop = True)[['id', 'date', 'category_name1',
       'category_rank1','log_rank', 'delta_rank',
       'stars_recent_10', 'stars_recent_oneweek',
       'stars_recent_onemonth', 'stars_avg', 'review_acc', 'reviewnum_oneweek',
       'reviewnum_onemonth', 'avg_word_count', 'verified_acc',
       'verified_acc_oneweek', 'log_salesrank', 'delta_salesrank','avg_word_count_10']]


# replace the NAs with 0
final_df = final_df.fillna(0)
temp=final_df
#final_df.to_csv('final_df.csv')

## 3. Modeling Part
1. Check missing values for calculated fields
2. Model construction
3. Model results
4. Interpretation of important variables

### 3.1 missing value check

In [23]:
complete_df= new_dict_asr[0]
for i in new_dict_asr:
    new_dict_asr[i]=new_dict_asr[i].reset_index()
    complete_df= pd.concat([complete_df,new_dict_asr[i]],ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [24]:
# get the number of missing data points per column
def missing_col(df):
    missing_values_count = round(df.isnull().sum()/complete_df.shape[0]*100,1)
    rm_list=missing_values_count[missing_values_count>35].index
    return(missing_values_count,rm_list)

missing_values_count1,rm_list1=missing_col(complete_df)
missing_values_count1

avg_word_count             0.0
avg_word_count_10          0.0
category_id1               0.0
category_id2               0.0
category_id3             100.0
category_id4             100.0
category_name1             0.0
category_name2             0.0
category_name3           100.0
category_name4           100.0
category_rank1             0.0
category_rank2             0.0
category_rank3           100.0
category_rank4           100.0
date                       0.0
delta_rank                 0.0
delta_salesrank            0.0
id                         0.0
index                      0.0
level_0                   12.5
log_rank                   0.0
log_salesrank              0.0
review_acc                 0.0
reviewnum_onemonth         0.0
reviewnum_oneweek          0.0
source                     0.0
stars_avg                  0.0
stars_recent_10            0.0
stars_recent_onemonth      0.0
stars_recent_oneweek      19.7
verified_acc               0.0
verified_acc_oneweek      19.7
dtype: f

In [25]:
missing_values_count2,rm_list2=missing_col(sub)
#complete_df=complete_df.drop(rm_list,axis=1)
missing_values_count2.head()

NameError: name 'sub' is not defined

In [None]:
final_df['id'].unique()

### 3.2 Model construction

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
X = final_df[['stars_recent_10', 'stars_recent_oneweek', 'stars_recent_onemonth', 'stars_avg', 'review_acc', 
               'reviewnum_onemonth', 'avg_word_count', 'avg_word_count_10','verified_acc', 'verified_acc_oneweek']]

y = final_df['log_rank']
poly = PolynomialFeatures(interaction_only=True,include_bias = False)
X = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

reg = LinearRegression().fit(X_train, y_train)
reg.score(X_train, y_train)

### 3.3 Model results

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
prediction=reg.predict(X_test)
# The coefficients
print('Coefficients: \n', reg.coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, prediction))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, prediction))

In [None]:
final_df

### 3.4 conclusion