In [1]:
import pandas as pd
import numpy as np
from multiprocessing import process
from functools import reduce

import matplotlib.pyplot as plt
%matplotlib inline

## Explore return rates by transactions and time frame

<p> This workbook takes raw data as date, customer_id and asin and delivers EDA on what is the return rate after various timeframes </p>

<p> The hypothesis is, based on granular data up to transaction level, that customers are emotional in the sense that they have short attention span and generally buy in chunks, ie once they purchased something, the next one is more likely to be in the very near future than in a distant one. </p>


In [2]:
#df = pd.read_csv("Z:00_ETL/return_rates/return_rates_4.txt", sep="\t") #too large to be processed locally at once

In [60]:
iter_csv = pd.read_csv("Z:00_ETL/return_rates/return_rates_1.txt", sep="\t", chunksize=1000000, engine='python')
df = pd.concat([chunk for chunk in iter_csv])

In [62]:
print(df.shape, df['encrypted_customer_id'].nunique())
df.head()

(36656396, 4) 2500000


Unnamed: 0,transaction_date_local,encrypted_customer_id,units,rank
0,2016-05-22,A01699183EOH5ZVXY8Z01,1,1
1,2016-06-23,A01699183EOH5ZVXY8Z01,1,2
2,2016-09-05,A01699183EOH5ZVXY8Z01,1,3
3,2016-10-09,A01699183EOH5ZVXY8Z01,1,4
4,2017-06-03,A01699183EOH5ZVXY8Z01,1,5


In [64]:
df[df['encrypted_customer_id'] == 'A01699183EOH5ZVXY8Z01']

Unnamed: 0,transaction_date_local,encrypted_customer_id,units,rank
0,2016-05-22,A01699183EOH5ZVXY8Z01,1,1
1,2016-06-23,A01699183EOH5ZVXY8Z01,1,2
2,2016-09-05,A01699183EOH5ZVXY8Z01,1,3
3,2016-10-09,A01699183EOH5ZVXY8Z01,1,4
4,2017-06-03,A01699183EOH5ZVXY8Z01,1,5


In [65]:
df['encrypted_customer_id'].nunique()

2500000

In [67]:
def add_dates_to_df(df):
    """the function calculates first 5 purchase dates for each customer and appends to the initial dataset"""
    #df_r['rank'] = df_r.groupby('encrypted_customer_id')['transaction_date_local'].rank(ascending=True) #takes really long
    
    dataframe = []
    for i in range(0,5):
        t = pd.DataFrame(df.groupby("encrypted_customer_id")['transaction_date_local'].nth(i).reset_index()) 
    #k = t.iloc[:,1]
    #df['col%d' %i] = pd.merge(left=df,right=t,left_on='encrypted_customer_id',right_on='encrypted_customer_id', how='left')
        dataframe.append(t)
    
    temp = reduce(lambda x,y:pd.merge(x,y,on = 'encrypted_customer_id',how='outer'), dataframe)   
    new = pd.merge(df,temp,on = 'encrypted_customer_id', how = 'left' )
    new.columns = ['transaction_date','encrypted_customer_id','units',
                  'rank','first_date','second_date','third_date','fourth_date','fifth_date']
    
    return new

In [68]:
new_df = add_dates_to_df(df)

In [69]:
new_df.tail()

Unnamed: 0,transaction_date,encrypted_customer_id,units,rank,first_date,second_date,third_date,fourth_date,fifth_date
36656391,2014-04-06,AZWZGPQU6CBYE,1,2,2014-03-21,2014-04-06,2014-06-26,2014-08-06,
36656392,2014-06-26,AZWZGPQU6CBYE,1,3,2014-03-21,2014-04-06,2014-06-26,2014-08-06,
36656393,2014-08-06,AZWZGPQU6CBYE,1,4,2014-03-21,2014-04-06,2014-06-26,2014-08-06,
36656394,2016-12-08,AZXCT2A5XJERA,1,1,2016-12-08,,,,
36656395,2015-04-25,AZYEA6YWWL22B,1,1,2015-04-25,,,,


In [70]:
new_df[new_df['encrypted_customer_id'] == 'A01699183EOH5ZVXY8Z01'].nunique()

transaction_date         5
encrypted_customer_id    1
units                    1
rank                     5
first_date               1
second_date              1
third_date               1
fourth_date              1
fifth_date               1
dtype: int64

In [118]:
def calculate_time_diff(df):
    df['transaction_date'] = pd.to_datetime(df['transaction_date'])
    df['first_date']       = pd.to_datetime(df['first_date'])
    df['second_date']      = pd.to_datetime(df['second_date'])
    df['third_date']       = pd.to_datetime(df['third_date'])
    df['fourth_date']      = pd.to_datetime(df['fourth_date'])
    df['fifth_date']       = pd.to_datetime(df['fifth_date'])
    
    df['diff_vs_first'] = (df['transaction_date'] - df['first_date']) / np.timedelta64(1,"D")
    df['diff_vs_second'] = (df['transaction_date'] - df['second_date']) / np.timedelta64(1,"D")
    df['diff_vs_third'] = (df['transaction_date'] - df['third_date']) / np.timedelta64(1,"D")
    df['diff_vs_fourth'] = (df['transaction_date'] - df['fourth_date']) / np.timedelta64(1,"D")
    df['diff_vs_fifth'] = (df['transaction_date'] - df['fifth_date']) / np.timedelta64(1,"D")
    
    df = df.fillna(0)
    return df

In [119]:
new_df=calculate_time_diff(new_df)

In [120]:
new_df.head()

Unnamed: 0,transaction_date,encrypted_customer_id,units,rank,first_date,second_date,third_date,fourth_date,fifth_date,diff_vs_first,diff_vs_second,diff_vs_third,diff_vs_fourth,diff_vs_fifth
0,2016-05-22,A01699183EOH5ZVXY8Z01,1,1,2016-05-22,2016-06-23,2016-09-05,2016-10-09,2017-06-03,0.0,-32.0,-106.0,-140.0,-377.0
1,2016-06-23,A01699183EOH5ZVXY8Z01,1,2,2016-05-22,2016-06-23,2016-09-05,2016-10-09,2017-06-03,32.0,0.0,-74.0,-108.0,-345.0
2,2016-09-05,A01699183EOH5ZVXY8Z01,1,3,2016-05-22,2016-06-23,2016-09-05,2016-10-09,2017-06-03,106.0,74.0,0.0,-34.0,-271.0
3,2016-10-09,A01699183EOH5ZVXY8Z01,1,4,2016-05-22,2016-06-23,2016-09-05,2016-10-09,2017-06-03,140.0,108.0,34.0,0.0,-237.0
4,2017-06-03,A01699183EOH5ZVXY8Z01,1,5,2016-05-22,2016-06-23,2016-09-05,2016-10-09,2017-06-03,377.0,345.0,271.0,237.0,0.0


# Brute force approach

<p> The idea is to retrieve all corresponding ranks and look at distribution from previous purchase to the current </p>
<p> Basis is the number of customers who made the previous purchase </p>

In [74]:
def make_subset(df, rank):
    return df[df['rank'] == rank]

In [75]:
def set_buckets(df, column):
    df = df.copy()
    df['buckets'] = 'na'
    
    df['buckets'].loc[(df[column] <= 10) & (df[column]) > 0.0] =  "10days"
    df['buckets'].loc[ (df[column] > 10 ) 
                            & (df[column] <= 20) ] =  "20days"
    df['buckets'].loc[ (df[column] > 20 ) 
                            & (df[column] <= 30) ] =  "30days"
    df['buckets'].loc[ (df[column] > 30 ) 
                            & (df[column] <= 40) ] =  "40days"
    df['buckets'].loc[ (df[column] > 40 ) 
                            & (df[column] <= 50) ] =  "50days"
 

    df['buckets'].loc[ (df[column] > 60 ) 
                            & (df[column] <= 90) ] =  "90days"
    df['buckets'].loc[ (df[column] > 90 ) 
                            & (df[column] <= 120) ] =  "120days"
    df['buckets'].loc[ (df[column] > 120 ) 
                            & (df[column] <= 364) ] =  "364days"
    
    df['buckets'].loc[ (df[column] > 364 ) ] =  "365+days"
    
    return df

In [76]:
def combine_with_bases(df1,df2,rank1):
    t = pd.DataFrame(df1.groupby("buckets").size().reset_index())
    t['total'] = df2[df2['rank'] == rank1 - 1]['encrypted_customer_id'].nunique()
    t.columns = ['buckets','size','total']
    t['share'] = t['size'] / t['total']
    t['name'] = rank1
    return t

## Start with rank 2

In [93]:
rank2 = make_subset(new_df,2)
rank2  = set_buckets(rank2,"diff_vs_first")
rank2 = combine_with_bases(rank2,new_df,2)
rank2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,524109,2500000,0.209644,2
1,120days,72903,2500000,0.029161,2
2,20days,146690,2500000,0.058676,2
3,30days,97503,2500000,0.039001,2
4,364days,286804,2500000,0.114722,2
5,365+days,288327,2500000,0.115331,2
6,40days,65102,2500000,0.026041,2
7,50days,56579,2500000,0.022632,2
8,90days,102277,2500000,0.040911,2
9,na,43698,2500000,0.017479,2


## Proceed with rank 3

In [94]:
rank3 = make_subset(new_df,3)
rank3  = set_buckets(rank3,"diff_vs_second")
rank3 = combine_with_bases(rank3,new_df,3)
rank3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,478915,1683992,0.284393,3
1,120days,59791,1683992,0.035506,3
2,20days,140983,1683992,0.08372,3
3,30days,90929,1683992,0.053996,3
4,364days,204914,1683992,0.121683,3
5,365+days,133098,1683992,0.079037,3
6,40days,59497,1683992,0.035331,3
7,50days,50913,1683992,0.030234,3
8,90days,87208,1683992,0.051786,3
9,na,37744,1683992,0.022413,3


## Proceed with rank 4

In [95]:
rank4 = make_subset(new_df,4)
rank4  = set_buckets(rank4,"diff_vs_third")
rank4 = combine_with_bases(rank4,new_df,4)
rank4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,439968,1343992,0.327359,4
1,120days,49450,1343992,0.036793,4
2,20days,129243,1343992,0.096164,4
3,30days,82693,1343992,0.061528,4
4,364days,155740,1343992,0.115879,4
5,365+days,80321,1343992,0.059763,4
6,40days,53219,1343992,0.039598,4
7,50days,44691,1343992,0.033252,4
8,90days,74499,1343992,0.055431,4
9,na,33766,1343992,0.025124,4


## Proceed with rank 5

In [96]:
rank5 = make_subset(new_df,5)
rank5  = set_buckets(rank5,"diff_vs_fourth")
rank5 = combine_with_bases(rank5,new_df,5)
rank5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,405766,1143590,0.354818,5
1,120days,42219,1143590,0.036918,5
2,20days,120067,1143590,0.104991,5
3,30days,75823,1143590,0.066303,5
4,364days,123919,1143590,0.10836,5
5,365+days,53862,1143590,0.047099,5
6,40days,48239,1143590,0.042182,5
7,50days,40093,1143590,0.035059,5
8,90days,64861,1143590,0.056717,5
9,na,29254,1143590,0.025581,5


## Proceed with rank 6

In [97]:
rank6 = make_subset(new_df,6)
rank6  = set_buckets(rank6,"diff_vs_fifth")
rank6 = combine_with_bases(rank6,new_df,6)
rank6

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,378352,1004103,0.376806,6
1,120days,36148,1004103,0.036,6
2,20days,111499,1004103,0.111043,6
3,30days,69344,1004103,0.069061,6
4,364days,100941,1004103,0.100529,6
5,365+days,39450,1004103,0.039289,6
6,40days,44133,1004103,0.043953,6
7,50days,36697,1004103,0.036547,6
8,90days,57167,1004103,0.056933,6
9,na,26677,1004103,0.026568,6


In [98]:
list = [rank2,rank3,rank4,rank5,rank6]

In [100]:
f = open("list_files_US.csv", 'a')
for i in list:
    i.to_csv(f)
f.close()