In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from functools import reduce
import csv
%matplotlib inline

## Explore return rates by transactions and time frame

This workbook takes raw data as date, customer_id and asin and delivers EDA on what is the return rate after various timeframes

In [120]:
df = pd.read_csv("Z:00_ETL/return_rates/return_rates_4.txt", sep="\t")

In [121]:
print(df.shape)
df.tail()

(31058257, 4)


Unnamed: 0,transaction_date_local,encrypted_customer_id,units,rank
31058252,2017-10-31,AZXQGY74R5CZT,1,43
31058253,2018-01-07,AZXQGY74R5CZT,1,44
31058254,2018-02-09,AZXQGY74R5CZT,2,45
31058255,2018-03-08,AZXQGY74R5CZT,1,46
31058256,2018-03-17,AZXQGY74R5CZT,1,47


In [147]:
#df[df['encrypted_customer_id'] == 'AZXQGY74R5CZT']

In [124]:
df['encrypted_customer_id'].nunique()

2500000

In [125]:
def add_dates_to_df(df):
    """the function calculates first 5 purchase dates for each customer and appends to the initial dataset"""
    #df_r['rank'] = df_r.groupby('encrypted_customer_id')['transaction_date_local'].rank(ascending=True) #takes really long
    
    dataframe = []
    for i in range(0,5):
        t = pd.DataFrame(df.groupby("encrypted_customer_id")['transaction_date_local'].nth(i).reset_index()) 
    #k = t.iloc[:,1]
    #df['col%d' %i] = pd.merge(left=df,right=t,left_on='encrypted_customer_id',right_on='encrypted_customer_id', how='left')
        dataframe.append(t)
    
    temp = reduce(lambda x,y:pd.merge(x,y,on = 'encrypted_customer_id',how='outer'), dataframe)   
    new = pd.merge(df,temp,on = 'encrypted_customer_id', how = 'left' )
    new.columns = ['transaction_date','encrypted_customer_id','units',
                  'rank','first_date','second_date','third_date','fourth_date','fifth_date']
    
    return new

In [126]:
new_df = add_dates_to_df(df)

In [8]:
new_df.head()

Unnamed: 0,transaction_date,encrypted_customer_id,units,rank,first_date,second_date,third_date,fourth_date,fifth_date
0,2017-11-02,A04723872PFJK4A853LUV,1,1,2017-11-02,2017-12-05,,,
1,2017-12-05,A04723872PFJK4A853LUV,1,2,2017-11-02,2017-12-05,,,
2,2018-01-01,A07527671M5V4HNKCQY0P,1,1,2018-01-01,,,,
3,2017-07-27,A0767575OG93S4ATRH9T,1,1,2017-07-27,2017-08-30,2017-09-08,2017-09-09,2017-10-02
4,2017-08-30,A0767575OG93S4ATRH9T,1,2,2017-07-27,2017-08-30,2017-09-08,2017-09-09,2017-10-02


In [146]:
#new_df[new_df['encrypted_customer_id'] == 'AZXQGY74R5CZT']

In [129]:
def calculate_time_diff(df):
    df['transaction_date'] = pd.to_datetime(df['transaction_date'])
    df['first_date']       = pd.to_datetime(df['first_date'])
    df['second_date']      = pd.to_datetime(df['second_date'])
    df['third_date']       = pd.to_datetime(df['third_date'])
    df['fourth_date']      = pd.to_datetime(df['fourth_date'])
    df['fifth_date']       = pd.to_datetime(df['fifth_date'])
    
    
    df['diff_vs_first'] = (df['transaction_date'] - df['first_date']) / np.timedelta64(1,"D")
    df['diff_vs_second'] = (df['transaction_date'] - df['second_date']) / np.timedelta64(1,"D")
    df['diff_vs_third'] = (df['transaction_date'] - df['third_date']) / np.timedelta64(1,"D")
    df['diff_vs_fourth'] = (df['transaction_date'] - df['fourth_date']) / np.timedelta64(1,"D")
    df['diff_vs_fifth'] = (df['transaction_date'] - df['fifth_date']) / np.timedelta64(1,"D")
    
    df = df.fillna(0)
    return df

In [130]:
new_df=calculate_time_diff(new_df)

In [23]:
new_df.head()

Unnamed: 0,transaction_date,encrypted_customer_id,units,rank,first_date,second_date,third_date,fourth_date,fifth_date,diff_vs_first,diff_vs_second,diff_vs_third,diff_vs_fourth,diff_vs_fifth
0,2017-11-02,A04723872PFJK4A853LUV,1,1,2017-11-02,2017-12-05,1970-01-01,1970-01-01,1970-01-01,0.0,-33.0,0.0,0.0,0.0
1,2017-12-05,A04723872PFJK4A853LUV,1,2,2017-11-02,2017-12-05,1970-01-01,1970-01-01,1970-01-01,33.0,0.0,0.0,0.0,0.0
2,2018-01-01,A07527671M5V4HNKCQY0P,1,1,2018-01-01,1970-01-01,1970-01-01,1970-01-01,1970-01-01,0.0,0.0,0.0,0.0,0.0
3,2017-07-27,A0767575OG93S4ATRH9T,1,1,2017-07-27,2017-08-30,2017-09-08,2017-09-09,2017-10-02,0.0,-34.0,-43.0,-44.0,-67.0
4,2017-08-30,A0767575OG93S4ATRH9T,1,2,2017-07-27,2017-08-30,2017-09-08,2017-09-09,2017-10-02,34.0,0.0,-9.0,-10.0,-33.0


# Brute force approach

<p> The idea is to retrieve all corresponding ranks and look at distribution from previous purchase to the current </p>
<p> Basis is the number of customers who made the previous purchase </p>

In [131]:
def make_subset(df, rank):
    return df[df['rank'] == rank]

In [132]:
def set_buckets(df, column):
    df = df.copy()
    df['buckets'] = 'na'
    
    df['buckets'].loc[(df[column] <= 10) & (df[column]) > 0.0] =  "10days"
    df['buckets'].loc[ (df[column] > 10 ) 
                            & (df[column] <= 20) ] =  "20days"
    df['buckets'].loc[ (df[column] > 20 ) 
                            & (df[column] <= 30) ] =  "30days"
    df['buckets'].loc[ (df[column] > 30 ) 
                            & (df[column] <= 40) ] =  "40days"
    df['buckets'].loc[ (df[column] > 40 ) 
                            & (df[column] <= 50) ] =  "50days"
    df['buckets'].loc[ (df[column] > 50 ) 
                            & (df[column] <= 60) ] =  "60days"
    df['buckets'].loc[ (df[column] > 60 ) 
                            & (df[column] <= 90) ] =  "90days"
    df['buckets'].loc[ (df[column] > 90 ) 
                            & (df[column] <= 120) ] =  "120days"
    df['buckets'].loc[ (df[column] > 120 ) 
                            & (df[column] <= 364) ] =  "364days"
    
    df['buckets'].loc[ (df[column] > 364 ) ] =  "365+days"
    
    return df

In [188]:
def combine_with_bases(df1,df2,rank1):
    t = pd.DataFrame(df1.groupby("buckets").size().reset_index())
    t['total'] = df2[df2['rank'] == rank1 - 1]['encrypted_customer_id'].nunique()
    t.columns = ['buckets','size','total']
    t['share'] = t['size'] / t['total']
    t['name'] = rank1
    return t

## Start with rank 2

In [189]:
rank2 = make_subset(new_df,2)
rank2  = set_buckets(rank2,"diff_vs_first")
rank2 = combine_with_bases(rank2,new_df,2)
rank2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,498561,2500000,0.199424,2
1,120days,105859,2500000,0.042344,2
2,20days,180087,2500000,0.072035,2
3,30days,131804,2500000,0.052722,2
4,364days,357065,2500000,0.142826,2
5,365+days,166292,2500000,0.066517,2
6,40days,91819,2500000,0.036728,2
7,50days,82111,2500000,0.032844,2
8,60days,63083,2500000,0.025233,2
9,90days,147134,2500000,0.058854,2


## Proceed with rank 3

In [190]:
rank3 = make_subset(new_df,3)
rank3  = set_buckets(rank3,"diff_vs_second")
rank3 = combine_with_bases(rank3,new_df,3)
rank3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,477495,1823815,0.261811,3
1,120days,84340,1823815,0.046244,3
2,20days,176817,1823815,0.096949,3
3,30days,126901,1823815,0.06958,3
4,364days,235533,1823815,0.129143,3
5,365+days,67735,1823815,0.037139,3
6,40days,85516,1823815,0.046889,3
7,50days,74803,1823815,0.041015,3
8,60days,55845,1823815,0.03062,3
9,90days,126759,1823815,0.069502,3


## Proceed with rank 4

In [191]:
rank4 = make_subset(new_df,4)
rank4  = set_buckets(rank4,"diff_vs_third")
rank4 = combine_with_bases(rank4,new_df,4)
rank4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,450745,1511744,0.298162,4
1,120days,68201,1511744,0.045114,4
2,20days,166629,1511744,0.110223,4
3,30days,118315,1511744,0.078264,4
4,364days,167828,1511744,0.111016,4
5,365+days,35702,1511744,0.023616,4
6,40days,78620,1511744,0.052006,4
7,50days,67071,1511744,0.044367,4
8,60days,49222,1511744,0.03256,4
9,90days,108299,1511744,0.071638,4


## Proceed with rank 5

In [192]:
rank5 = make_subset(new_df,5)
rank5  = set_buckets(rank5,"diff_vs_fourth")
rank5 = combine_with_bases(rank5,new_df,5)
rank5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,425872,1310632,0.324936,5
1,120days,56861,1310632,0.043384,5
2,20days,157488,1310632,0.120162,5
3,30days,109685,1310632,0.083689,5
4,364days,126342,1310632,0.096398,5
5,365+days,21270,1310632,0.016229,5
6,40days,71251,1310632,0.054364,5
7,50days,60476,1310632,0.046143,5
8,60days,43046,1310632,0.032844,5
9,90days,92602,1310632,0.070654,5


## Proceed with rank 6

In [193]:
rank6 = make_subset(new_df,6)
rank6  = set_buckets(rank6,"diff_vs_fifth")
rank6 = combine_with_bases(rank6,new_df,6)
rank6

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,buckets,size,total,share,name
0,10days,402615,1164893,0.345624,6
1,120days,47777,1164893,0.041014,6
2,20days,147911,1164893,0.126974,6
3,30days,102420,1164893,0.087922,6
4,364days,97929,1164893,0.084067,6
5,365+days,13765,1164893,0.011817,6
6,40days,65442,1164893,0.056179,6
7,50days,54702,1164893,0.046959,6
8,60days,38691,1164893,0.033214,6
9,90days,80303,1164893,0.068936,6


In [201]:
list = [rank2,rank3,rank4,rank5,rank6]

In [205]:
f = open("list_files_DE.csv", 'a')
for i in list:
    i.to_csv(f)
f.close()