In [4]:
import pandas as pd
import datetime as dt

In [5]:
# import datasets

fingerhut = pd.read_csv('../Dataset/export.csv')
event_def = pd.read_csv('../Dataset/event_definitions.csv')

In [10]:
# ignoring journey_steps_until_end, find all duplicate rows

figerhut_no_journey_steps_until_end = fingerhut.drop(columns=['journey_steps_until_end'])

duplicates = figerhut_no_journey_steps_until_end[figerhut_no_journey_steps_until_end.duplicated()]

duplicates

In [6]:
fingerhut_copy = fingerhut.copy()

In [7]:
# remove fingerhut from memory

del fingerhut

In [8]:
# finding number of customer_ids per account_id
account_customer_grouped = fingerhut_copy.groupby('account_id')['customer_id'].nunique()

# return only the accounts with more than one customer
account_customer_grouped = account_customer_grouped[account_customer_grouped > 1]

In [9]:
# finding number of account_ids per customer_id
customer_account_grouped = fingerhut_copy.groupby('customer_id')['account_id'].nunique()

# return only the customers with more than one account
customer_account_grouped = customer_account_grouped[customer_account_grouped > 1]

In [10]:
# delete journey_steps_until_end column

del fingerhut_copy['journey_steps_until_end']

In [11]:
# remove duplicate rows

fingerhut_copy = fingerhut_copy[~fingerhut_copy.duplicated()]

In [13]:
# pull out the accounts with more than one customer

fingerhut_many_customers = fingerhut_copy[fingerhut_copy['account_id'].isin(account_customer_grouped.index)]

# pull out the customers with more than one account

fingerhut_many_accounts = fingerhut_copy[fingerhut_copy['customer_id'].isin(customer_account_grouped.index)]

# take out fingerhut_many_customers and fingerhut_many_accounts from fingerhut_copy

fingerhut_copy = fingerhut_copy[~fingerhut_copy['account_id'].isin(account_customer_grouped.index)]
fingerhut_copy = fingerhut_copy[~fingerhut_copy['customer_id'].isin(customer_account_grouped.index)]

In [14]:
# reset index for all three dataframes

fingerhut_copy.reset_index(drop=True, inplace=True)
fingerhut_many_customers.reset_index(drop=True, inplace=True)
fingerhut_many_accounts.reset_index(drop=True, inplace=True)

In [15]:
# create new column in fingerhut_copy called 'combined_id' starting at 0 and incrementing by 1 for each new account_id

fingerhut_copy['combined_id'] = fingerhut_copy.groupby('account_id').ngroup()

In [16]:
# sort fingerhut_copy by combined_id

fingerhut_copy.sort_values(by=['combined_id'], inplace=True)

In [17]:
# create new column in fingerhut_many_customers called 'combined_id' starting at fingerhut_copy['combined_id'].max()

fingerhut_many_customers['combined_id'] = fingerhut_many_customers.groupby('account_id').ngroup() + fingerhut_copy['combined_id'].max() + 1

In [18]:
# sort fingerhut_many_customers by combined_id

fingerhut_many_customers.sort_values(by='combined_id', inplace=True)

In [19]:
# create new column in fingerhut_many_accounts called 'combined_id' starting at fingerhut_many_customers['combined_id'].max()

fingerhut_many_accounts['combined_id'] = fingerhut_many_accounts.groupby('customer_id').ngroup() + fingerhut_many_customers['combined_id'].max() + 1

In [20]:
# sort fingerhut_many_accounts by combined_id

fingerhut_many_accounts.sort_values(by='combined_id', inplace=True)

In [21]:
# append all three dataframes together

fingerhut_combined = fingerhut_copy.append(fingerhut_many_customers)
fingerhut_combined = fingerhut_combined.append(fingerhut_many_accounts)

  fingerhut_combined = fingerhut_copy.append(fingerhut_many_customers)
  fingerhut_combined = fingerhut_combined.append(fingerhut_many_accounts)


In [24]:
# for each combined_id, sort by event_timestamp

fingerhut_combined.sort_values(by=['combined_id', 'event_timestamp'], inplace=True)

In [25]:
# reset index of fingerhut_combined

fingerhut_combined.reset_index(drop=True, inplace=True)

In [26]:
fingerhut_combined

Unnamed: 0,customer_id,account_id,ed_id,event_name,event_timestamp,combined_id
0,1807905151,-2147477843,2,campaign_click,2021-09-03T06:00:00.000Z,0
1,1807905151,-2147477843,12,application_web_approved,2021-09-03T21:46:20.000Z,0
2,1807905151,-2147477843,1,promotion_created,2021-09-03T21:46:26.680Z,0
3,1807905151,-2147477843,4,browse_products,2021-09-03T21:48:59.000Z,0
4,1807905151,-2147477843,4,browse_products,2021-09-03T21:55:06.000Z,0
...,...,...,...,...,...,...
56854843,2147449981,-203497522,4,browse_products,2023-07-22T20:47:49.000Z,1665430
56854844,2147449981,-203497522,5,view_cart,2023-07-22T20:47:49.000Z,1665430
56854845,2147449981,-203497522,11,add_to_cart,2023-07-22T20:47:49.000Z,1665430
56854846,2147449981,-203497522,6,begin_checkout,2023-07-22T20:47:49.000Z,1665430


In [34]:
# for every combined_id, add a column called 'journey_steps_until_end' that increments by 1 for each row

fingerhut_combined['journey_steps_until_end'] = fingerhut_combined.groupby('combined_id').cumcount(ascending=True)

In [35]:
fingerhut_combined

Unnamed: 0,customer_id,account_id,ed_id,event_name,event_timestamp,combined_id,journey_steps_until_end
0,1807905151,-2147477843,2,campaign_click,2021-09-03T06:00:00.000Z,0,0
1,1807905151,-2147477843,12,application_web_approved,2021-09-03T21:46:20.000Z,0,1
2,1807905151,-2147477843,1,promotion_created,2021-09-03T21:46:26.680Z,0,2
3,1807905151,-2147477843,4,browse_products,2021-09-03T21:48:59.000Z,0,3
4,1807905151,-2147477843,4,browse_products,2021-09-03T21:55:06.000Z,0,4
...,...,...,...,...,...,...,...
56854843,2147449981,-203497522,4,browse_products,2023-07-22T20:47:49.000Z,1665430,37
56854844,2147449981,-203497522,5,view_cart,2023-07-22T20:47:49.000Z,1665430,38
56854845,2147449981,-203497522,11,add_to_cart,2023-07-22T20:47:49.000Z,1665430,39
56854846,2147449981,-203497522,6,begin_checkout,2023-07-22T20:47:49.000Z,1665430,40


In [36]:
# export fingerhut_combined to csv

fingerhut_combined.to_csv('../Dataset/fingerhut_combined.csv', index=False)