In [None]:

import numpy as np
import unittest
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor


Initialize customers, transactions_train, and articles as dataframes

In [None]:
# Load the customers table and replace every missing value with 0.
customers = pd.read_csv('customers.csv', header=0).fillna(0)


In [None]:
# Encode club membership as a float flag in the "Active" column:
# 1.0 where club_member_status is "ACTIVE", 0.0 for every other value.
is_active_member = customers["club_member_status"] == "ACTIVE"
customers["Active"] = is_active_member.astype(float)
# The raw status column and fashion_news_frequency are dropped from the table.
customers = customers.drop(columns=["club_member_status", "fashion_news_frequency"])


In [None]:
# Load the transaction log and parse its t_dat column into datetimes.
tt = pd.read_csv('transactions_train.csv', header=0)
tt['t_dat'] = pd.to_datetime(tt['t_dat'])


In [None]:
# Load the article catalogue.
art = pd.read_csv('articles.csv', header=0)


Split transactions_train into training and testing data

In [None]:
# Row label of the first transaction dated 2020-09-15; the train/test split
# below cuts tt at this row.
# NOTE(review): presumably tt is ordered by t_dat - confirm against the CSV.
index = tt.index[tt['t_dat'] == '2020-09-15'][0]


In [None]:
# Hold-out set: every row of tt from position `index` to the end.
test = tt.iloc[index:]


In [None]:
# Training set: every row of tt that comes before position `index`.
train = tt.iloc[:index]


Add average price feature to each customer in customers data

In [None]:
# Mean purchase price per customer over the training data
# (resulting columns: customer_id, price).
rev = (
    train.groupby('customer_id')['price']
    .mean()
    .reset_index()
)

In [None]:
# Attach the average price to each customer. merge defaults to an inner join,
# so customers without a row in `rev` are dropped here.
customers = pd.merge(customers, rev, on='customer_id')

Add recency feature to each customer in customers data

In [None]:
# Latest purchase date of each customer within the training data
# (resulting columns: customer_id, max_purchase_date).
customers_max_date = (
    train.groupby('customer_id')['t_dat']
    .max()
    .rename('max_purchase_date')
    .reset_index()
)


In [None]:
# Recency = whole days between a customer's latest purchase and the "present",
# where the present is the latest date found in the training data.
maxDay = train['t_dat'].max()
days_since_last_purchase = maxDay - customers_max_date['max_purchase_date']
customers_max_date['recency'] = days_since_last_purchase.dt.days


In [None]:
# Attach recency to customers by customer_id; only the key and the recency
# column of customers_max_date are carried over.
customers = customers.merge(customers_max_date[['customer_id', 'recency']], on='customer_id')


Add frequency feature to each customer in customers data

In [None]:
# Number of training transactions per customer
# (resulting columns: customer_id, frequency).
freq = (
    train.groupby('customer_id')['t_dat']
    .count()
    .rename('frequency')
    .reset_index()
)


In [None]:
# Attach the purchase count to each customer, matching rows on customer_id.
customers = customers.merge(freq, on='customer_id')


Add a popularity feature to each customer in the customers data: split the training period into k time segments and measure how susceptible each customer is to the most popular items of each segment.

In [None]:
# first break up training data into k segments of time

# Displays how many 29-day windows fit between the first and last training date.
# If K = 29 days, then there are 25 29-day segments in the 725 total days in the training data
(train['t_dat'].max() - train['t_dat'].min()).days / 29


In [None]:
# Boundaries of the roughly one-month segments; each pair is [start, end].
dates = [['2018-09-19', '2018-10-20'], ['2018-10-20', '2018-11-20'], ['2018-11-20', '2018-12-20'],
        ['2018-12-20', '2019-01-20'], ['2019-01-20', '2019-02-20'], ['2019-02-20', '2019-03-20'], 
        ['2019-03-20', '2019-04-20'], ['2019-04-20', '2019-05-20'], ['2019-05-20', '2019-06-20'], 
        ['2019-06-20', '2019-07-20'], ['2019-07-20', '2019-08-20'], ['2019-08-20', '2019-09-20'],
        ['2019-09-20', '2019-10-20'], ['2019-10-20', '2019-11-20'], ['2019-11-20', '2019-12-20'],
        ['2019-12-20', '2020-01-20'], ['2020-01-20', '2020-02-20'], ['2020-02-20', '2020-03-20'],
        ['2020-03-20', '2020-04-20'], ['2020-04-20', '2020-05-20'], ['2020-05-20', '2020-06-20'],
        ['2020-06-20', '2020-07-20'], ['2020-07-20', '2020-08-20'], ['2020-08-20', '2020-09-20']]
# One slice of train per segment. The start date is exclusive and the end date
# inclusive, so a purchase on a boundary day lands in exactly one segment.
dfArr = [
    train.loc[(train['t_dat'] > start) & (train['t_dat'] <= end)]
    for start, end in dates
]


In [None]:
# Rank the articles of every segment by purchase count, most purchased first.
# Each entry of `popularity` has the columns article_id and popular.
popularity = []
for segment in dfArr:
    purchase_counts = segment.groupby('article_id')['t_dat'].count()
    pop = purchase_counts.sort_values(ascending=False).to_frame('popular')
    popularity.append(pop.reset_index())
# Show the ranking of the first segment.
popularity[0]


In [None]:
# For every segment keep the ids of its k most purchased articles.
k = 10
topKArticles = [ranked['article_id'].iloc[:k] for ranked in popularity]


In [None]:
# Preview the top-k article ids of the first segment.
topKArticles[0]

In [None]:
# Of the clothes a customer buys in a given time segment, how many are in that
# segment's topKArticles? In other words, how many of a customer's purchases
# are aligned with the most popular items?

# Build one frame per time segment with one row per customer:
#   customer_id - the customer
#   articles    - Series of the article_ids the customer bought in the segment
#   frequency   - number of purchases the customer made in the segment
segments = []
for segment_df in dfArr:
	by_customer = segment_df.groupby('customer_id')
	# Iterating a grouped column yields (customer_id, Series) pairs, one per customer.
	seg_articles = pd.DataFrame(list(by_customer['article_id']), columns=['customer_id', 'articles'])
	# Segment-local names are used so the customer-level `freq` frame built
	# earlier in the notebook is not overwritten by per-segment counts.
	seg_freq = by_customer['t_dat'].count().rename('frequency').reset_index()
	segments.append(pd.merge(seg_articles, seg_freq, on='customer_id'))



In [None]:
# For each segment, count how many of every customer's purchases are among that
# segment's top-k articles (num_pop), then express the count as a share of the
# customer's purchases in the segment (pop_fraction).
for segment, top_articles in zip(segments, topKArticles):
	segment['num_pop'] = [bought.isin(top_articles).sum() for bought in segment['articles']]
	segment['pop_fraction'] = segment['num_pop'] / segment['frequency']


In [None]:
# Collect every segment's pop_fraction side by side, one row per customer.
# Segment i (0-based) is stored in column 'pop_fraction<i+1>'. The column is
# renamed explicitly before each merge so the naming does not depend on merge
# suffixes and works for any number of segments.
customersTestDf = pd.DataFrame(customers['customer_id'].copy(deep=True))
for i, segment in enumerate(segments):
	seg_fraction = segment[['customer_id', 'pop_fraction']].rename(columns={'pop_fraction': 'pop_fraction' + str(i + 1)})
	# Left join keeps every customer; a customer with no purchases in the segment gets 0.
	customersTestDf = pd.merge(customersTestDf, seg_fraction, on='customer_id', how='left').fillna(0)


In [None]:
for i in range(len(segments) - 1):
	customersTestDf['pop_fraction1'] += customersTestDf['pop_fraction' + str(i + 2)]

In [None]:
customers['pop_fraction'] = customersTestDf['pop_fraction1'] / len(segments)

In [None]:

# For each segment, print the share (in percent) of that segment's purchases
# accounted for by its 50 most popular articles. The denominator is the
# segment's own total, not a frame left over from an earlier loop.
for x in popularity:
    print((x['popular'][0:50].sum() / x['popular'].sum()) * 100)


In [None]:
def optimize_k_means(data, max_k, random_state=None):
    """Plot the K-Means inertia curve (elbow plot) over a range of cluster counts.

    Parameters
    ----------
    data : array-like
        Observations to cluster; passed unchanged to ``KMeans.fit``.
    max_k : int
        Exclusive upper bound: one model is fitted for each k in
        ``range(1, max_k)``, i.e. k = 1 .. max_k - 1.
    random_state : int or None, default None
        Forwarded to ``KMeans``. The default keeps the previous unseeded
        behaviour; pass an int to get the same curve on every run.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    cluster_counts = list(range(1, max_k))
    inertias = [
        KMeans(n_clusters=k, random_state=random_state).fit(data).inertia_
        for k in cluster_counts
    ]

    # plt.subplots returns a (figure, axes) pair; draw on the axes explicitly
    # rather than through pyplot's implicit "current axes".
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(cluster_counts, inertias, 'o-')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    ax.grid(True)
    plt.show()


In [None]:
# Draw the inertia curve of each customer feature on its own, with max_k = 10.
for feature in ['pop_fraction', 'recency', 'frequency', 'price']:
    optimize_k_means(customers[[feature]], 10)


In [None]:
# Cluster every feature on its own and store the cluster id as a new column.
# random_state is fixed because K-Means starts from random centres: without a
# seed the cluster ids, which later feed the regression models, can change
# from run to run.
# NOTE(review): the cluster counts (3, 2, 3, 3) were presumably read off the
# inertia plots above - confirm.
kmeansOne = KMeans(n_clusters=3, random_state=1)
kmeansTwo = KMeans(n_clusters=2, random_state=1)
kmeansThree = KMeans(n_clusters=3, random_state=1)
kmeansFour = KMeans(n_clusters=3, random_state=1)
feature_models = [
    (kmeansOne, 'pop_fraction'),
    (kmeansTwo, 'recency'),
    (kmeansThree, 'frequency'),
    (kmeansFour, 'price'),
]
for model, feature in feature_models:
    model.fit(customers[[feature]])
    customers[feature + '_cluster'] = model.predict(customers[[feature]])


Train prediction model

In [None]:
# One row per customer who appears in the hold-out data: the customer_id and
# the Series of article_ids that customer purchased.
purchases_by_customer = test.groupby('customer_id')['article_id']
testTwo = pd.DataFrame(purchases_by_customer)
testTwo.columns = ['customer_id', 'articles_labels']


In [None]:
# Left-join the hold-out purchases onto the customer features, then drop the
# postal_code column.
customersTwo = customers.merge(testTwo[['customer_id', 'articles_labels']], on='customer_id', how='left')
customersTwo = customersTwo.drop(columns='postal_code')
# Keep only customers that have hold-out purchases. reset_index() is called
# without drop=True, so the old row labels stay as a leading 'index' column;
# later cells slice columns by position and depend on that extra column.
has_labels = customersTwo['articles_labels'].notna()
customersTwo = customersTwo[has_labels].reset_index()


In [None]:
# Turn each customer's purchased article_ids into a dense target vector with
# one entry per row of `art`: 1 / (number of purchases) where the article was
# bought and 0 elsewhere.
# NOTE(review): repeated purchases of one article raise the divisor but not the
# entry, so such a vector sums to less than 1 - confirm this is intended.
label_vectors = np.empty(len(customersTwo), dtype=object)
for pos, purchased in enumerate(customersTwo['articles_labels']):
	# A single membership test replaces one full scan of `art` per purchase.
	bought = art['article_id'].isin(purchased).astype(int)
	# Element-wise store keeps each vector as one object inside the 1-D array.
	label_vectors[pos] = (bought / len(purchased)).rename('vals')
# Write the column in one assignment. The former chained
# `customersTwo['articles_labels'][y] = ...` writes through a temporary, which
# pandas warns about and may not propagate to the frame. The loop also no
# longer rebinds the notebook-level `test` DataFrame.
customersTwo['articles_labels'] = label_vectors


In [None]:
# Sanity check: summary statistics of the label vector stored for row 4.
customersTwo['articles_labels'][4].describe()

In [None]:
# Training features: first 5000 rows, columns at positions 2 through 12.
X = customersTwo.iloc[0:5000, 2:13].to_numpy()


In [None]:
# Training targets: stack the label vectors (column position 13) of the first
# 5000 rows into one 2-D array with a row per customer.
Y = np.stack(customersTwo.iloc[0:5000, 13].to_numpy())


In [None]:
# Validation features: rows 5000-5999, same column positions as X.
XTwo = customersTwo.iloc[5000:6000, 2:13].to_numpy()

In [None]:
# Validation targets: stacked label vectors of rows 5000-5999.
YTwo = np.stack(customersTwo.iloc[5000:6000, 13].to_numpy())

In [None]:
# Fit a linear regression on the training rows, then display its score on the
# validation rows.
reg = LinearRegression()
reg.fit(X, Y)
reg.score(XTwo, YTwo)

In [None]:
# Fit a seeded multi-layer perceptron regressor on the training rows, then
# display its score on the validation rows.
regr = MLPRegressor(random_state=1, max_iter=500)
regr.fit(X, Y)
regr.score(XTwo, YTwo)