In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn import decomposition

The minimum supported version is 2.4.6



In [2]:
## load data
products = pd.read_csv('Dataset/cproducts.csv')
cproducts = products.copy(deep=False)
tender = pd.read_csv('Dataset/ctender.csv')

In [3]:
## check shape of files
print('product file has {} rows and {} columns'.format(products.shape[0], products.shape[1]))
print('tender file has {} rows and {} columns'.format(tender.shape[0], tender.shape[1]))

product file has 11972 rows and 16 columns
tender file has 149752 rows and 13 columns


In [4]:
# this data file contains product level information of transactions made by customers
products.head()

Unnamed: 0,customerID,DOB,Gender,State,PinCode,transactionDate,store_code,store_description,till_no,transaction_number_by_till,promo_code,promotion_description,product_code,product_description,sale_price_after_promo,discountUsed
0,BBID_20482,1975-10-20,male,MADHYA PRADESH,453441.0,2016-05-01,2655,BB-INDORE-MALHAR MEGA MALL,2,85246,NONPROMO,,1000446431,PATANJALI CHOCOS 125g,55.0,Payback
1,BBID_20485,1955-07-21,female,,999999.0,2017-03-09,2655,BB-INDORE-MALHAR MEGA MALL,8,78829,NONPROMO,,1000010431,SOO FRESH ROLLS HOT DOG 4P,30.0,Payback
2,BBID_20498,1981-03-19,female,MAHARASHTRA,400064.0,2015-08-12,2655,BB-INDORE-MALHAR MEGA MALL,13,1064,0000971754,AD 399 FLAT LADIES JEANS FASHION EXC,1000077851005,"DLJ-0010-BSF-2-CJ-FL, 34, BLACK",418.95,Payback
3,BBID_20499,1986-05-10,male,MAHARASHTRA,400099.0,2016-12-18,2615,BB-AMRITSAR-TRILIUM MALL,14,51080,NONPROMO,,1000443231,PATANJALI TOMATO KETCHUP BT 500g,70.0,Payback
4,BBID_204110,1988-02-29,male,MADHYA PRADESH,474001.0,2015-12-31,2655,BB-INDORE-MALHAR MEGA MALL,9,21911,0000974126,NMP 5% OFF CONCEPT DISCOUNT 2655,300481741,MAGGI NDL MERI MASALA PP 70g,34.2,Payback


In [5]:
# this file contains payment mode information used by customers in their transactions
tender.head()

Unnamed: 0,customerID,DOB,Gender,State,PinCode,transactionDate,store_code,store_description,till_no,tender_type,transaction_number_by_till,payment_amount_by_tender,PaymentUsed
0,BBID_2044,1984-04-02,male,,626125.0,2015-09-03,4986,BB-MADURAI-KOCHADAI VILLAGE,22,GVOWN,9208,1000.0,GiftVoucher
1,BBID_2044,1984-04-02,male,,626125.0,2015-09-03,4986,BB-MADURAI-KOCHADAI VILLAGE,22,CASH,9208,10.46,CASH
2,BBID_2044,1984-04-02,male,,626125.0,2016-04-17,4986,BB-MADURAI-KOCHADAI VILLAGE,19,EDCAXIS,90269,677.28,AXIS
3,BBID_2044,1984-04-02,male,,626125.0,2016-04-17,4986,BB-MADURAI-KOCHADAI VILLAGE,19,MOBI,90268,1042.0,One MobiKwik Systems
4,BBID_2044,1984-04-02,male,,626125.0,2016-04-17,4986,BB-MADURAI-KOCHADAI VILLAGE,19,MOBI,90269,1000.0,One MobiKwik Systems


### Note:
Let's build a simple model using products file only. We have to create clusters of customers at store level. We'll use k-means algorithm. We'll set value of k as 3 (you can tune and find optimal value)

Clustering process is as follows (work-wise):
1. Convert the data into numeric / float.
2. Since clustering using distance measure, don't forget to scale the data. 
3. Create a matrix /array which will be feeded to the algorithm.
4. Run function.

We'll ensure that sequence of data remains unaffected. Let's see how this is done.

In [6]:
## fill missing values

products['promotion_description'].fillna('no_promo', inplace=True)
products['Gender'].fillna('no_gender', inplace=True)
products['State'].fillna('no_state', inplace=True)
products['PinCode'].fillna(-1, inplace=True)
products['DOB'].fillna("1", inplace=True)

tender['Gender'].fillna('no_gender', inplace=True)
tender['State'].fillna('no_state', inplace=True)
tender['PinCode'].fillna(-1, inplace=True)
tender['DOB'].fillna("1", inplace=True)
tender['PaymentUsed'].fillna("no_mode", inplace=True)

In [7]:
bigdata = pd.concat([products, tender])
#bigdata = bigdata[:70000]
print (bigdata.shape)
print (bigdata.head(2))
print (bigdata.isnull().any())
## convert data into numeric / float

for c in products.columns:
    lbl = LabelEncoder()
    if products[c].dtype == 'object' and c not in ['transactionDate']:
        products[c] = lbl.fit_transform(products[c])
        
# for c in bigdata.columns:
#     lbl1 = LabelEncoder()
#     if bigdata[c].dtype == 'object' and c not in ['transactionDate', 'PaymentUsed', 'discountUsed', 'payment_amount_by_tender', 'product_description', 'product_code', 'promo_code', 'promotion_description', 'sale_price_after_promo', 'tender_type']:
#         bigdata[c] = lbl1.fit_transform(bigdata[c])
        

(161724, 19)
          DOB  Gender PaymentUsed   PinCode           State  customerID  \
0  1975-10-20    male         NaN  453441.0  MADHYA PRADESH  BBID_20482   
1  1955-07-21  female         NaN  999999.0        no_state  BBID_20485   

  discountUsed  payment_amount_by_tender  product_code  \
0      Payback                       NaN  1.000446e+09   
1      Payback                       NaN  1.000010e+09   

          product_description promo_code promotion_description  \
0       PATANJALI CHOCOS 125g   NONPROMO              no_promo   
1  SOO FRESH ROLLS HOT DOG 4P   NONPROMO              no_promo   

   sale_price_after_promo  store_code           store_description tender_type  \
0                    55.0        2655  BB-INDORE-MALHAR MEGA MALL         NaN   
1                    30.0        2655  BB-INDORE-MALHAR MEGA MALL         NaN   

   till_no transactionDate  transaction_number_by_till  
0        2      2016-05-01                       85246  
1        8      2017-03-09   

In [60]:
## scaling, creating matrix and running k-means

stores = list(set(products['store_code']))
print (stores)
cluster_labels = []
cluster_store = []
cluster_data = []
cluster_customers = []
cluster_score = []
cluster_scores = []
cluster_errors = []
# for i in range(2, 25):
#     cluster_score = []    
for x in stores:
    cld = products[products['store_code'] == x]
    cld1 = cproducts[products['store_code'] == x]
    cluster_customers.append(cld1['customerID'])
    cld.drop(['transactionDate', 'Gender', 'State', 'PinCode', 'DOB', 'promotion_description', 'promo_code', 'discountUsed'], axis=1, inplace=True)

#     c = bigdata[bigdata['store_code'] == x]
#     c.drop(['transactionDate', 'PaymentUsed', 'discountUsed', 'payment_amount_by_tender', 'product_description', 'product_code', 'promo_code', 'promotion_description', 'sale_price_after_promo', 'tender_type'], axis=1, inplace=True)    

    rbs = RobustScaler()
    #c = rbs.fit_transform(c)
    cld2_reduced = rbs.fit_transform(cld)

#         pca = decomposition.PCA(n_components=2)
#     #     pca.fit(cld2)
#     #     print (pca.explained_variance_)
#         #c_reduced = pca.fit_transform(c)
#         cld2_reduced = pca.fit_transform(cld2)


#     if (x==3692):
#         km1 = KMeans(n_clusters=2)
#     #     km1.fit(cld2_reduced)
#         #km2 = km1.fit(c_reduced)
#         label = km1.fit_predict(cld2_reduced)
#     elif (x == 4796 or x == 4986 or x==2906):
#         km1 = KMeans(n_clusters=3)
#     #     km1.fit(cld2_reduced)
#         #km2 = km1.fit(c_reduced)
#         label = km1.fit_predict(cld2_reduced)
#     elif (x==2655):
#         km1 = KMeans(n_clusters=5)
#     #     km1.fit(cld2_reduced)
#         #km2 = km1.fit(c_reduced)
#         label = km1.fit_predict(cld2_reduced)
#     elif (x==4843 or x==2615):
#         km1 = KMeans(n_clusters=8)
#     #     km1.fit(cld2_reduced)
#         #km2 = km1.fit(c_reduced)
#         label = km1.fit_predict(cld2_reduced)
    km1 = KMeans(n_clusters=2)
#     km1.fit(cld2_reduced)
    #km2 = km1.fit(c_reduced)
    label = km1.fit_predict(cld2_reduced)

    s_score = silhouette_score(cld2_reduced, label)
#         print ("Score", s_score)
    cluster_score.append(s_score)

    cluster_labels.append(label)
    cluster_store.append(np.repeat(x, cld.shape[0]))
    cluster_data.append(cld2_reduced)
#     print (i, cluster_score)
#     cluster_scores.append(cluster_score)
# #print (cluster_scores)    
# print (len(cluster_scores))

[4843, 3692, 2615, 2906, 4796, 4986, 2655]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [61]:
# import matplotlib.pyplot as plt
# %matplotlib inline
# for i in range(8):
#     plt.figure()
#     print (i+2, cluster_scores[i])
#     plt.scatter([i+2]*len(cluster_scores[i]), cluster_scores[i], color=['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'])
#     plt.xlabel('Number of clusters')
#     plt.ylabel('Mean silhouette score')
    
    
# # # check mean score per store
print (cluster_score)
print (np.mean(cluster_score))


# 2 clusters for store no 3692 
# 3 clsuters for store no 4796 4986 2906 
# 5 clusters for store no 2655
# 8 clsuters for store no 4843 2615

[0.9942880681813373, 0.99199826739820385, 0.99407724666182651, 0.99558465836194343, 0.99214752910339077, 0.99217967239295624, 0.99476008164349272]
0.993576503392


In [62]:
## merge list into ndarray
cluster_data = np.concatenate(cluster_data)

In [63]:
## check if the array has same rows as products file - Yes!
cluster_data.shape

(11972, 8)

In [64]:
## convert nested lists as 1d array
cluster_customers = np.concatenate(cluster_customers)
cluster_store = np.concatenate(cluster_store)
cluster_labels = np.concatenate(cluster_labels)

In [65]:
## create submission files
print (cluster_customers[:10])
sub1 = pd.DataFrame({'customerID':cluster_customers, 'store_code':cluster_store, 'cluster':cluster_labels})

['BBID_211492' 'BBID_211496' 'BBID_2114866' 'BBID_2114880' 'BBID_2114885'
 'BBID_2114885' 'BBID_2114944' 'BBID_2114953' 'BBID_2114967' 'BBID_2114979']


In [66]:
np.savetxt('../subOne_reduced_features.txt', cluster_data)
sub1.to_csv('../subtwo_reduced_features.csv', columns=['customerID', 'store_code', 'cluster'], index=False)

#### Next step - go to the folder in your laptop, where these files are saved, zip them and upload on the challenge page and get score.