In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
def load_data(file_path):
    return pd.read_csv(file_path)

In [24]:
address = "C:/Users/hp/Desktop/Kifya/Week_5/credit-risk-model/data/"
df = load_data(address + "raw/data.csv")

df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])
snapshot_date=df["TransactionStartTime"].max()


# calculate the recency
recency_df=df.groupby("CustomerId")["TransactionStartTime"].max().reset_index()
recency_df["recency"]=(snapshot_date-recency_df["TransactionStartTime"]).dt.days

# calculate the frequency
frequency_df=df.groupby("CustomerId")["TransactionId"].nunique().reset_index()
frequency_df.columns=["CustomerId","frequency"]

# calculate the monetary
monetary_df=df.groupby("CustomerId")["Amount"].sum().reset_index()
monetary_df.columns=["CustomerId","monetary"]
rfm = recency_df.merge(frequency_df, on='CustomerId').merge(monetary_df, on='CustomerId')
rfm[rfm['CustomerId']== "CustomerId_4406"]

Unnamed: 0,CustomerId,TransactionStartTime,recency,frequency,monetary
2584,CustomerId_4406,2019-02-12 10:24:40+00:00,0,119,109921.75


In [4]:
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['recency', 'frequency', 'monetary']])

In [26]:
# clustering the data into 3 group based on value rfm
kmeans=KMeans(n_clusters=3,random_state=42)
clusters=kmeans.fit_predict(rfm_scaled)

rfm["clusters"]=clusters
cluster_summary = rfm.groupby('clusters')[['recency', 'frequency', 'monetary']].mean()
high_risk_cluster = cluster_summary['frequency'].idxmin()
rfm['is_high_risk'] = (rfm['clusters'] == high_risk_cluster).astype(int)

# merge the clusters columns with in rfm to the main cleaned data
df["is_high_risk"]=rfm['is_high_risk']
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,is_high_risk
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,1.0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,1.0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0,1.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,0.0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,0.0


In [27]:
df.to_csv(address + "Processed/Processed_task-4.csv", index=False)