In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("Complete Loan Data.csv")
print(df['status'].unique())

['not applied' 'A' 'B' 'D' 'C']


In [3]:
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    if column != 'status':
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])

status_encoder = LabelEncoder()
df['status'] = status_encoder.fit_transform(df['status'])

In [4]:
print(df.isna().sum())

city                     0
disp_id                  0
account_id               0
full_name                0
district_id              0
frequency                0
parsed_date              0
card_id                  0
card_type                0
sex                      0
full_date_card           0
age                      0
social_security          0
phone                    0
email                    0
address                  0
state                    0
zipcode             107339
client_id                0
owner_type               0
state_name               0
state_abbrev             0
region                   0
division                 0
loan_id                  0
loan_amount              0
duration                 0
payments                 0
status                   0
purpose                  0
f1                       0
trans_id                 0
transaction_type         0
operation                0
trans_amount             0
balance                  0
k_symbol                 0
b

In [5]:
df.drop(columns=['zipcode'], inplace=True)
df.drop(columns=['age'], inplace=True)

In [6]:
imputer = SimpleImputer(strategy='mean')
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [7]:
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(df)
df['cluster'] = clusters
min_score = 300
max_score = 850
cluster_scores = {i: min_score + (max_score - min_score) * (i / (kmeans.n_clusters - 1)) for i in range(kmeans.n_clusters)}
df['credit_score'] = df['cluster'].map(cluster_scores)
def generate_recommendations(row):
    recommendations = []
    if row['credit_score'] < 580:
        recommendations.append('Consider improving your payment history.')
    if row['credit_score'] < 670:
        recommendations.append('Try to reduce your credit utilization.')
    return recommendations

df['recommendations'] = df.apply(generate_recommendations, axis=1)
print(df[['credit_score', 'recommendations']].head())

   credit_score                                    recommendations
0         575.0  [Consider improving your payment history., Try...
1         575.0  [Consider improving your payment history., Try...
2         575.0  [Consider improving your payment history., Try...
3         575.0  [Consider improving your payment history., Try...
4         575.0  [Consider improving your payment history., Try...


In [8]:
print(df[['credit_score']])

         credit_score
0               575.0
1               575.0
2               575.0
3               575.0
4               575.0
...               ...
1262620         300.0
1262621         300.0
1262622         300.0
1262623         300.0
1262624         300.0

[1262625 rows x 1 columns]


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv("Complete Loan Data.csv")

# Drop zipcode column
df.drop(columns=['zipcode'], inplace=True)

# Display unique status values
print(df['status'].unique())

# Check for missing values and handle them
print(df.isna().sum())

# Aggregate transaction data (assuming columns 'transaction_type' and 'transaction_amount' exist)
# Replace 'transaction_type' and 'transaction_amount' with the correct column names if they are different
transaction_agg = df.groupby('account_id').agg(
    num_credit_transactions=('transaction_type', lambda x: (x == 'credit').sum()),
    num_debit_transactions=('transaction_type', lambda x: (x == 'debit').sum()),
    total_credit_amount=('transaction_amount', lambda x: x[df['transaction_type'] == 'credit'].sum()),
    total_debit_amount=('transaction_amount', lambda x: x[df['transaction_type'] == 'debit'].sum())
).reset_index()

# Aggregate loan data
loan_agg = df.groupby('account_id').agg(
    total_loan_amount=('loan_amount', 'sum'),
    average_loan_duration=('loan_duration', 'mean'),
    num_loans=('loan_amount', 'count')
).reset_index()

# Aggregate credit card data (assuming 'credit_card' is a boolean column)
credit_card_agg = df.groupby('account_id').agg(
    num_credit_cards=('credit_card', 'sum')  # Assuming credit_card is boolean or 1/0
).reset_index()

# MERGE
account_df = pd.merge(transaction_agg, loan_agg, on='account_id', how='left')
account_df = pd.merge(account_df, credit_card_agg, on='account_id', how='left')

# FEATURE ENGINEERING
account_df['payment_history'] = account_df['num_credit_transactions'] / (account_df['num_credit_transactions'] + account_df['num_debit_transactions'])
account_df['amount_owed'] = account_df['total_loan_amount'] / (account_df['total_loan_amount'] + account_df['total_credit_amount'])
account_df['length_of_credit_history'] = account_df['average_loan_duration']
account_df['num_credit_accounts'] = account_df['num_credit_cards'] + account_df['num_loans']

account_df.fillna(0, inplace=True)

# STANDARDIZE
scaler = StandardScaler()
features_to_scale = ['payment_history', 'amount_owed', 'length_of_credit_history', 'num_credit_accounts']
account_df[features_to_scale] = scaler.fit_transform(account_df[features_to_scale])

# WEIGHTS
account_df['credit_score'] = (account_df['payment_history'] * 0.35 +
                              account_df['amount_owed'] * 0.30 +
                              account_df['length_of_credit_history'] * 0.15 +
                              account_df['num_credit_accounts'] * 0.20)

# SCALING
min_score = 300
max_score = 850
account_df['credit_score'] = min_score + (max_score - min_score) * (account_df['credit_score'] - account_df['credit_score'].min()) / (account_df['credit_score'].max() - account_df['credit_score'].min())

plt.figure(figsize=(10, 6))
sns.scatterplot(x='account_id', y='credit_score', data=account_df, palette='viridis')
plt.title('Customer Credit Scores')
plt.show()

# 
def generate_recommendations(row):
    recommendations = []
    if row['credit_score'] < 580:
        recommendations.append('Consider improving your payment history.')
    if row['credit_score'] < 670:
        recommendations.append('Try to reduce your credit utilization.')
    return recommendations

account_df['recommendations'] = account_df.apply(generate_recommendations, axis=1)
print(account_df[['account_id', 'credit_score', 'recommendations']].head())


['not applied' 'A' 'B' 'D' 'C']
city                0
disp_id             0
account_id          0
full_name           0
district_id         0
frequency           0
parsed_date         0
card_id             0
card_type           0
sex                 0
full_date_card      0
age                 0
social_security     0
phone               0
email               0
address             0
state               0
client_id           0
owner_type          0
state_name          0
state_abbrev        0
region              0
division            0
loan_id             0
loan_amount         0
duration            0
payments            0
status              0
purpose             0
f1                  0
trans_id            0
transaction_type    0
operation           0
trans_amount        0
balance             0
k_symbol            0
bank                0
dtype: int64


KeyError: "Column(s) ['transaction_amount'] do not exist"