#                            Credit Score Notebook 

In [None]:
#Importing libraries
import seaborn as sns 
import matplotlib.pyplot as plotly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

print("Libraries imported successfully....")

In [None]:
# Read the training and test CSV files from the working directory
df, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
print("Data imported successfully....")

In [None]:
# NOTE: drop() returns a new DataFrame and the result is not assigned here, so
# this only displays a copy without 'Payment_Behaviour'; `df` itself keeps the
# column (a later cell still selects it from `df`).
df.drop('Payment_Behaviour', axis=1)

In [None]:
# Count the missing (NaN) entries in every column of df
missing_values_count = df.isna().sum()

# Display the per-column totals
missing_values_count

## Relevant null values were dropped during mining via query.

In [None]:
# Inspect the dtype pandas assigned to each column
df.dtypes

##### Missing data handling plan
1. Name: We won't be using this as a feature.
2. Monthly inhand salary, Number of delayed payments, Num credit inquiries, Amount invested monthly, Monthly balance: all set to zero.
3. Type of Loan: will be split into one-hot columns and null values will sort themselves out.
4. Credit history age will be feature-engineered into a numeric value and nulls will be set to zero.

There is data for eight months of the year for each client. Upon research, I found that credit score is not dependent on the previous month, hence each entry in the dataset can and will be used independently. Trying a time series trick would be overfitting.


#### Numeric values handling

In [None]:
#Credit history age to months
def age_to_months(age_str):
    """Convert a credit-history age string into a total number of months.

    The year count is read from the first whitespace-separated token and the
    month count from the fourth (e.g. ``'22 Years and 1 Months'`` -> 265).

    Parameters
    ----------
    age_str : str or missing value
        Raw cell value from the Credit_History_Age column.

    Returns
    -------
    int or float
        ``years * 12 + months`` as an int, or NaN when the value is missing,
        empty, whitespace-only, or does not have integers in the expected
        positions.
    """
    if pd.isna(age_str):
        return float('nan')

    tokens = str(age_str).split()
    try:
        years = int(tokens[0])
        months = int(tokens[3])
    except (IndexError, ValueError):
        # Too few tokens (including '' and blank strings) or non-numeric
        # tokens: treat as missing instead of aborting the whole column apply.
        return float('nan')

    return years * 12 + months

# Overwrite Credit_History_Age with the per-row result of age_to_months
df['Credit_History_Age'] = df['Credit_History_Age'].apply(age_to_months)

# Columns that are cleaned and converted to numeric values
numeric_cols = [
    'Annual_Income',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Num_Credit_Inquiries',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio',
    'Credit_History_Age',
    'Total_EMI_per_month',
    'Amount_invested_monthly',
    'Monthly_Balance',
]

# Columns plotted as categorical variables during exploration
categorical_cols = [
    'Occupation',
    'Credit_Mix',
    'Payment_of_Min_Amount',
]


# Cleaning up underscored columns

# Show the columns that will have '_' removed and be converted to numbers
numeric_cols

In [None]:
# Strip stray '_' characters from every numeric column and convert to numbers.
for col in numeric_cols:
    column = df[col]
    if not pd.api.types.is_numeric_dtype(column):
        # Series.replace('_', '') only matches cells that are exactly '_',
        # so a value such as '28_' would keep its underscore and then be
        # coerced to NaN. .str.replace removes the character inside values.
        # astype(str) keeps non-string cells usable; missing values become
        # text that to_numeric coerces back to NaN below.
        column = column.astype(str).str.replace('_', '', regex=False)
    # Anything that still is not a valid number becomes NaN
    df[col] = pd.to_numeric(column, errors='coerce')

# print the updated DataFrame
print(df)

 #### Categorical values handling

1. Segregate annual income into high-, medium- and low-paying categories and use ordinal encoding
2. Month to be transformed into ordinal encoding
3. Separate types of loans and Payment_Behaviour by commas and count number of loans
4. Transform Credit Mix and Payment_of_Min_Amount with ordinal encoding


In [None]:
#Annual salaries feature engineering

# Define the income thresholds for each category
income_bins = [0, 30000, 70000, float('inf')]
income_labels = ['Low Income', 'Medium Income', 'High Income']

# Segregate the annual income into categories using pandas cut().
# include_lowest=True closes the first bin on the left, so an income of
# exactly 0 is labelled 'Low Income' instead of falling outside every bin (NaN).
df['Income_Category'] = pd.cut(
    df['Annual_Income'],
    bins=income_bins,
    labels=income_labels,
    include_lowest=True,
)

# Plot the frequency of each income category. value_counts() orders by
# frequency, so reindex to keep the bars in Low -> Medium -> High order.
income_counts = df['Income_Category'].value_counts().reindex(income_labels)
income_counts.plot(kind='bar', rot=0)

# Set the axis labels and title
plt.xlabel('Income_Category')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Annual Income by Category')

# Show the plot
plt.show()


In [None]:
# Preview the text/categorical columns together with Credit_Score
df.loc[
    :,
    [
        'Month',
        'Occupation',
        'Type_of_Loan',
        'Credit_Mix',
        'Payment_of_Min_Amount',
        'Payment_Behaviour',
        'Credit_Score',
    ],
]

In [None]:
# Count how many rows hold each distinct Payment_of_Min_Amount value
unique_count = df['Payment_of_Min_Amount'].value_counts()
print(unique_count)

#### Exploration

In [None]:
# Draw one figure per numeric column, showing its distribution for each
# Credit_Score value as a box plot
for col in numeric_cols:
    _, axis = plt.subplots()
    sns.boxplot(data=df, x='Credit_Score', y=col, ax=axis)
    axis.set_title(col)

plt.show()


In [None]:
# For every categorical column, draw a stacked bar chart of how its values
# are distributed within each Credit_Score class
for col in categorical_cols:
    # Rows: Credit_Score values; columns: values of the current column
    counts = df.groupby(['Credit_Score', col]).size().unstack()

    # Divide each row by its total so the bars show proportions, not counts
    row_totals = counts.sum(axis=1)
    grouped = counts.div(row_totals, axis=0)

    # Stacked bars, one per Credit_Score value
    ax = grouped.plot(kind='bar', stacked=True)
    ax.set_title(col)
    ax.set_xlabel('Credit_Score')
    ax.set_ylabel('Proportion')
    plt.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
#from sklearn.svm import SVC


# Categorical and numerical model inputs
cat_cols = ['Credit_Mix', 'Payment_of_Min_Amount']
num_cols = [
    'Monthly_Inhand_Salary',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio',
    'Credit_History_Age',
    'Total_EMI_per_month',
    'Amount_invested_monthly',
    'Monthly_Balance',
]

# Define the features and label. The feature list is built from the two
# column groups so it cannot drift out of sync with what the transformers use.
features = num_cols + cat_cols
label = 'Credit_Score'

# Encode the label column as numeric codes. OrdinalEncoder needs a 2-D input
# and returns a 2-D column; ravel() flattens it to the 1-D target that
# scikit-learn classifiers expect (a column vector triggers a warning).
label_encoder = OrdinalEncoder()
y = label_encoder.fit_transform(df[label].values.reshape(-1, 1)).ravel()

# Split the dataset into train (70%) and test (30%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(df[features], y, test_size=0.3, random_state=0)

# Numeric columns: fill missing values with the column mean, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the test split
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print(f'Training Set: {len(X_train)}, Test Set: {len(X_test)} \n')

# Print the transformed arrays ("\n" starts a new line; the original "/n" was
# printed literally)
print("X_train_Transformed \n", X_train_transformed)
print("X_test_Transformed \n", X_test_transformed)


#### Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score


from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

#mcm = confusion_matrix(y_test, predictions)

# Classifiers expect a 1-D target; flatten in case y_train is an (n, 1) column
train_labels = np.ravel(y_train)

# multi_class is left at its default ('auto'); passing it explicitly is
# deprecated in recent scikit-learn releases.
lr_model = LogisticRegression(solver='lbfgs', max_iter=10000).fit(X_train_transformed, train_labels)
kn_model = KNeighborsClassifier().fit(X_train_transformed, train_labels)
# random_state pins the randomised estimators so the reported scores can be
# reproduced from run to run (same seed as the train/test split).
dt_model = DecisionTreeClassifier(random_state=0).fit(X_train_transformed, train_labels)
rf_model = RandomForestClassifier(random_state=0).fit(X_train_transformed, train_labels)
nb_model = GaussianNB().fit(X_train_transformed, train_labels)
ab_model = AdaBoostClassifier(random_state=0).fit(X_train_transformed, train_labels)
#svc_model = SVC().fit(X_train_transformed,y_train)


# Fitted models, in the order they are evaluated
models = [lr_model, kn_model, dt_model, rf_model, ab_model, nb_model]

In [None]:
def classification_report_func(curr_model):
    """Print a short evaluation of ``curr_model`` on the held-out test split.

    Predicts on the notebook-level ``X_test_transformed``, prints the first
    15 predicted and actual labels side by side, then prints scikit-learn's
    classification report against ``y_test``.

    Parameters
    ----------
    curr_model : fitted estimator exposing ``predict``
    """
    head = slice(None, 15)  # how many labels to preview

    print('\n Current model is: \n', curr_model)
    predicted = curr_model.predict(X_test_transformed)

    print('Predicted labels: ', predicted[head])
    print('Actual labels   : ' ,y_test[head])
    report = classification_report(y_test, predicted)
    print(report)


# Evaluate every fitted model in turn
for model in models:
    classification_report_func(model)


##### Function for checking scores for each model hidden below.

##### Conclusion
Random Forest has the best accuracy score, at 0.73.


In [None]:
import pickle

# Save the fitted random forest to 'random_forest_model.pkl' in the working
# directory.
# NOTE(review): only the classifier is pickled; the fitted `preprocessor` is
# not saved here, so new data cannot be transformed the same way after
# reloading unless it is persisted separately.
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

The test dataset is incomplete in terms of columns.

In [None]:
#for col in numeric_cols:
#    test[col] = test[col].replace('_', '')
#    test[col] = pd.to_numeric(test[col], errors='coerce')


#test_transformed = preprocessor.fit_transform(test)

In [None]:
#import joblib

# Load the saved model
#rf_model = joblib.load('random_forest_model.pkl')

# Use the model to predict the target variable for the test data
#test_predictions = rf_model.predict(test_transformed)

# Print the predictions
#print(test_predictions)
