In [48]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [49]:
# Load the training CSV from disk
trainData = pd.read_csv('/Users/eddiekayizzi/PycharmProjects/CaptialOneProj/GiveMeSomeCredit/cs-training.csv')

# Remove the unnamed index column and the two 30-59 / 60-89 day past-due
# counters, then discard every row that still contains a missing value
trainData = (
    trainData
    .drop(columns=["Unnamed: 0", 'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse'])
    .dropna()
)

# Placeholder loan amount: with 0 here the derived ratio column is 0.0 on every row
loanAmount = 0

# The 1e-5 epsilon in the denominator avoids division by zero
trainData = trainData.assign(LoanToIncomeRatio=loanAmount / (trainData['MonthlyIncome'] + 1e-5))

# Display the first ten rows of the prepared frame
trainData.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfDependents,LoanToIncomeRatio
0,1,0.766127,45,0.802982,9120.0,13,0,6,2.0,0.0
1,0,0.957151,40,0.121876,2600.0,4,0,0,1.0,0.0
2,0,0.65818,38,0.085113,3042.0,2,1,0,0.0,0.0
3,0,0.23381,30,0.03605,3300.0,5,0,0,0.0,0.0
4,0,0.907239,49,0.024926,63588.0,7,0,1,0.0,0.0
5,0,0.213179,74,0.375607,3500.0,3,0,1,1.0,0.0
7,0,0.754464,39,0.20994,3500.0,8,0,0,0.0,0.0
9,0,0.189169,57,0.606291,23684.0,9,0,4,2.0,0.0
10,0,0.644226,30,0.309476,2500.0,5,0,0,0.0,0.0
11,0,0.018798,51,0.531529,6501.0,7,0,2,2.0,0.0


In [50]:
# Label to predict: 'SeriousDlqin2yrs' is 1 for distress and 0 for no distress
y = trainData.loc[:, 'SeriousDlqin2yrs']

# Feature matrix: every remaining column except the label and the derived
# 'LoanToIncomeRatio' column
X = trainData.drop(['SeriousDlqin2yrs', 'LoanToIncomeRatio'], axis=1)

In [51]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Random forest of 100 trees, seeded for repeatable results
clf = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit the forest on the training portion (left as the final expression so the
# fitted estimator is still displayed as the cell output)
clf.fit(X_train, y_train)

In [96]:
# Fallback denominator used only when a ratio's denominator is exactly zero.
# Same magnitude as the 1e-5 epsilon the data-loading cell adds to MonthlyIncome.
_ZERO_DENOMINATOR_EPSILON = 1e-5


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, using a tiny epsilon when the denominator is zero."""
    if denominator == 0:
        denominator = _ZERO_DENOMINATOR_EPSILON
    return numerator / denominator


def predict_loan_qualification(age, MonthlyIncome, monthly_debt, lines_of_credit, num_open_loans, NumberOfTimes90DaysLate, NumberRealEstateLoansOrLines, total_credit_balance, credit_sum_limits, NumberOfDependents, loanAmount, model=None):
    """Score one applicant from 0 to 10, where higher means lower predicted distress.

    The raw applicant figures are converted into the feature columns the
    classifier was trained on, the probability of the positive class is taken
    from ``model.predict_proba`` and the score is ``(1 - probability) * 10``
    rounded to one decimal place.

    Parameters
    ----------
    age : number
        Passed through as the ``age`` feature.
    MonthlyIncome : number
        Passed through as ``MonthlyIncome`` and used as the denominator of
        ``DebtRatio``.
    monthly_debt : number
        Numerator of ``DebtRatio``.
    lines_of_credit, num_open_loans : number
        Summed into ``NumberOfOpenCreditLinesAndLoans``.
    NumberOfTimes90DaysLate, NumberRealEstateLoansOrLines, NumberOfDependents : number
        Passed through unchanged under the same feature names.
    total_credit_balance, credit_sum_limits : number
        Numerator and denominator of ``RevolvingUtilizationOfUnsecuredLines``.
    loanAmount : number
        Accepted so existing callers keep working; it does not influence the
        score because no loan-amount feature is sent to the model.
    model : object with ``predict_proba``, optional
        Classifier to use. When omitted, the notebook-level ``clf`` is looked
        up at call time, so a retrained ``clf`` is picked up without having to
        re-run this cell.

    Returns
    -------
    float
        Qualification score rounded to one decimal place.
    """
    if model is None:
        # Resolve late: a default bound at definition time would keep pointing
        # at a stale classifier after `clf` is retrained in another cell.
        model = clf

    # A zero income or zero credit limit would otherwise raise ZeroDivisionError.
    # Non-zero denominators are divided as-is, so ordinary inputs are unchanged.
    DebtRatio = _safe_ratio(monthly_debt, MonthlyIncome)
    RevolvingUtilizationOfUnsecuredLines = _safe_ratio(total_credit_balance, credit_sum_limits)
    NumberOfOpenCreditLinesAndLoans = num_open_loans + lines_of_credit

    # Single-row frame using the training feature names
    input_data = pd.DataFrame({
        'RevolvingUtilizationOfUnsecuredLines': [RevolvingUtilizationOfUnsecuredLines],
        'age': [age],
        'DebtRatio': [DebtRatio],
        'MonthlyIncome': [MonthlyIncome],
        'NumberOfOpenCreditLinesAndLoans': [NumberOfOpenCreditLinesAndLoans],
        'NumberOfTimes90DaysLate': [NumberOfTimes90DaysLate],
        'NumberRealEstateLoansOrLines': [NumberRealEstateLoansOrLines],
        'NumberOfDependents': [NumberOfDependents],
    })

    # If the model recorded its training column order, present the same columns
    # in that order. When the column sets differ the frame is passed unchanged
    # so the model reports the mismatch itself.
    trained_columns = getattr(model, 'feature_names_in_', None)
    if trained_columns is not None:
        trained_columns = list(trained_columns)
        if set(trained_columns) == set(input_data.columns):
            input_data = input_data[trained_columns]

    # Probability of the positive (distress) class for the single row
    prediction_proba = model.predict_proba(input_data)[:, 1]

    # Invert and rescale so that a low distress probability gives a high score
    score = (1 - prediction_proba) * 10

    return round(score[0], 1)  # Return the rounded score





In [97]:
# Example applicant, spelled out with keyword arguments so each figure is labelled
print(predict_loan_qualification(
    age=24,
    MonthlyIncome=600,
    monthly_debt=500,
    lines_of_credit=4,
    num_open_loans=4,
    NumberOfTimes90DaysLate=1,
    NumberRealEstateLoansOrLines=6,
    total_credit_balance=9000,
    credit_sum_limits=10000,
    NumberOfDependents=6,
    loanAmount=60000,
))

4.8


In [38]:
# Probability of class 1 for every row of the held-out set
# (second column of the predict_proba output)
predictions = clf.predict_proba(X_test)[:, 1]

# Map each probability onto a 0-10 score: the higher the predicted
# probability of distress, the lower the qualification score
loan_qualification_score = 10 * (1 - predictions)

In [47]:
# Round each score to one decimal place, keeping the result as a plain list
loan_qualification_score = list(map(lambda value: round(value, 1), loan_qualification_score))

# Show the first ten scores
print(loan_qualification_score[:10])

# Optional sanity check: label a row as class 1 when its probability is at
# least 0.5 and compare those labels with the true test labels
accuracy = accuracy_score(y_test, (predictions >= 0.5).astype(int))
print(f"Model Accuracy: {accuracy:.4f}")



[6.7, 9.2, 9.0, 9.7, 8.8, 5.9, 9.7, 10.0, 9.6, 10.0]
Model Accuracy: 0.9309
