In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
loan_df = pd.read_csv('/home/student/filtered_columns_dataset.csv')

In [33]:
# loan_df.head()

In [16]:
# loan_df.isnull().sum()

In [6]:
loan_df['annuity_amount'] = loan_df.groupby(['income_total', 'income_type'])['annuity_amount'].transform(lambda x: x.ffill().bfill())

In [13]:
# Calculate the means for the specified columns
mean_ext_source_1 = loan_df['ext_source_1'].mean()
mean_ext_source_2 = loan_df['ext_source_2'].mean()
mean_ext_source_3 = loan_df['ext_source_3'].mean()

# Display the calculated means
print("Mean of ext_source_1:", mean_ext_source_1)
print("Mean of ext_source_2:", mean_ext_source_2)
print("Mean of ext_source_3:", mean_ext_source_3)

Mean of ext_source_1: 0.5021298056566624
Mean of ext_source_2: 0.5143926741308464
Mean of ext_source_3: 0.5108529061799658


In [14]:
# Fill missing values with the calculated means
loan_df['ext_source_1'] = loan_df['ext_source_1'].fillna(mean_ext_source_1)
loan_df['ext_source_2'] = loan_df['ext_source_2'].fillna(mean_ext_source_2)
loan_df['ext_source_3'] = loan_df['ext_source_3'].fillna(mean_ext_source_3)

# Check to make sure there are no more missing values in the columns
print(loan_df[['ext_source_1', 'ext_source_2', 'ext_source_3']].isnull().sum())

ext_source_1    0
ext_source_2    0
ext_source_3    0
dtype: int64


In [17]:
# Encoding categorical features using LabelEncoder
categorical_columns = ['contract_type', 'gender', 'owns_car', 'owns_realty', 'income_type', 
                       'education_type', 'family_status']

In [20]:
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    loan_df[column] = le.fit_transform(loan_df[column])
    label_encoders[column] = le

In [22]:
# Split the data into features and target
X = loan_df.drop('target', axis=1)
y = loan_df['target']

In [23]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
# Impute remaining missing values across the dataset using the mean strategy
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [25]:
# Initialize and train the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_imputed, y_train)

In [27]:
# Make predictions on the test set
y_pred = rf_model.predict(X_test_imputed)

In [28]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [30]:
# Print the results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9197975155548811
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84841
           1       0.55      0.01      0.02      7413

    accuracy                           0.92     92254
   macro avg       0.73      0.51      0.49     92254
weighted avg       0.89      0.92      0.88     92254

