In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [None]:
# Read only the first 50,000 rows of the application table.
# NOTE(review): the path is an absolute, machine-specific location — consider a configurable data directory.
csv_path = '/home/student/application_train.csv'
loan_df = pd.read_csv(csv_path, nrows=50000)

In [None]:
# loan_df.head()

In [None]:
# Mapping from the raw upper-case column names in the CSV to the readable
# snake_case names used by every later cell (e.g. TARGET -> loan_status,
# SK_ID_CURR -> client_id).
# Note that FLAG_EMP_PHONE becomes 'has_work_phone' while FLAG_WORK_PHONE
# becomes 'has_work_phone_alt'.
new_column_names = {
    'SK_ID_CURR': 'client_id',
    'TARGET': 'loan_status',
    'NAME_CONTRACT_TYPE': 'loan_type',
    'CODE_GENDER': 'gender',
    'FLAG_OWN_CAR': 'owns_car',
    'FLAG_OWN_REALTY': 'owns_property',
    'CNT_CHILDREN': 'num_children',
    'AMT_INCOME_TOTAL': 'total_income',
    'AMT_CREDIT': 'credit_amount',
    'AMT_ANNUITY': 'annuity_amount',
    'AMT_GOODS_PRICE': 'goods_price',
    'NAME_TYPE_SUITE': 'accompaniment_type',
    'NAME_INCOME_TYPE': 'income_type',
    'NAME_EDUCATION_TYPE': 'education_level',
    'NAME_FAMILY_STATUS': 'family_status',
    'NAME_HOUSING_TYPE': 'housing_type',
    'REGION_POPULATION_RELATIVE': 'region_population_relative',
    'DAYS_BIRTH': 'age_in_days',
    'DAYS_EMPLOYED': 'days_employed',
    'DAYS_REGISTRATION': 'days_registered',
    'DAYS_ID_PUBLISH': 'days_ID_published',
    'OWN_CAR_AGE': 'car_age',
    'FLAG_MOBIL': 'has_mobile',
    'FLAG_EMP_PHONE': 'has_work_phone',
    'FLAG_WORK_PHONE': 'has_work_phone_alt',
    'FLAG_CONT_MOBILE': 'has_contactable_mobile',
    'FLAG_PHONE': 'has_phone',
    'FLAG_EMAIL': 'has_email',
    'OCCUPATION_TYPE': 'occupation',
    'CNT_FAM_MEMBERS': 'family_size',
    'REGION_RATING_CLIENT': 'region_rating',
    'REGION_RATING_CLIENT_W_CITY': 'region_rating_with_city',
    'WEEKDAY_APPR_PROCESS_START': 'application_weekday',
    'HOUR_APPR_PROCESS_START': 'application_hour',
    'REG_REGION_NOT_LIVE_REGION': 'region_not_living',
    'REG_REGION_NOT_WORK_REGION': 'region_not_working',
    'LIVE_REGION_NOT_WORK_REGION': 'living_region_not_work_region',
    'REG_CITY_NOT_LIVE_CITY': 'city_not_living',
    'REG_CITY_NOT_WORK_CITY': 'city_not_working',
    'LIVE_CITY_NOT_WORK_CITY': 'living_city_not_work_city',
    'ORGANIZATION_TYPE': 'employer_type',
    'EXT_SOURCE_1': 'external_score_1',
    'EXT_SOURCE_2': 'external_score_2',
    'EXT_SOURCE_3': 'external_score_3',
    'APARTMENTS_AVG': 'apartment_average_size',
    'BASEMENTAREA_AVG': 'basement_average_area',
    'YEARS_BEGINEXPLUATATION_AVG': 'years_since_building_use',
    'YEARS_BUILD_AVG': 'building_age',
    'COMMONAREA_AVG': 'common_area_avg',
    'ELEVATORS_AVG': 'elevators_avg',
    'ENTRANCES_AVG': 'entrances_avg',
    'FLOORSMAX_AVG': 'max_floors_avg',
    'FLOORSMIN_AVG': 'min_floors_avg',
    'LANDAREA_AVG': 'land_area_avg',
    'LIVINGAPARTMENTS_AVG': 'living_apartments_avg',
    'LIVINGAREA_AVG': 'living_area_avg',
    'NONLIVINGAPARTMENTS_AVG': 'non_living_apartments_avg',
    'NONLIVINGAREA_AVG': 'non_living_area_avg',
    'APARTMENTS_MODE': 'apartment_mode',
    'BASEMENTAREA_MODE': 'basement_mode',
    'YEARS_BEGINEXPLUATATION_MODE': 'years_building_use_mode',
    'YEARS_BUILD_MODE': 'building_age_mode',
    'COMMONAREA_MODE': 'common_area_mode',
    'ELEVATORS_MODE': 'elevators_mode',
    'ENTRANCES_MODE': 'entrances_mode',
    'FLOORSMAX_MODE': 'max_floors_mode',
    'FLOORSMIN_MODE': 'min_floors_mode',
    'LANDAREA_MODE': 'land_area_mode',
    'LIVINGAPARTMENTS_MODE': 'living_apartments_mode',
    'LIVINGAREA_MODE': 'living_area_mode',
    'NONLIVINGAPARTMENTS_MODE': 'non_living_apartments_mode',
    'NONLIVINGAREA_MODE': 'non_living_area_mode',
    'APARTMENTS_MEDI': 'apartment_median_size',
    'BASEMENTAREA_MEDI': 'basement_median_area',
    'YEARS_BEGINEXPLUATATION_MEDI': 'years_building_use_median',
    'YEARS_BUILD_MEDI': 'building_age_median',
    'COMMONAREA_MEDI': 'common_area_median',
    'ELEVATORS_MEDI': 'elevators_median',
    'ENTRANCES_MEDI': 'entrances_median',
    'FLOORSMAX_MEDI': 'max_floors_median',
    'FLOORSMIN_MEDI': 'min_floors_median',
    'LANDAREA_MEDI': 'land_area_median',
    'LIVINGAPARTMENTS_MEDI': 'living_apartments_median',
    'LIVINGAREA_MEDI': 'living_area_median',
    'NONLIVINGAPARTMENTS_MEDI': 'non_living_apartments_median',
    'NONLIVINGAREA_MEDI': 'non_living_area_median',
    'FONDKAPREMONT_MODE': 'house_fund_mode',
    'HOUSETYPE_MODE': 'house_type_mode',
    'TOTALAREA_MODE': 'total_area_mode',
    'WALLSMATERIAL_MODE': 'walls_material_mode',
    'EMERGENCYSTATE_MODE': 'emergency_state_mode',
    'OBS_30_CNT_SOCIAL_CIRCLE': 'social_circle_obs_30',
    'DEF_30_CNT_SOCIAL_CIRCLE': 'social_circle_def_30',
    'OBS_60_CNT_SOCIAL_CIRCLE': 'social_circle_obs_60',
    'DEF_60_CNT_SOCIAL_CIRCLE': 'social_circle_def_60',
    'DAYS_LAST_PHONE_CHANGE': 'days_since_last_phone_change',
    'FLAG_DOCUMENT_2': 'flag_document_2',
    'FLAG_DOCUMENT_3': 'flag_document_3',
    'FLAG_DOCUMENT_4': 'flag_document_4',
    'FLAG_DOCUMENT_5': 'flag_document_5',
    'FLAG_DOCUMENT_6': 'flag_document_6',
    'FLAG_DOCUMENT_7': 'flag_document_7',
    'FLAG_DOCUMENT_8': 'flag_document_8',
    'FLAG_DOCUMENT_9': 'flag_document_9',
    'FLAG_DOCUMENT_10': 'flag_document_10',
    'FLAG_DOCUMENT_11': 'flag_document_11',
    'FLAG_DOCUMENT_12': 'flag_document_12',
    'FLAG_DOCUMENT_13': 'flag_document_13',
    'FLAG_DOCUMENT_14': 'flag_document_14',
    'FLAG_DOCUMENT_15': 'flag_document_15',
    'FLAG_DOCUMENT_16': 'flag_document_16',
    'FLAG_DOCUMENT_17': 'flag_document_17',
    'FLAG_DOCUMENT_18': 'flag_document_18',
    'FLAG_DOCUMENT_19': 'flag_document_19',
    'FLAG_DOCUMENT_20': 'flag_document_20',
    'FLAG_DOCUMENT_21': 'flag_document_21',
    'AMT_REQ_CREDIT_BUREAU_HOUR': 'credit_requests_hour',
    'AMT_REQ_CREDIT_BUREAU_DAY': 'credit_requests_day',
    'AMT_REQ_CREDIT_BUREAU_WEEK': 'credit_requests_week',
    'AMT_REQ_CREDIT_BUREAU_MON': 'credit_requests_month',
    'AMT_REQ_CREDIT_BUREAU_QRT': 'credit_requests_quarter',
    'AMT_REQ_CREDIT_BUREAU_YEAR': 'credit_requests_year'
}

# Apply the renaming to the dataframe. The rename mutates loan_df in place,
# so every later cell must refer to columns by the new names above.
loan_df.rename(columns=new_column_names, inplace=True)

In [None]:
# Lift pandas' display truncation so full per-column listings can be printed.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

# Number of null cells in each column of the renamed frame.
missing_values = loan_df.isna().sum()

# Restrict the report to columns that have at least one missing value.
missing_columns = missing_values.loc[missing_values > 0]

In [None]:
def _fill_within_group(values):
    """Forward-fill, then back-fill, the gaps in one group's values."""
    return values.ffill().bfill()


# Borrow a missing annuity from another row that shares the same
# (total_income, income_type) pair; groups with no known value stay NaN.
annuity_groups = loan_df.groupby(['total_income', 'income_type'])['annuity_amount']
loan_df['annuity_amount'] = annuity_groups.transform(_fill_within_group)

In [None]:
# Columns whose missing values are replaced by that column's OWN 25th percentile.
# The earlier copy-pasted version filled entrances_median, living_area_median,
# days_since_last_phone_change and the four social_circle_* columns with the
# 25th percentile of credit_requests_year (a different column); driving the
# fill from one list removes that class of mistake and the duplicated lines.
quantile_fill_cols = [
    'goods_price',
    'car_age',
    'credit_requests_hour',
    'credit_requests_day',
    'credit_requests_week',
    'credit_requests_month',
    'credit_requests_quarter',
    'entrances_median',
    'living_area_median',
    'days_since_last_phone_change',
    'external_score_1',
    'external_score_2',
    'external_score_3',
    'social_circle_obs_30',
    'social_circle_def_30',
    'social_circle_obs_60',
    'social_circle_def_60',
    'non_living_apartments_median',
    'land_area_mode',
    'living_apartments_mode',
    'living_area_mode',
    'non_living_apartments_mode',
    'non_living_area_mode',
    'apartment_median_size',
    'entrances_avg',
    'basement_median_area',
    'years_building_use_median',
    'building_age_median',
    'living_apartments_avg',
    'non_living_apartments_avg',
    'common_area_median',
    'elevators_median',
    'max_floors_median',
    'min_floors_median',
    'land_area_median',
    'living_apartments_median',
    'non_living_area_median',
    'living_area_avg',
    'family_size',
    'non_living_area_avg',
    'total_area_mode',
    # Catches annuity values the group-wise ffill/bfill could not resolve.
    'annuity_amount',
]

for col in quantile_fill_cols:
    # The quantile is computed over the non-null values of the same column.
    loan_df[col] = loan_df[col].fillna(loan_df[col].quantile(0.25))

# accompaniment_type is filled with its most frequent value rather than a quantile.
loan_df['accompaniment_type'] = loan_df['accompaniment_type'].fillna(loan_df['accompaniment_type'].mode()[0])

In [None]:
# Replace any remaining gaps in goods_price with the column's 25th percentile.
goods_price_q1 = loan_df['goods_price'].quantile(0.25)
loan_df['goods_price'] = loan_df['goods_price'].fillna(goods_price_q1)

In [None]:
# Columns whose missing values are replaced by that column's OWN most frequent value.
# The earlier copy-pasted version filled house_fund_mode with the mode of
# common_area_mode and walls_material_mode with the mode of house_type_mode,
# i.e. with values taken from a different column; it also listed occupation twice.
mode_fill_cols = [
    'max_floors_mode',
    'occupation',
    'min_floors_mode',
    'elevators_mode',
    'apartment_average_size',
    'basement_average_area',
    'years_since_building_use',
    'building_age',
    'common_area_avg',
    'elevators_avg',
    'entrances_avg',
    'max_floors_avg',
    'min_floors_avg',
    'land_area_avg',
    'living_apartments_avg',
    'non_living_apartments_avg',
    'apartment_mode',
    'basement_mode',
    'years_building_use_mode',
    'building_age_mode',
    'house_fund_mode',
    'walls_material_mode',
    'emergency_state_mode',
    'entrances_mode',
    'house_type_mode',
    'common_area_mode',
    'credit_requests_year',
]

for col in mode_fill_cols:
    # mode() can return several tied values; [0] takes the first of them.
    loan_df[col] = loan_df[col].fillna(loan_df[col].mode()[0])

In [None]:
# Derive employment length in whole years from days_employed.
# The absolute value is taken so negative day counts become positive
# durations; 365.25 days are counted per year.
employment_span_days = loan_df['days_employed'].abs()
loan_df['employed_years'] = employment_span_days.div(365.25).round(0).astype(int)

In [None]:
# Target is loan_status; the feature frame is every other column except the client id.
non_feature_cols = ['client_id', 'loan_status']
y = loan_df['loan_status']
X = loan_df.drop(columns=non_feature_cols)

In [None]:
# Columns fed to the models. Order matters: it fixes the column order of the
# feature matrix built from this list.
features = [
    'total_income', 'credit_amount', 'num_children',
    'gender', 'family_status', 'housing_type',
    'region_population_relative', 'age_in_days',
    'owns_car', 'occupation',
]

In [None]:
# Create the feature matrix and target variable.
# .copy() makes X an independent frame: later cells assign encoded values
# into X's columns, and doing that on a bare selection of loan_df triggers
# pandas' SettingWithCopyWarning and leaves ownership of the data ambiguous.
X = loan_df[features].copy()
y = loan_df['loan_status']

In [None]:
# Show every row/column when printing, so the report below is not truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Re-count nulls per column after the imputation cells and print only the
# columns that still contain missing values.
missing_values = loan_df.isnull().sum()
missing_columns = missing_values[missing_values.gt(0)]
print(missing_columns)

In [None]:
# Frequency encoding: every object-dtype column of X has each category
# replaced by the share of rows in which that category occurs.
categorical_cols = list(X.select_dtypes(include=['object']).columns)
row_count = len(X)

for name in categorical_cols:
    share_by_category = X[name].value_counts() / row_count
    X.loc[:, name] = X[name].map(share_by_category)

# Sanity check: features and target must still have the same number of rows.
print(len(X), len(y))

In [None]:
import pickle

# List of categorical columns
categorical_cols = ['gender', 'family_status', 'housing_type', 'owns_car', 'occupation']

# Create, save and apply the frequency encodings.
for col in categorical_cols:
    # Build the table from the raw category labels in loan_df, not from X:
    # an earlier cell may already have replaced X[col] with frequencies, in
    # which case value_counts() on X would produce a frequency -> frequency
    # table that cannot encode a category label at inference time.
    frequency_encoding = loan_df[col].value_counts() / len(loan_df)

    # Persist the category -> frequency table for reuse outside the notebook.
    with open(f'{col}_frequency_encoding.pkl', 'wb') as f:
        pickle.dump(frequency_encoding, f)

    # Encode from the raw labels as well, so re-running this cell is idempotent.
    # Assignment aligns on the row index X shares with loan_df.
    X[col] = loan_df[col].map(frequency_encoding)

print("Frequency encodings have been created and saved.")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize the two monetary columns. The scaler learns its statistics from
# the training rows only and is then applied unchanged to the test rows.
amount_cols = ['total_income', 'credit_amount']
scaler = StandardScaler()
X_train[amount_cols] = scaler.fit_transform(X_train[amount_cols])
X_test[amount_cols] = scaler.transform(X_test[amount_cols])

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit a random forest; class_weight='balanced' re-weights the classes to
# compensate for their unequal frequencies.
model = RandomForestClassifier(random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)

# Per-class precision/recall/F1, then overall accuracy.
print(classification_report(y_test, y_pred, zero_division=1))
accuracy = accuracy_score(y_test, y_pred)
print(f'Overall Accuracy: {accuracy:.2f}')

# Confusion matrix rendered as an annotated heatmap.
# NOTE(review): the tick labels assume loan_status 0 means 'Not Approved'
# and 1 means 'Approved' — confirm against the data dictionary.
cm = confusion_matrix(y_test, y_pred)
class_labels = ['Not Approved', 'Approved']
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


# Show the column names the fitted model recorded at training time.
print("Features used for training the model:")
print(model.feature_names_in_)

# Model Serialization

In [None]:
import pickle

# Persist the fitted random forest to disk.
model_path = 'machine_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:

import pickle

# Save the scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Save frequency encodings for categorical columns.
# The tables are built from the raw category labels in loan_df: by this point
# X[col] holds encoded frequencies, so value_counts() on X would write a
# frequency -> frequency table and overwrite any usable encoding file.
categorical_cols = ['gender', 'family_status', 'housing_type', 'owns_car', 'occupation']
for col in categorical_cols:
    frequency_encoding = loan_df[col].value_counts() / len(loan_df)
    with open(f'{col}_frequency_encoding.pkl', 'wb') as f:
        pickle.dump(frequency_encoding, f)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the SMOTE-balanced training set in this cell. Without this,
# X_train_sm / y_train_sm are first created by a later cell, so a fresh
# top-to-bottom run would stop here with a NameError.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Train Logistic Regression model on the resampled training rows
log_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
log_model.fit(X_train_sm, y_train_sm)

# Make predictions on the untouched (not resampled) test rows
y_pred = log_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred, zero_division=1))

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Overall Accuracy: {accuracy:.2f}')

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
# NOTE(review): the tick labels assume loan_status 0 means 'Not Approved'
# and 1 means 'Approved' — confirm against the data dictionary.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Approved', 'Approved'], yticklabels=['Not Approved', 'Approved'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Mean-impute any remaining NaNs and rebuild a DataFrame carrying the
# original column labels (the imputer itself returns a bare array).
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X)
X = pd.DataFrame(imputed_values, columns=X.columns)

# Fresh 80/20 split of the imputed features.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Oversample the training rows with SMOTE; the test rows are left as they are.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Standardize the two monetary columns: fit on the resampled training rows,
# then apply the same transform to the test rows.
amount_cols = ['total_income', 'credit_amount']
scaler = StandardScaler()
X_train_sm[amount_cols] = scaler.fit_transform(X_train_sm[amount_cols])
X_test[amount_cols] = scaler.transform(X_test[amount_cols])

# Fit the linear SVM and score the held-out rows.
svm_model = LinearSVC(random_state=42, class_weight='balanced', max_iter=10000)
svm_model.fit(X_train_sm, y_train_sm)
y_pred = svm_model.predict(X_test)

# Per-class report followed by overall accuracy.
print(classification_report(y_test, y_pred, zero_division=1))
accuracy = accuracy_score(y_test, y_pred)
print(f'Overall Accuracy: {accuracy:.2f}')

# Confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
class_labels = ['Not Approved', 'Approved']
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit gradient boosting on the SMOTE-resampled training rows and score the
# held-out rows.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_sm, y_train_sm)
y_pred = gb_model.predict(X_test)

# Per-class report followed by overall accuracy.
print(classification_report(y_test, y_pred, zero_division=1))
accuracy = accuracy_score(y_test, y_pred)
print(f'Overall Accuracy: {accuracy:.2f}')

# Confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
tick_labels = ['Not Approved', 'Approved']
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()