In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
loan_df = pd.read_csv('/home/student/application_train.csv', nrows=10000)

In [3]:
loan_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Column Description with their relevance to loan application
## Personal and Demographic Information
CODE_GENDER: Gender of the applicant. This feature can help capture behavioral differences that may indirectly influence credit risk (but be mindful of fairness in using this variable).<br>
FLAG_OWN_CAR and FLAG_OWN_REALTY: Indicates whether the applicant owns a car or real estate. Ownership of assets typically correlates with lower credit risk because these can serve as collateral or demonstrate financial stability.<br>
CNT_CHILDREN: Number of children in the household. More dependents can lead to higher financial obligations, potentially increasing default risk.<br>
DAYS_BIRTH: The age of the applicant (in days). Age can impact credit risk, with younger individuals often seen as higher risk, but it could also correlate with different financial behaviors.<br>
OCCUPATION_TYPE: This reflects the applicant's profession. Certain professions are more stable (e.g., government jobs) and could indicate lower risk compared to more unstable occupations.<br>
NAME_EDUCATION_TYPE: Education level can affect loan decisions since higher education is often associated with better job stability and income.<br>
NAME_FAMILY_STATUS: Marital status might influence financial stability, with married individuals often seen as more stable.<br>
CNT_FAM_MEMBERS: Number of family members, similar to children count, indicates financial dependency, which might increase credit risk.<br>

## Income and Loan Details
AMT_INCOME_TOTAL: Total annual income of the applicant. Higher income typically implies greater ability to repay the loan.<br>
AMT_CREDIT: Total amount of credit requested. A larger requested loan amount relative to income might increase risk.<br>
AMT_ANNUITY: The annual loan repayment amount. The proportion of annuity to income<br>(AMT_ANNUITY/AMT_INCOME_TOTAL) can indicate loan affordability.<br>
AMT_GOODS_PRICE: The price of the goods for which the loan is taken (e.g., equipment or seeds for farmers). This may help assess whether the loan is reasonable compared to income and other factors.<br>

## Employment Information
DAYS_EMPLOYED: How long the applicant has been employed. Longer employment periods are usually associated with greater stability and a lower likelihood of default.<br>
FLAG_EMP_PHONE: Indicates whether the applicant provided a work phone. Providing more contact details can be an indicator of transparency and stability.<br>
FLAG_WORK_PHONE, FLAG_MOBIL, FLAG_PHONE, FLAG_EMAIL: These flags indicate whether the applicant provided various forms of contact. Availability of contact information might help reduce risk, as the bank can reach the applicant more easily.<br>


## External Scores
EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3: External credit scores or sources. These scores are likely provided by credit bureaus or other third-party sources and are highly relevant for predicting creditworthiness.

## Housing and Property Features
NAME_HOUSING_TYPE: Type of housing (e.g., apartment, house). Certain housing types could be associated with more financial stability.<br>
APARTMENTS_AVG, FLOORSMAX_AVG, YEARS_BUILD_AVG: These features describe the condition or characteristics of the applicant’s property. Newer or more valuable property can indicate better financial security.<br>
OWN_CAR_AGE: Age of the applicant's car. A newer car might be a proxy for financial status.<br>
LANDAREA_AVG, LIVINGAREA_AVG, COMMONAREA_AVG: Average sizes of various areas of the property. Larger or better-maintained properties can imply higher wealth and lower credit risk.<br>


## Regional and Population Features
REGION_POPULATION_RELATIVE: This reflects the relative population of the region where the applicant lives. Living in a densely populated region might impact the economic opportunities available and influence loan decisions.<br>
REG_REGION_NOT_LIVE_REGION, REG_CITY_NOT_LIVE_CITY: Flags indicating whether the applicant's work region is different from their living region. This might affect commuting costs or job stability.<br>


## Loan Application Information
NAME_CONTRACT_TYPE: The type of loan contract (cash loans, revolving loans). Different loan types might have different risk profiles (e.g., revolving credit is generally riskier than installment loans).<br>
WEEKDAY_APPR_PROCESS_START and HOUR_APPR_PROCESS_START: The day and hour the application was started. These features could capture certain behavioral patterns, such as applying for loans at certain times of the week when individuals might be more or less rational.<br>

## Document Flags and Credit Bureau Requests
FLAG_DOCUMENT_*: Flags indicating which documents the applicant provided. Missing important documents might indicate higher risk.<br>
AMT_REQ_CREDIT_BUREAU_*: The number of credit bureau inquiries over different time periods (hour, day, week, month, quarter, year). Frequent credit bureau checks might suggest financial distress or higher risk.<br>

## Behavioral and Social Features
OBS_30_CNT_SOCIAL_CIRCLE, DEF_30_CNT_SOCIAL_CIRCLE, OBS_60_CNT_SOCIAL_CIRCLE, DEF_60_CNT_SOCIAL_CIRCLE: These features count the number of observations of the applicant’s social circle, which might be proxies for peer influence and social stability. Defaults within their social circle may signal increased risk.<br>
DAYS_LAST_PHONE_CHANGE: The number of days since the applicant last changed their phone. Frequent phone changes could indicate instability or potential fraud.<br>

In [4]:
new_column_names = {
    'SK_ID_CURR': 'client_id',
    'TARGET': 'loan_status',
    'NAME_CONTRACT_TYPE': 'loan_type',
    'CODE_GENDER': 'gender',
    'FLAG_OWN_CAR': 'owns_car',
    'FLAG_OWN_REALTY': 'owns_property',
    'CNT_CHILDREN': 'num_children',
    'AMT_INCOME_TOTAL': 'total_income',
    'AMT_CREDIT': 'credit_amount',
    'AMT_ANNUITY': 'annuity_amount',
    'AMT_GOODS_PRICE': 'goods_price',
    'NAME_TYPE_SUITE': 'accompaniment_type',
    'NAME_INCOME_TYPE': 'income_type',
    'NAME_EDUCATION_TYPE': 'education_level',
    'NAME_FAMILY_STATUS': 'family_status',
    'NAME_HOUSING_TYPE': 'housing_type',
    'REGION_POPULATION_RELATIVE': 'region_population_relative',
    'DAYS_BIRTH': 'age_in_days',
    'DAYS_EMPLOYED': 'days_employed',
    'DAYS_REGISTRATION': 'days_registered',
    'DAYS_ID_PUBLISH': 'days_ID_published',
    'OWN_CAR_AGE': 'car_age',
    'FLAG_MOBIL': 'has_mobile',
    'FLAG_EMP_PHONE': 'has_work_phone',
    'FLAG_WORK_PHONE': 'has_work_phone_alt',
    'FLAG_CONT_MOBILE': 'has_contactable_mobile',
    'FLAG_PHONE': 'has_phone',
    'FLAG_EMAIL': 'has_email',
    'OCCUPATION_TYPE': 'occupation',
    'CNT_FAM_MEMBERS': 'family_size',
    'REGION_RATING_CLIENT': 'region_rating',
    'REGION_RATING_CLIENT_W_CITY': 'region_rating_with_city',
    'WEEKDAY_APPR_PROCESS_START': 'application_weekday',
    'HOUR_APPR_PROCESS_START': 'application_hour',
    'REG_REGION_NOT_LIVE_REGION': 'region_not_living',
    'REG_REGION_NOT_WORK_REGION': 'region_not_working',
    'LIVE_REGION_NOT_WORK_REGION': 'living_region_not_work_region',
    'REG_CITY_NOT_LIVE_CITY': 'city_not_living',
    'REG_CITY_NOT_WORK_CITY': 'city_not_working',
    'LIVE_CITY_NOT_WORK_CITY': 'living_city_not_work_city',
    'ORGANIZATION_TYPE': 'employer_type',
    'EXT_SOURCE_1': 'external_score_1',
    'EXT_SOURCE_2': 'external_score_2',
    'EXT_SOURCE_3': 'external_score_3',
    'APARTMENTS_AVG': 'apartment_average_size',
    'BASEMENTAREA_AVG': 'basement_average_area',
    'YEARS_BEGINEXPLUATATION_AVG': 'years_since_building_use',
    'YEARS_BUILD_AVG': 'building_age',
    'COMMONAREA_AVG': 'common_area_avg',
    'ELEVATORS_AVG': 'elevators_avg',
    'ENTRANCES_AVG': 'entrances_avg',
    'FLOORSMAX_AVG': 'max_floors_avg',
    'FLOORSMIN_AVG': 'min_floors_avg',
    'LANDAREA_AVG': 'land_area_avg',
    'LIVINGAPARTMENTS_AVG': 'living_apartments_avg',
    'LIVINGAREA_AVG': 'living_area_avg',
    'NONLIVINGAPARTMENTS_AVG': 'non_living_apartments_avg',
    'NONLIVINGAREA_AVG': 'non_living_area_avg',
    'APARTMENTS_MODE': 'apartment_mode',
    'BASEMENTAREA_MODE': 'basement_mode',
    'YEARS_BEGINEXPLUATATION_MODE': 'years_building_use_mode',
    'YEARS_BUILD_MODE': 'building_age_mode',
    'COMMONAREA_MODE': 'common_area_mode',
    'ELEVATORS_MODE': 'elevators_mode',
    'ENTRANCES_MODE': 'entrances_mode',
    'FLOORSMAX_MODE': 'max_floors_mode',
    'FLOORSMIN_MODE': 'min_floors_mode',
    'LANDAREA_MODE': 'land_area_mode',
    'LIVINGAPARTMENTS_MODE': 'living_apartments_mode',
    'LIVINGAREA_MODE': 'living_area_mode',
    'NONLIVINGAPARTMENTS_MODE': 'non_living_apartments_mode',
    'NONLIVINGAREA_MODE': 'non_living_area_mode',
    'APARTMENTS_MEDI': 'apartment_median_size',
    'BASEMENTAREA_MEDI': 'basement_median_area',
    'YEARS_BEGINEXPLUATATION_MEDI': 'years_building_use_median',
    'YEARS_BUILD_MEDI': 'building_age_median',
    'COMMONAREA_MEDI': 'common_area_median',
    'ELEVATORS_MEDI': 'elevators_median',
    'ENTRANCES_MEDI': 'entrances_median',
    'FLOORSMAX_MEDI': 'max_floors_median',
    'FLOORSMIN_MEDI': 'min_floors_median',
    'LANDAREA_MEDI': 'land_area_median',
    'LIVINGAPARTMENTS_MEDI': 'living_apartments_median',
    'LIVINGAREA_MEDI': 'living_area_median',
    'NONLIVINGAPARTMENTS_MEDI': 'non_living_apartments_median',
    'NONLIVINGAREA_MEDI': 'non_living_area_median',
    'FONDKAPREMONT_MODE': 'house_fund_mode',
    'HOUSETYPE_MODE': 'house_type_mode',
    'TOTALAREA_MODE': 'total_area_mode',
    'WALLSMATERIAL_MODE': 'walls_material_mode',
    'EMERGENCYSTATE_MODE': 'emergency_state_mode',
    'OBS_30_CNT_SOCIAL_CIRCLE': 'social_circle_obs_30',
    'DEF_30_CNT_SOCIAL_CIRCLE': 'social_circle_def_30',
    'OBS_60_CNT_SOCIAL_CIRCLE': 'social_circle_obs_60',
    'DEF_60_CNT_SOCIAL_CIRCLE': 'social_circle_def_60',
    'DAYS_LAST_PHONE_CHANGE': 'days_since_last_phone_change',
    'FLAG_DOCUMENT_2': 'flag_document_2',
    'FLAG_DOCUMENT_3': 'flag_document_3',
    'FLAG_DOCUMENT_4': 'flag_document_4',
    'FLAG_DOCUMENT_5': 'flag_document_5',
    'FLAG_DOCUMENT_6': 'flag_document_6',
    'FLAG_DOCUMENT_7': 'flag_document_7',
    'FLAG_DOCUMENT_8': 'flag_document_8',
    'FLAG_DOCUMENT_9': 'flag_document_9',
    'FLAG_DOCUMENT_10': 'flag_document_10',
    'FLAG_DOCUMENT_11': 'flag_document_11',
    'FLAG_DOCUMENT_12': 'flag_document_12',
    'FLAG_DOCUMENT_13': 'flag_document_13',
    'FLAG_DOCUMENT_14': 'flag_document_14',
    'FLAG_DOCUMENT_15': 'flag_document_15',
    'FLAG_DOCUMENT_16': 'flag_document_16',
    'FLAG_DOCUMENT_17': 'flag_document_17',
    'FLAG_DOCUMENT_18': 'flag_document_18',
    'FLAG_DOCUMENT_19': 'flag_document_19',
    'FLAG_DOCUMENT_20': 'flag_document_20',
    'FLAG_DOCUMENT_21': 'flag_document_21',
    'AMT_REQ_CREDIT_BUREAU_HOUR': 'credit_requests_hour',
    'AMT_REQ_CREDIT_BUREAU_DAY': 'credit_requests_day',
    'AMT_REQ_CREDIT_BUREAU_WEEK': 'credit_requests_week',
    'AMT_REQ_CREDIT_BUREAU_MON': 'credit_requests_month',
    'AMT_REQ_CREDIT_BUREAU_QRT': 'credit_requests_quarter',
    'AMT_REQ_CREDIT_BUREAU_YEAR': 'credit_requests_year'
}

### Apply renaming to the dataframe
loan_df.rename(columns=new_column_names, inplace=True)

## pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

missing_values = loan_df.isnull().sum() 
print(missing_values)

missing_columns = missing_values[missing_values > 0]
print(missing_columns)

In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

missing_values = loan_df.isnull().sum()
# print(missing_values)

missing_columns = missing_values[missing_values > 0]
# print(missing_columns)

## Data Cleaning

# Imputation Techniques
Mean/Mode Imputation: For numerical columns, I have filled in the missing values with the mean. This approach is simple but can reduce variance in the data.

In [6]:
loan_df['annuity_amount'] = loan_df.groupby(['total_income', 'income_type'])['annuity_amount'].transform(lambda x: x.ffill().bfill())

## Filling in the numerical columns using the 25th Percentile
This method helps maintain the original distribution of your data. It ensures that the filled values are still within the range of the existing data. Also, In contexts like risk assessment, using lower percentiles can be particularly appropriate, as it might better represent a cautious estimate of the missing values.


Also, the 25th percentile is less affected by outliers(extremely high or low values) compared to the average(mean)

In [7]:
loan_df['goods_price'] = loan_df['goods_price'].fillna(loan_df['goods_price'].quantile(0.25))
loan_df['car_age'] = loan_df['car_age'].fillna(loan_df['car_age'].quantile(0.25))
loan_df['credit_requests_hour'] = loan_df['credit_requests_hour'].fillna(loan_df['credit_requests_hour'].quantile(0.25))
loan_df['credit_requests_day'] = loan_df['credit_requests_day'].fillna(loan_df['credit_requests_day'].quantile(0.25))
loan_df['credit_requests_week'] = loan_df['credit_requests_week'].fillna(loan_df['credit_requests_week'].quantile(0.25))
loan_df['credit_requests_month'] = loan_df['credit_requests_month'].fillna(loan_df['credit_requests_month'].quantile(0.25))
loan_df['credit_requests_quarter'] = loan_df['credit_requests_quarter'].fillna(loan_df['credit_requests_quarter'].quantile(0.25))
loan_df['accompaniment_type'] = loan_df['accompaniment_type'].fillna(loan_df['accompaniment_type'].mode()[0])  # Using mode for categorical data
loan_df['entrances_median'] = loan_df['entrances_median'].fillna(loan_df['credit_requests_year'].quantile(0.25))
loan_df['living_area_median'] = loan_df['living_area_median'].fillna(loan_df['credit_requests_year'].quantile(0.25))
loan_df['days_since_last_phone_change'] = loan_df['days_since_last_phone_change'].fillna(loan_df['credit_requests_year'].quantile(0.25))
loan_df['external_score_1'] = loan_df['external_score_1'].fillna(loan_df['external_score_1'].quantile(0.25))
loan_df['external_score_2'] = loan_df['external_score_2'].fillna(loan_df['external_score_2'].quantile(0.25))
loan_df['external_score_3'] = loan_df['external_score_3'].fillna(loan_df['external_score_3'].quantile(0.25))
loan_df['social_circle_obs_30'] = loan_df['social_circle_obs_30'].fillna(loan_df['credit_requests_year'].quantile(0.25))
loan_df['social_circle_def_30'] = loan_df['social_circle_def_30'].fillna(loan_df['credit_requests_year'].quantile(0.25))
loan_df['social_circle_obs_60'] = loan_df['social_circle_obs_60'].fillna(loan_df['credit_requests_year'].quantile(0.25))
loan_df['social_circle_def_60'] = loan_df['social_circle_def_60'].fillna(loan_df['credit_requests_year'].quantile(0.25))
loan_df['non_living_apartments_median'] = loan_df['non_living_apartments_median'].fillna(loan_df['non_living_apartments_median'].quantile(0.25))
loan_df['land_area_mode'] = loan_df['land_area_mode'].fillna(loan_df['land_area_mode'].quantile(0.25))
loan_df['living_apartments_mode'] = loan_df['living_apartments_mode'].fillna(loan_df['living_apartments_mode'].quantile(0.25))
loan_df['living_area_mode'] = loan_df['living_area_mode'].fillna(loan_df['living_area_mode'].quantile(0.25))
loan_df['non_living_apartments_mode'] = loan_df['non_living_apartments_mode'].fillna(loan_df['non_living_apartments_mode'].quantile(0.25))
loan_df['non_living_area_mode'] = loan_df['non_living_area_mode'].fillna(loan_df['non_living_area_mode'].quantile(0.25))
loan_df['apartment_median_size'] = loan_df['apartment_median_size'].fillna(loan_df['apartment_median_size'].quantile(0.25))
loan_df['entrances_avg'] = loan_df['entrances_avg'].fillna(loan_df['entrances_avg'].quantile(0.25))
loan_df['basement_median_area'] = loan_df['basement_median_area'].fillna(loan_df['basement_median_area'].quantile(0.25))
loan_df['years_building_use_median'] = loan_df['years_building_use_median'].fillna(loan_df['years_building_use_median'].quantile(0.25))
loan_df['building_age_median'] = loan_df['building_age_median'].fillna(loan_df['building_age_median'].quantile(0.25))
loan_df['living_apartments_avg'] = loan_df['living_apartments_avg'].fillna(loan_df['living_apartments_avg'].quantile(0.25))
loan_df['non_living_apartments_avg'] = loan_df['non_living_apartments_avg'].fillna(loan_df['non_living_apartments_avg'].quantile(0.25))
loan_df['common_area_median'] = loan_df['common_area_median'].fillna(loan_df['common_area_median'].quantile(0.25))
loan_df['elevators_median'] = loan_df['elevators_median'].fillna(loan_df['elevators_median'].quantile(0.25))
loan_df['max_floors_median'] = loan_df['max_floors_median'].fillna(loan_df['max_floors_median'].quantile(0.25))
loan_df['min_floors_median'] = loan_df['min_floors_median'].fillna(loan_df['min_floors_median'].quantile(0.25))
loan_df['land_area_median'] = loan_df['land_area_median'].fillna(loan_df['land_area_median'].quantile(0.25))
loan_df['living_apartments_median'] = loan_df['living_apartments_median'].fillna(loan_df['living_apartments_median'].quantile(0.25))
loan_df['non_living_apartments_median'] = loan_df['non_living_apartments_median'].fillna(loan_df['non_living_apartments_median'].quantile(0.25))
loan_df['non_living_area_median'] = loan_df['non_living_area_median'].fillna(loan_df['non_living_area_median'].quantile(0.25))
loan_df['living_area_avg'] = loan_df['living_area_avg'].fillna(loan_df['living_area_avg'].quantile(0.25))
loan_df['family_size'] = loan_df['family_size'].fillna(loan_df['family_size'].quantile(0.25))
loan_df['non_living_area_avg'] = loan_df['non_living_area_avg'].fillna(loan_df['non_living_area_avg'].quantile(0.25))
loan_df['total_area_mode'] = loan_df['total_area_mode'].fillna(loan_df['total_area_mode'].quantile(0.25))
loan_df['annuity_amount'] = loan_df['annuity_amount'].fillna(loan_df['annuity_amount'].quantile(0.25))

In [8]:
# Fill missing values with the 25th percentile
loan_df['goods_price'] = loan_df['goods_price'].fillna(loan_df['goods_price'].quantile(0.25))

# Filling in the missing values in the categorical columns using the mode
Mode-based imputation is less likely to overfit the model compared to group-based imputation, as it doesn’t rely on specific group characteristics.<br>

It also ensures that the filled values are meaningful and relevant, while preserving the integrity of the dataset.

In [9]:
# Fill missing values with mode for categorical columns
loan_df['max_floors_mode'] = loan_df['max_floors_mode'].fillna(loan_df['max_floors_mode'].mode()[0])
loan_df['occupation'] = loan_df['occupation'].fillna(loan_df['occupation'].mode()[0])
loan_df['min_floors_mode'] = loan_df['min_floors_mode'].fillna(loan_df['min_floors_mode'].mode()[0])
loan_df['elevators_mode'] = loan_df['elevators_mode'].fillna(loan_df['elevators_mode'].mode()[0])
loan_df['apartment_average_size'] = loan_df['apartment_average_size'].fillna(loan_df['apartment_average_size'].mode()[0])
loan_df['basement_average_area'] = loan_df['basement_average_area'].fillna(loan_df['basement_average_area'].mode()[0])
loan_df['years_since_building_use'] = loan_df['years_since_building_use'].fillna(loan_df['years_since_building_use'].mode()[0])
loan_df['building_age'] = loan_df['building_age'].fillna(loan_df['building_age'].mode()[0])
loan_df['common_area_avg'] = loan_df['common_area_avg'].fillna(loan_df['common_area_avg'].mode()[0])
loan_df['elevators_avg'] = loan_df['elevators_avg'].fillna(loan_df['elevators_avg'].mode()[0])
loan_df['entrances_avg'] = loan_df['entrances_avg'].fillna(loan_df['entrances_avg'].mode()[0])
loan_df['max_floors_avg'] = loan_df['max_floors_avg'].fillna(loan_df['max_floors_avg'].mode()[0])
loan_df['min_floors_avg'] = loan_df['min_floors_avg'].fillna(loan_df['min_floors_avg'].mode()[0])
loan_df['land_area_avg'] = loan_df['land_area_avg'].fillna(loan_df['land_area_avg'].mode()[0])
loan_df['living_apartments_avg'] = loan_df['living_apartments_avg'].fillna(loan_df['living_apartments_avg'].mode()[0])
loan_df['non_living_apartments_avg'] = loan_df['non_living_apartments_avg'].fillna(loan_df['non_living_apartments_avg'].mode()[0])
loan_df['apartment_mode'] = loan_df['apartment_mode'].fillna(loan_df['apartment_mode'].mode()[0])
loan_df['basement_mode'] = loan_df['basement_mode'].fillna(loan_df['basement_mode'].mode()[0])
loan_df['years_building_use_mode'] = loan_df['years_building_use_mode'].fillna(loan_df['years_building_use_mode'].mode()[0])
loan_df['building_age_mode'] = loan_df['building_age_mode'].fillna(loan_df['building_age_mode'].mode()[0])
loan_df['house_fund_mode'] = loan_df['house_fund_mode'].fillna(loan_df['common_area_mode'].mode()[0])
loan_df['walls_material_mode'] = loan_df['walls_material_mode'].fillna(loan_df['house_type_mode'].mode()[0])
loan_df['emergency_state_mode'] = loan_df['emergency_state_mode'].fillna(loan_df['emergency_state_mode'].mode()[0])
loan_df['entrances_mode'] = loan_df['entrances_mode'].fillna(loan_df['entrances_mode'].mode()[0])
loan_df['house_type_mode'] = loan_df['house_type_mode'].fillna(loan_df['house_type_mode'].mode()[0])
loan_df['occupation'] = loan_df['occupation'].fillna(loan_df['occupation'].mode()[0])
loan_df['common_area_mode'] = loan_df['common_area_mode'].fillna(loan_df['common_area_mode'].mode()[0])
loan_df['credit_requests_year'] = loan_df['credit_requests_year'].fillna(loan_df['credit_requests_year'].mode()[0])

In [10]:
# Convert age from days to years (absolute values to handle negative days)
loan_df['employed_years'] = abs(loan_df['days_employed']) / 365.25

# Round age_years to the nearest whole number
loan_df['employed_years'] = loan_df['employed_years'].round(0).astype(int)

In [11]:
# Split the data into features and target
X = loan_df.drop(columns=['client_id', 'loan_status'])
y = loan_df['loan_status']

X: This is the input data (features) where I have dropped the columns that aren't needed for predictions: client_id (since it is unique, it does not show any trends) and loan_status (the outcome I am trying to predict).<br>
y: This is the target data (what I want to predict) which is the loan_status (approved or rejected).

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Feature Engineering
### Frequency Encoding
It helps mitigate overfitting, since it reduces the number of new columns created compared to one-hot encoding.<br>

For categorical features with many unique values (high cardinality), frequency encoding reduces dimensionality by not creating a separate binary feature for each category (as in one-hot encoding).<br>

Frequency encoding is a technique used in feature engineering to convert categorical variables into numerical representations based on the frequency of each category

In [13]:
# Identify categorical columns in the training set
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Apply frequency encoding for each categorical column in the training and test set
for col in categorical_cols:
    # Frequency encoding based on the training data
    freq_encoding = X_train[col].value_counts()
    X_train[col + '_freq'] = X_train[col].map(freq_encoding)
    X_test[col + '_freq'] = X_test[col].map(freq_encoding)  # Apply the same encoding to the test set

In [14]:
# Drop original categorical columns from both training and test sets after encoding
X_train.drop(columns=categorical_cols, inplace=True)
X_test.drop(columns=categorical_cols, inplace=True)

In [15]:
# List of numerical columns to standardize
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [16]:
# # Impute numerical columns with the mean (or median) for both training and test sets
# imputer = SimpleImputer(strategy='mean')
# X_train[numerical_cols] = imputer.fit_transform(X_train[numerical_cols])
# X_test[numerical_cols] = imputer.transform(X_test[numerical_cols])

In [17]:
# Standardize the features (optional but recommended)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Key Features for Loan Range Prediction:


### Credit Information and Loan Details:<br>

AMT_ANNUITY: Annuity of the loan (monthly payments).<br>
AMT_GOODS_PRICE: The price of the goods tied to the loan gives context about the loan purpose. For example, if this is related to agricultural equipment, it could reflect an investment that increases the farmer's income.<br>
AMT_INCOME_TOTAL: Total income of the applicant.<br>
AMT_REQ_CREDIT_BUREAU_YEAR: The number of credit inquiries over the past year is an indicator of how often the applicant seeks loans or credit. Frequent requests can suggest higher credit risk.<br>
AMT_REQ_CREDIT_BUREAU_MON: Number of credit inquiries in the last month.<br>
### Demographics and Employment Information:<br>

DAYS_BIRTH: Age of the applicant<br>
DAYS_EMPLOYED: Duration of employment. A longer employment history indicates stability and reliability.<br>
OCCUPATION_TYPE: Occupation of the applicant (e.g., farmer, laborer).<br>
NAME_INCOME_TYPE: Source of income (e.g., Working, State servant).A more stable source of income implies a lower risk.<br>
NAME_EDUCATION_TYPE: Education level (higher education can indicate more financial literacy).Higher levels of education tend to be correlated with higher incomes and financial literacy, which can lead to better loan repayment behavior.<br>
CNT_CHILDREN: Number of children supported (affects financial burden). More dependents typically mean higher expenses, which could influence repayment ability<br>
CNT_FAM_MEMBERS: Number of family members (financial dependency).<br>
NAME_FAMILY_STATUS: Marital status (can indicate stability).<br>
### External Scores:<br>

EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3: External credit scores. These are important predictors of the applicant's creditworthiness.<br>
### Property Ownership:<br>

FLAG_OWN_CAR: Whether the applicant owns a car (indicates assets).<br>
FLAG_OWN_REALTY: Whether the applicant owns real estate.<br>
OWN_CAR_AGE: Age of the applicant's car.<br>
### Risk Indicators:<br>

REGION_POPULATION_RELATIVE: The relative population of the region, which can influence income stability.<br>
DAYS_LAST_PHONE_CHANGE: Time since last phone change (longer durations can indicate stability).<br>
FLAG_WORK_PHONE, FLAG_PHONE, FLAG_EMAIL: Indicators of communication channels, which reflect reliability.<br>
### Client's Stability:

DAYS_ID_PUBLISH: Time since ID was published (stability in identification).<br>
DAYS_REGISTRATION: Time since client registered their current residence.<br>
REG_REGION_NOT_LIVE_REGION: Whether the current region of residence is different from the registration region (can indicate mobility or instability).<br>
REG_CITY_NOT_LIVE_CITY: Similar to above but at the city level.<br>

## Reasons for using RandomForest
RandomForest often achieves high accuracy in classification tasks due to its ability to capture complex relationships in data by averaging the predictions from multiple trees.<br>
Scalability: Random Forest can handle large datasets efficiently because it builds trees independently and in parallel. This parallel processing capability allows it to scale well as the dataset size increases.<br>
Instead of relying on one decision tree (which can get too focused on the training data), Random Forest uses many trees. This helps make better predictions on new data.<br>
Random Forest can show you which features (data columns) are most important for making predictions, helping you understand your data better

## SMOTE:

### Purpose: To balance the class distribution in the training set by generating synthetic examples for the minority class (class 1).
Result: The training data (X_train_res, y_train_res) will have an equal number of samples for both classes.
RandomForestClassifier:

### Parameters:
n_estimators: The number of decision trees in the forest.<br>
random_state: Ensures reproducibility of results by setting a fixed seed.<br>
class_weight: Adjusts the weights of classes, penalizing the model more for misclassifying the minority class to address class imbalance.<br>
Model Training:

The model is trained using the balanced training data, allowing it to learn better representations of both classes.<br>
Predictions:

After training, the model makes predictions on the test set, allowing for evaluation of its performance.<br>

In [18]:
def determine_loan_amount(credit_score, requested_loan, income_level, repayment_history, collateral):
    # Define maximum loan amounts based on credit score ranges
    if credit_score <= 30:
        max_loan = 100000  # Maximum loan for low credit score
    elif credit_score <= 60:
        max_loan = 500000  # Maximum loan for moderate credit score
    else:
        max_loan = 1000000  # Maximum loan for high credit score

    # Adjust loan amount based on other factors
    if repayment_history == 'good' and income_level > 200000:  # Example threshold
        max_loan *= 1.2  # Increase maximum loan by 20%
    
    if collateral > 500000:  # Example collateral value
        max_loan *= 1.1  # Increase maximum loan by 10%

    # Determine approved loan amount
    if requested_loan <= max_loan:
        return requested_loan  # Loan approved
    else:
        return max_loan  # Approve maximum allowable loan

# Example variable definitions
farmer_credit_score = 65            # Example credit score
loan_amount_requested = 800000       # Example requested loan amount
farmer_income = 300000               # Example income level
repayment_history = 'good'           # Example repayment history
collateral_value = 600000            # Example collateral value

# Example usage
approved_loan = determine_loan_amount(farmer_credit_score, loan_amount_requested, farmer_income, repayment_history, collateral_value)

# Print the approved loan amount
print(f"Approved Loan Amount: {approved_loan} RWF")

Approved Loan Amount: 800000 RWF


In [19]:
# loan_df.head(

# LogisticRegression
Since the relationship between the independent variables and the target variable is approximately linear, Logistic Regression can perform very well.

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardize the features (this should already be done)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Logistic Regression model with increased max_iter
model = LogisticRegression(max_iter=300)  # Increase the number of iterations

# Alternatively, try a different solver
# model = LogisticRegression(solver='liblinear', max_iter=300)

# Train the model
model.fit(X_train, y_train)

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(report)

Accuracy: 0.92
Confusion Matrix:
[[2762    6]
 [ 228    4]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2768
           1       0.40      0.02      0.03       232

    accuracy                           0.92      3000
   macro avg       0.66      0.51      0.50      3000
weighted avg       0.88      0.92      0.89      3000



In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Create the model
gb_model = GradientBoostingClassifier()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, 
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters found: ", best_params)

# Make predictions on the test set with the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
confusion_best = confusion_matrix(y_test, y_pred_best)
report_best = classification_report(y_test, y_pred_best)

# Print evaluation results for the best model
print(f"Best Model Accuracy: {accuracy_best:.2f}")
print("Best Model Confusion Matrix:")
print(confusion_best)
print("Best Model Classification Report:")
print(report_best)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Best parameters found:  {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 0.8}
Best Model Accuracy: 0.92
Best Model Confusion Matrix:
[[2766    2]
 [ 230    2]]
Best Model Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2768
           1       0.50      0.01      0.02       232

    accuracy                           0.92      3000
   macro avg       0.71      0.50      0.49      3000
weighted avg       0.89      0.92      0.89      3000



#  RandomForestClassifier
Random Forest typically provides high accuracy in classification tasks. It combines the predictions from multiple decision trees, which helps in reducing overfitting and improving overall performance.

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators

# Train the model
rf_model.fit(X_train, y_train)

In [24]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Fit SMOTE on the training data
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train the Random Forest model on the resampled data
rf_model.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)  # Set zero_division to control undefined metrics

# Print the evaluation results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(report)

Accuracy: 0.92
Confusion Matrix:
[[2743   25]
 [ 225    7]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      2768
           1       0.22      0.03      0.05       232

    accuracy                           0.92      3000
   macro avg       0.57      0.51      0.50      3000
weighted avg       0.87      0.92      0.89      3000



# LinearSVC
The relationship between your features and the target variable is approximately linear, Linear SVC can perform very well, providing high accuracy in classification tasks.

In [25]:
from sklearn.svm import SVC

# Initialize the Linear SVM model
svm_model = SVC(kernel='linear', random_state=42)  # Using linear kernel

# Train the model
svm_model.fit(X_train, y_train)

In [26]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)  # Set zero_division to control undefined metrics

# Print the evaluation results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(report)

Accuracy: 0.92
Confusion Matrix:
[[2768    0]
 [ 232    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2768
           1       0.00      0.00      0.00       232

    accuracy                           0.92      3000
   macro avg       0.46      0.50      0.48      3000
weighted avg       0.85      0.92      0.89      3000



In [27]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# GradientBoostingClassifier<br>
Gradient boosting tends to be less sensitive to outliers compared to other algorithms, making it suitable for financial datasets where outliers can exist.

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators

# Train the model
gb_model.fit(X_train, y_train)

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = gb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

# Print the evaluation results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(report)

Accuracy: 0.92
Confusion Matrix:
[[2752   16]
 [ 228    4]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      2768
           1       0.20      0.02      0.03       232

    accuracy                           0.92      3000
   macro avg       0.56      0.51      0.49      3000
weighted avg       0.87      0.92      0.89      3000



In [30]:
# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Adjusting class weights for RandomForestClassifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight={0: 1, 1: 30}
)

# Train the model
rf_model.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

In [31]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy: 0.9040
Confusion Matrix:
[[2702   66]
 [ 222   10]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      2768
           1       0.13      0.04      0.06       232

    accuracy                           0.90      3000
   macro avg       0.53      0.51      0.51      3000
weighted avg       0.86      0.90      0.88      3000



SMOTE (Synthetic Minority Over-sampling Technique) is applied to balance the classes in the training set. Since the dataset is imbalanced, this technique generates synthetic examples for the minority class (class 1) to help the model learn better.<br>
sampling_strategy=0.5: This specifies that the number of synthetic examples generated for the minority class will be half the size of the majority class. This means that class 1 will have half the number of samples as class 0.<br>
X_train_res, y_train_res = smote.fit_resample(X_train, y_train): This resamples the training data to create a more balanced dataset (X_train_res and y_train_res), which will be used for training the model.<br>

In [32]:

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(sampling_strategy=0.5, random_state=42)  # Adjust sampling_strategy to balance more effectively
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

rf_model = RandomForestClassifier(
    n_estimators=200,  # Increase the number of trees
    max_depth=10,  # Limit the depth to avoid overfitting
    random_state=42,
    class_weight={0: 1, 1: 5},  # Increase class weight for the minority class (class 1)
    max_features='sqrt',  # Randomly select a subset of features for each split
    min_samples_split=5,  # Increase min_samples_split to make splits more meaningful
    min_samples_leaf=2,  # Ensure each leaf has a minimum number of samples
    bootstrap=True  # Use bootstrap samples for creating trees
)

# Train the model
rf_model.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy: 0.7597
Confusion Matrix:
[[2155  613]
 [ 108  124]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.78      0.86      2768
           1       0.17      0.53      0.26       232

    accuracy                           0.76      3000
   macro avg       0.56      0.66      0.56      3000
weighted avg       0.89      0.76      0.81      3000



In [33]:
# from sklearn.ensemble import RandomForestClassifier

# # Define and train the model
# rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_clf.fit(X_train, y_train)

# # Now you can make predictions
# temp = pd.DataFrame(rf_clf.predict_proba(X_test).tolist(), columns=rf_clf.classes_)

In [34]:
# temp

Precision: The proportion of correct positive predictions (TP) out of all predictions for that class (TP + FP).<br>
Class 0: 0.92 precision means 92% of the time, when the model predicts class 0, it is correct.<br>
Class 1: 0.26 precision means only 26% of the time, when the model predicts class 1, it is correct.<br>
### Recall: The proportion of correctly predicted positive instances (TP) out of all actual instances for that class (TP + FN).<br>
Class 0: 1.00 recall means the model correctly identified all actual class 0 instances (100% recall).<br>
Class 1: 0.01 recall means the model only correctly identifies 1% of the actual class 1 instances.<br>
### F1-score: The harmonic mean of precision and recall. This is a more balanced measure between precision and recall, and is especially useful in imbalanced classes.<br>
Class 0: 0.96 F1-score means the model balances precision and recall well for class 0.
Class 1: 0.01 F1-score indicates very poor performance for class 1.<br>
### Support: The number of actual instances in each class.<br>
For class 0 (majority class), there are 84,806 instances.<br>
For class 1 (minority class), there are 7,448 instances.<br>
### 4. Macro Avg and Weighted Avg:<br>
Macro avg: This averages the precision, recall, and F1-score across all classes without considering class imbalance. It gives equal weight to both classes.<br>
Macro avg for precision: 0.59<br>
Macro avg for recall: 0.50<br>
Macro avg for F1-score: 0.48<br>
Weighted avg: This averages the precision, recall, and F1-score across all classes, but it takes into account the class distribution (i.e., it weighs more heavily the performance on the majority class).<br>
Weighted avg for precision: 0.87<br>
Weighted avg for recall: 0.92<br>
Weighted avg for F1-score: 0.88<br>

In [35]:
# Check for missing values in training and test data
# print(X_train_res.isnull().sum())
# print(X_test.isnull().sum())

#### GridSearchCV is a tool that helps you find the best settings (called hyperparameters) for a machine learning model by trying out different combinations of them.

GridSearchCV will:

Train the model with 100 trees and depth 10<br>
Train the model with 100 trees and depth 20<br>
Train the model with 100 trees and depth 30<br>
Train the model with 200 trees and depth 10<br>
And so on...

## Model Training using LogisticRegression
##### Prediction of Probabilities:

Logistic regression predicts the probability that a given input belongs to a certain class.
It uses the sigmoid function (also called the logistic function) to squash the output of a linear function into a value between 0 and 1.

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [37]:
# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Initialize Logistic Regression with more iterations
log_reg_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000)

# Train the Logistic Regression model
log_reg_model.fit(X_train_scaled, y_train_res)

# Make predictions on the test set
y_pred = log_reg_model.predict(X_test_scaled)

In [39]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy: 0.7000
Confusion Matrix:
[[1954  814]
 [  86  146]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.71      0.81      2768
           1       0.15      0.63      0.24       232

    accuracy                           0.70      3000
   macro avg       0.55      0.67      0.53      3000
weighted avg       0.90      0.70      0.77      3000



In [40]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Scale the training and test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Initialize Logistic Regression with more iterations and a different solver
log_reg_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=5000, solver='liblinear')

# Train the Logistic Regression model
log_reg_model.fit(X_train_scaled, y_train_res)

# Make predictions on the test set
y_pred = log_reg_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy: 0.7007
Confusion Matrix:
[[1956  812]
 [  86  146]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.71      0.81      2768
           1       0.15      0.63      0.25       232

    accuracy                           0.70      3000
   macro avg       0.56      0.67      0.53      3000
weighted avg       0.90      0.70      0.77      3000



## Model Training using LinearSVC
LinearSVC draws a straight line (or flat plane) that divides the data into two classes.
It is used when the data is easily separable by a line.
It tries to make the line as wide as possible while still getting the correct answers.

In [41]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [42]:
# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Initialize LinearSVC with balanced class weights
linear_svm_model = LinearSVC(random_state=42, class_weight='balanced', max_iter=10000)

What does rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced') do?

Here, you’re saying, "Let’s gather 100 friends (trees) to help make decisions!"
The random_state=42 makes sure they always make decisions the same way when you ask them.
The class_weight='balanced' tells them to pay extra attention to the blue toys, so they don’t ignore them just because there are fewer.

In [43]:
# Train the LinearSVC model
linear_svm_model.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred = linear_svm_model.predict(X_test)

In [44]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy: 0.7010
Confusion Matrix:
[[1955  813]
 [  84  148]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.71      0.81      2768
           1       0.15      0.64      0.25       232

    accuracy                           0.70      3000
   macro avg       0.56      0.67      0.53      3000
weighted avg       0.90      0.70      0.77      3000



In [45]:
# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Scale the data (important for SVM models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Define the LinearSVC with class weights and more iterations
# We'll use GridSearchCV for hyperparameter tuning
param_grid = {
    'C': [0.1, 0.5, 1, 5, 10],  # Experiment with C values
    'max_iter': [5000, 10000],  # Increase iterations if needed
    'class_weight': ['balanced', {0: 1, 1: 5}, {0: 1, 1: 10}]  # Different class weight strategies
}

In [46]:
# Initialize LinearSVC model
linear_svm_model = LinearSVC(random_state=42)

# Use StratifiedKFold cross-validation to maintain class balance across folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform GridSearchCV to tune hyperparameters
grid_search = GridSearchCV(estimator=linear_svm_model, param_grid=param_grid, 
                           cv=cv, scoring='f1', n_jobs=-1, verbose=1)

# Train the model using GridSearchCV
grid_search.fit(X_train_scaled, y_train_res)

# Get the best model from GridSearch
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

NameError: name 'StratifiedKFold' is not defined

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Best hyperparameters from GridSearchCV
print("\nBest hyperparameters:", grid_search.best_params_)

### Model Training using Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [None]:
# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [None]:
# Initialize GradientBoostingClassifier with default settings
gb_model = GradientBoostingClassifier(random_state=42)

# Train the GradientBoostingClassifier model
gb_model.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred = gb_model.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

We import HistGradientBoostingClassifier, which is a powerful machine learning model that can handle large datasets and is useful for both regression and classification problems. This model uses a technique called gradient boosting to combine multiple weak models (small decision trees) to make stronger predictions.

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train the HistGradientBoostingClassifier
hist_gb_model = HistGradientBoostingClassifier(random_state=42)
hist_gb_model.fit(X_train_res, y_train_res)

# Make predictions
y_pred = hist_gb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

In [None]:
import joblib

joblib.dump(model, 'model.pkl')

In [None]:
# Docker code
FROM python:3.8-slim
COPY model.pkl /app/model.pkl
COPY app.py /app/app.py
WORKDIR /app
RUN pip install -r requirements.txt
CMD ["python", "app.py"]