In [138]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier

In [139]:
# Read the raw application table; later cells select a subset of its columns.
application_csv_path = '/home/student/application_train.csv'
loan_df = pd.read_csv(application_csv_path)

# Personal and Demographic Information
CODE_GENDER: Gender of the applicant. This feature can help capture behavioral differences that may indirectly influence credit risk (but be mindful of fairness in using this variable).<br>
FLAG_OWN_CAR and FLAG_OWN_REALTY: Indicates whether the applicant owns a car or real estate. Ownership of assets typically correlates with lower credit risk because these can serve as collateral or demonstrate financial stability.<br>
CNT_CHILDREN: Number of children in the household. More dependents can lead to higher financial obligations, potentially increasing default risk.<br>
DAYS_BIRTH: The age of the applicant (in days). Age can impact credit risk, with younger individuals often seen as higher risk, but it could also correlate with different financial behaviors.<br>
OCCUPATION_TYPE: This reflects the applicant's profession. Certain professions are more stable (e.g., government jobs) and could indicate lower risk compared to more unstable occupations.<br>
NAME_EDUCATION_TYPE: Education level can affect loan decisions since higher education is often associated with better job stability and income.<br>
NAME_FAMILY_STATUS: Marital status might influence financial stability, with married individuals often seen as more stable.<br>
CNT_FAM_MEMBERS: Number of family members, similar to children count, indicates financial dependency, which might increase credit risk.<br>

# Income and Loan Details
AMT_INCOME_TOTAL: Total annual income of the applicant. Higher income typically implies greater ability to repay the loan.<br>
AMT_CREDIT: Total amount of credit requested. A larger requested loan amount relative to income might increase risk.<br>
AMT_ANNUITY: The annual loan repayment amount. The proportion of annuity to income (AMT_ANNUITY/AMT_INCOME_TOTAL) can indicate loan affordability.<br>
AMT_GOODS_PRICE: The price of the goods for which the loan is taken (e.g., equipment or seeds for farmers). This may help assess whether the loan is reasonable compared to income and other factors.<br>

# Employment Information
DAYS_EMPLOYED: How long the applicant has been employed. Longer employment periods are usually associated with greater stability and a lower likelihood of default.<br>
FLAG_EMP_PHONE: Indicates whether the applicant provided a work phone. Providing more contact details can be an indicator of transparency and stability.<br>
FLAG_WORK_PHONE, FLAG_MOBIL, FLAG_PHONE, FLAG_EMAIL: These flags indicate whether the applicant provided various forms of contact. Availability of contact information might help reduce risk, as the bank can reach the applicant more easily.<br>


# External Scores
EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3: External credit scores or sources. These scores are likely provided by credit bureaus or other third-party sources and are highly relevant for predicting creditworthiness.

# Housing and Property Features
NAME_HOUSING_TYPE: Type of housing (e.g., apartment, house). Certain housing types could be associated with more financial stability.<br>
APARTMENTS_AVG, FLOORSMAX_AVG, YEARS_BUILD_AVG: These features describe the condition or characteristics of the applicant’s property. Newer or more valuable property can indicate better financial security.<br>
OWN_CAR_AGE: Age of the applicant's car. A newer car might be a proxy for financial status.<br>
LANDAREA_AVG, LIVINGAREA_AVG, COMMONAREA_AVG: Average sizes of various areas of the property. Larger or better-maintained properties can imply higher wealth and lower credit risk.<br>


# Regional and Population Features
REGION_POPULATION_RELATIVE: This reflects the relative population of the region where the applicant lives. Living in a densely populated region might impact the economic opportunities available and influence loan decisions.<br>
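REG_REGION_NOT_LIVE_REGION, REG_CITY_NOT_LIVE_CITY: Flags indicating whether the applicant's permanent (registered) address differs from their contact (living) address, at the region level and the city level respectively. A mismatch might point to a recent move or lower residential stability, which could affect credit risk.<br>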


# Loan Application Information
NAME_CONTRACT_TYPE: The type of loan contract (cash loans, revolving loans). Different loan types might have different risk profiles (e.g., revolving credit is generally riskier than installment loans).<br>
WEEKDAY_APPR_PROCESS_START and HOUR_APPR_PROCESS_START: The day and hour the application was started. These features could capture certain behavioral patterns, such as applying for loans at certain times of the week when individuals might be more or less rational.<br>

# Document Flags and Credit Bureau Requests
FLAG_DOCUMENT_*: Flags indicating which documents the applicant provided. Missing important documents might indicate higher risk.<br>
AMT_REQ_CREDIT_BUREAU_*: The number of credit bureau inquiries over different time periods (hour, day, week, month, quarter, year). Frequent credit bureau checks might suggest financial distress or higher risk.<br>

# Behavioral and Social Features
OBS_30_CNT_SOCIAL_CIRCLE, DEF_30_CNT_SOCIAL_CIRCLE, OBS_60_CNT_SOCIAL_CIRCLE, DEF_60_CNT_SOCIAL_CIRCLE: These features count the number of observations of the applicant’s social circle, which might be proxies for peer influence and social stability. Defaults within their social circle may signal increased risk.<br>
DAYS_LAST_PHONE_CHANGE: The number of days since the applicant last changed their phone. Frequent phone changes could indicate instability or potential fraud.<br>

In [140]:
# Raw column names retained for modelling, label first.
# The order here is the column order of the filtered frame.
columns_to_keep = [
    'TARGET',
    'NAME_CONTRACT_TYPE',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'REGION_POPULATION_RELATIVE',
    'DAYS_LAST_PHONE_CHANGE',
    'CNT_FAM_MEMBERS',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
]

In [141]:
# Keep every row but only the columns listed in columns_to_keep.
df_filtered = loan_df.loc[:, columns_to_keep]

In [150]:
# df_filtered.head()

In [142]:
# Raw dataset column name -> short snake_case name used by the rest of the notebook.
rename_columns = dict([
    ('TARGET', 'target'),
    ('NAME_CONTRACT_TYPE', 'contract_type'),
    ('AMT_INCOME_TOTAL', 'income_total'),
    ('AMT_CREDIT', 'credit_amount'),
    ('AMT_ANNUITY', 'annuity_amount'),
    ('CODE_GENDER', 'gender'),
    ('FLAG_OWN_CAR', 'owns_car'),
    ('FLAG_OWN_REALTY', 'owns_realty'),
    ('CNT_CHILDREN', 'children_count'),
    ('NAME_INCOME_TYPE', 'income_type'),
    ('NAME_EDUCATION_TYPE', 'education_type'),
    ('NAME_FAMILY_STATUS', 'family_status'),
    ('DAYS_BIRTH', 'age_days'),
    ('DAYS_EMPLOYED', 'employment_days'),
    ('REGION_POPULATION_RELATIVE', 'region_population_relative'),
    ('DAYS_LAST_PHONE_CHANGE', 'days_last_phone_change'),
    ('CNT_FAM_MEMBERS', 'family_members_count'),
    ('EXT_SOURCE_1', 'ext_source_1'),
    ('EXT_SOURCE_2', 'ext_source_2'),
    ('EXT_SOURCE_3', 'ext_source_3'),
])

# rename() without inplace returns a new, independent DataFrame, so df_filtered
# keeps its raw column names and later writes to the copy raise no
# SettingWithCopyWarning.
df_filtered_copy = df_filtered.rename(columns=rename_columns)

# Show the resulting column names as a sanity check.
print(df_filtered_copy.columns)

Index(['target', 'contract_type', 'income_total', 'credit_amount',
       'annuity_amount', 'gender', 'owns_car', 'owns_realty', 'children_count',
       'income_type', 'education_type', 'family_status', 'age_days',
       'employment_days', 'region_population_relative',
       'days_last_phone_change', 'family_members_count', 'ext_source_1',
       'ext_source_2', 'ext_source_3'],
      dtype='object')


In [143]:
# Derive age in whole years from the day count: take the magnitude (the day
# counts may be negative), divide by the mean year length, round to the
# nearest integer.
age_in_years = df_filtered_copy['age_days'].abs().div(365.25)
df_filtered_copy['age_years'] = age_in_years.round().astype(int)

In [43]:
# df_filtered_copy.head

In [144]:
# Snapshot the selected/renamed columns (plus age_years) to disk, without the index.
filtered_csv_name = 'filtered_columns_dataset.csv'
df_filtered_copy.to_csv(filtered_csv_name, index=False)

In [40]:
# df_filtered.head()

In [145]:
# Check for missing values in each column
missing_values = df_filtered_copy.isnull().sum()

# Filter and display only columns that have missing values
missing_columns = missing_values[missing_values > 0]

# Use `to_string()` to display the full output without truncation
print(missing_columns.to_string())

annuity_amount                12
days_last_phone_change         1
family_members_count           2
ext_source_1              173378
ext_source_2                 660
ext_source_3               60965


# Data Cleaning

In [110]:
# Fill missing annuity_amount from a neighbouring row in the same
# (income_total, income_type) group: forward fill first, then backward fill.
# A group with no observed annuity at all would stay NaN after that, so fall
# back to the median of the observed values to guarantee the column is complete.
annuity_observed_median = df_filtered_copy['annuity_amount'].median()
df_filtered_copy['annuity_amount'] = (
    df_filtered_copy
    .groupby(['income_total', 'income_type'])['annuity_amount']
    .transform(lambda x: x.ffill().bfill())
    .fillna(annuity_observed_median)
)

In [111]:
# Fill missing days_last_phone_change from a neighbouring row in the same
# (income_total, income_type) group: forward fill first, then backward fill.
# A group with no observed value at all would stay NaN after that, so fall
# back to the median of the observed values to guarantee the column is complete.
phone_change_observed_median = df_filtered_copy['days_last_phone_change'].median()
df_filtered_copy['days_last_phone_change'] = (
    df_filtered_copy
    .groupby(['income_total', 'income_type'])['days_last_phone_change']
    .transform(lambda x: x.ffill().bfill())
    .fillna(phone_change_observed_median)
)

In [112]:
# Opt in to pandas' future "no silent downcasting" behaviour before filling.
pd.set_option('future.no_silent_downcasting', True)

# Within each (family_status, children_count) group, forward-fill and then
# backward-fill missing family_members_count values.
family_size_by_group = df_filtered_copy.groupby(['family_status', 'children_count'])['family_members_count']
df_filtered_copy['family_members_count'] = family_size_by_group.transform(lambda grp: grp.ffill().bfill())

In [67]:
 # df_filtered_copy.isnull().sum()

In [113]:
# Replace missing ext_source_1 scores with the mean of the observed scores.
ext_source_1_mean = df_filtered_copy['ext_source_1'].mean()
df_filtered_copy['ext_source_1'] = df_filtered_copy['ext_source_1'].fillna(ext_source_1_mean)

# Number of missing values left in the column (expected: 0).
print(df_filtered_copy['ext_source_1'].isna().sum())

0


In [114]:
# Calculate the mean for the specified columns
mean_ext_source_1 = df_filtered_copy['ext_source_1'].mean()
mean_ext_source_2 = df_filtered_copy['ext_source_2'].mean()
mean_ext_source_3 = df_filtered_copy['ext_source_3'].mean()

# Display the calculated means
print("Mean of ext_source_1:", mean_ext_source_1)
print("Mean of ext_source_2:", mean_ext_source_2)
print("Mean of ext_source_3:", mean_ext_source_3)

Mean of ext_source_1: 0.5021298056566624
Mean of ext_source_2: 0.5143926741308462
Mean of ext_source_3: 0.5108529061799658


In [116]:
# Fill missing values with the calculated means
df_filtered_copy['ext_source_1'] = df_filtered_copy['ext_source_1'].fillna(mean_ext_source_1)
df_filtered_copy['ext_source_2'] = df_filtered_copy['ext_source_2'].fillna(mean_ext_source_2)
df_filtered_copy['ext_source_3'] = df_filtered_copy['ext_source_3'].fillna(mean_ext_source_3)

# Check to make sure there are no more missing values in the columns
print(df_filtered_copy[['ext_source_1', 'ext_source_2', 'ext_source_3']].isnull().sum())

ext_source_1    0
ext_source_2    0
ext_source_3    0
dtype: int64


In [132]:
# Check if the column exists and if there are missing values
if 'family_members_count' in df_filtered_copy.columns:
    missing_values = df_filtered_copy['family_members_count'].isnull().sum()
    print(f"Missing values before imputation: {missing_values}")
    
    if missing_values > 0:
        # Create an imputer for the family_members_count column
        imputer = SimpleImputer(strategy='mean')  # Use 'median' or 'most_frequent' as needed

        # Fit and transform the imputer on the specific column
        df_filtered_copy['family_members_count'] = imputer.fit_transform(df_filtered_copy[['family_members_count']])

        # Verify that there are no missing values left in that column
        missing_values_after_imputation = df_filtered_copy['family_members_count'].isnull().sum()
        print(f"Missing values after imputation: {missing_values_after_imputation}")
    else:
        print("No missing values found in family_members_count.")
else:
    print("Column 'family_members_count' does not exist in the DataFrame.")

Missing values before imputation: 0
No missing values found in family_members_count.


# Feature Engineering - Binning

In [None]:
# 1. Binning income_total into Low, Medium, High based on percentiles
# 1. Binning income_total into Low, Medium, High based on percentiles (terciles).
# The renamed column 'income_total' exists only on df_filtered_copy; loan_df
# still carries the raw AMT_INCOME_TOTAL name, so indexing it here raised KeyError.
df_filtered_copy['income_bin'] = pd.qcut(df_filtered_copy['income_total'], q=3, labels=['Low', 'Medium', 'High'])

In [None]:
# 2. Binning age_years into Young, Middle-aged, Senior
# 2. Binning age_years into Young (0, 30], Middle-aged (30, 50], Senior (50, inf).
# 'age_years' is a derived column that exists only on df_filtered_copy, not on
# loan_df, so the bin must be computed from (and stored on) df_filtered_copy.
df_filtered_copy['age_bin'] = pd.cut(df_filtered_copy['age_years'], bins=[0, 30, 50, float('inf')], labels=['Young', 'Middle-aged', 'Senior'])

In [None]:
# 3. Binning credit_amount into categories based on quartiles
# 3. Binning credit_amount into four categories based on quartiles.
# The renamed column 'credit_amount' exists only on df_filtered_copy; loan_df
# still carries the raw AMT_CREDIT name, so indexing it here raised KeyError.
df_filtered_copy['credit_bin'] = pd.qcut(df_filtered_copy['credit_amount'], q=4, labels=['Very Low', 'Low', 'Medium', 'High'])

In [None]:
# 4. Binning annuity_amount into categories based on quartiles
# 4. Binning annuity_amount into four categories based on quartiles.
# The renamed column 'annuity_amount' exists only on df_filtered_copy; loan_df
# still carries the raw AMT_ANNUITY name, so indexing it here raised KeyError.
# Rows removed by dropna() are re-aligned by index on assignment and end up
# with a missing annuity_bin.
df_filtered_copy['annuity_bin'] = pd.qcut(df_filtered_copy['annuity_amount'].dropna(), q=4, labels=['Very Low', 'Low', 'Medium', 'High'])

In [None]:
# 5. Binning employment_days into "Recently employed", "Long-term employed", etc.
# 5. Binning employment_days into Long-term (<= -1000), Mid-term (-1000, -500]
# and Recently employed (-500, 0].
# The renamed column 'employment_days' exists only on df_filtered_copy; loan_df
# still carries the raw DAYS_EMPLOYED name, so indexing it here raised KeyError.
# NOTE(review): pd.cut leaves values outside the outermost edges as NaN, so any
# positive employment_days would be unbinned — confirm whether the data has any.
df_filtered_copy['employment_bin'] = pd.cut(df_filtered_copy['employment_days'], bins=[-float('inf'), -1000, -500, 0], labels=['Long-term', 'Mid-term', 'Recently employed'])

In [144]:
# df_filtered_copy.isnull().sum()

In [133]:
# Columns that are label-encoded before modelling.
categorical_cols = [
    'contract_type',
    'gender',
    'owns_car',
    'owns_realty',
    'income_type',
    'education_type',
    'family_status',
]

In [134]:
# Columns routed through the numeric preprocessing pipeline.
numeric_cols = [
    'income_total',
    'credit_amount',
    'annuity_amount',
    'children_count',
    'age_days',
    'employment_days',
    'region_population_relative',
    'days_last_phone_change',
    'family_members_count',
    'ext_source_1',
    'ext_source_2',
    'ext_source_3',
]

In [135]:
# One LabelEncoder per categorical column, kept by column name so the integer
# codes can be mapped back to the original categories later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Fit each encoder on its column and overwrite the column with the integer codes.
for col, le in label_encoders.items():
    df_filtered_copy[col] = le.fit_transform(df_filtered_copy[col])

Numeric transformer<br>
A pipeline that first imputes missing values using the mean and then standardizes the numeric features.

Scaler<br>
Makes sure that different scales (like income vs. age) are treated fairly. For example, income values could be much larger than age values, and scaling helps balance that.

In [121]:
# Numeric preprocessing: mean-impute any remaining gaps, then standardize.
# The imputer is fitted inside the pipeline (on training folds only) and keeps
# the classifier from failing on NaNs that earlier fill steps did not cover.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

A ColumnTransformer that applies the numeric_transformer to the specified numeric_cols. This allows different preprocessing for different types of data in the same DataFrame.

In [122]:
# Combine numeric transformer (categorical columns are now label encoded, no need for a separate transformer)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols)
    ])

X: The feature set, created by dropping the target variable from the DataFrame.<br>
y: The target variable, which is the column named 'target'.

In [123]:
# Split the data into features and target
X = df_filtered_copy.drop('target', axis=1)
y = df_filtered_copy['target']

# Model Training

In [124]:
# Define the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)

In [125]:
# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', log_reg)
])

In [126]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'saga'],
}

In [127]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model: The `pipeline` object, which combines the preprocessing steps and the logistic regression classifier. The solver (liblinear or saga) and the regularization strength C are not fixed up front; they are selected by the grid search over `param_grid`. max_iter=1000 allows for sufficient iterations during training.

In [128]:
# Set up GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [129]:
# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'classifier__C': 0.001, 'classifier__solver': 'liblinear'}


In [131]:
# Make predictions using the best estimator
y_pred = grid_search.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_rep}")

Accuracy: 0.9195161211648212
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56554
           1       0.47      0.00      0.00      4949

    accuracy                           0.92     61503
   macro avg       0.69      0.50      0.48     61503
weighted avg       0.88      0.92      0.88     61503

