In [9]:
pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Using cached lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [56]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [41]:
# Load the pre-filtered loan dataset into a DataFrame.
LOAN_CSV_PATH = '/home/student/filtered_columns_dataset.csv'
loan_df = pd.read_csv(LOAN_CSV_PATH)

In [17]:
# loan_df.head()

In [42]:
# Columns that are label-encoded to integers later in the notebook.
categorical_columns = [
    'contract_type',
    'gender',
    'owns_car',
    'owns_realty',
    'income_type',
    'education_type',
    'family_status',
]

In [28]:
# loan_df.isnull().sum()

In [43]:
# Impute missing annuity amounts from rows sharing the same
# (income_total, income_type) pair: forward-fill inside each group, then
# back-fill whatever is still missing at the start of the group.
annuity_by_income = loan_df.groupby(['income_total', 'income_type'])['annuity_amount']
loan_df['annuity_amount'] = annuity_by_income.transform(lambda grp: grp.ffill().bfill())

In [44]:
# Same group-wise imputation for days_last_phone_change: within each
# (income_total, income_type) group, forward-fill and then back-fill the gaps.
loan_df['days_last_phone_change'] = (
    loan_df
    .groupby(['income_total', 'income_type'])['days_last_phone_change']
    .transform(lambda grp: grp.ffill().bfill())
)

In [45]:
# Per-column averages of the three external-source scores.
mean_ext_source_1, mean_ext_source_2, mean_ext_source_3 = (
    loan_df[name].mean()
    for name in ('ext_source_1', 'ext_source_2', 'ext_source_3')
)

# Show the averages that will be used as fill values.
for label, value in (
    ("Mean of ext_source_1:", mean_ext_source_1),
    ("Mean of ext_source_2:", mean_ext_source_2),
    ("Mean of ext_source_3:", mean_ext_source_3),
):
    print(label, value)

Mean of ext_source_1: 0.5021298056566625
Mean of ext_source_2: 0.5143926741308462
Mean of ext_source_3: 0.5108529061799658


In [46]:
# Replace missing external-source scores with the column means computed earlier.
ext_source_fill = {
    'ext_source_1': mean_ext_source_1,
    'ext_source_2': mean_ext_source_2,
    'ext_source_3': mean_ext_source_3,
}
for name, fill_value in ext_source_fill.items():
    loan_df[name] = loan_df[name].fillna(fill_value)

# Sanity check: the remaining null count per column should be zero.
print(loan_df[list(ext_source_fill)].isnull().sum())

ext_source_1    0
ext_source_2    0
ext_source_3    0
dtype: int64


In [57]:
# Integer-encode each categorical column in place, keeping the fitted encoder
# per column so the mapping stays available afterwards.
label_encoders = {}
for name in categorical_columns:
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(loan_df[name])
    loan_df[name] = encoded
    label_encoders[name] = encoder

In [60]:
# loan_df.head()

In [58]:
# Separate the label column from the predictors.
y = loan_df['target']
X = loan_df.drop(columns=['target'])

In [48]:
# Continuous / count features listed for standardisation.
numeric_cols = [
    'income_total',
    'credit_amount',
    'annuity_amount',
    'children_count',
    'age_days',
    'employment_days',
    'region_population_relative',
    'days_last_phone_change',
    'family_members_count',
    'ext_source_1',
    'ext_source_2',
    'ext_source_3',
]

In [61]:
# Make sure every categorical feature column carries an integer dtype.
for name in categorical_columns:
    as_integers = X[name].astype(int)
    X[name] = as_integers

In [63]:
# loan_df.head()

In [64]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [65]:
# Standardize the numeric columns.
# The column list is named `numeric_cols` where it is defined in this notebook;
# the previous `numeric_columns` reference raised NameError on a fresh kernel.
# Work on explicit copies so the column assignment below does not write into
# frames derived from X by train_test_split (avoids pandas' copy-vs-view
# warning; presumably they are slices of X — confirm if memory is a concern).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to
# the held-out rows so no test-set statistics leak into the scaling.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [66]:

# Build the LightGBM classifier: seeded for reproducibility, with the
# is_unbalance flag switched on.
lgb_model = lgb.LGBMClassifier(
    is_unbalance=True,
    random_state=42,
)

In [67]:
# Fit the classifier on the training split (the fitted estimator is the cell's output).
lgb_model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 17412, number of negative: 197845
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007904 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2419
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080889 -> initscore=-2.430324
[LightGBM] [Info] Start training from score -2.430324


In [68]:
# Predict class labels for the held-out rows.
y_pred = lgb_model.predict(X=X_test)

In [69]:
# Score the predictions: overall accuracy plus the per-class
# precision / recall / F1 text report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

In [70]:
# Emit the accuracy line, the report heading and the report itself, one per line.
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    classification_rep,
    sep="\n",
)

Accuracy: 0.6927396102066035
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.69      0.81     84841
           1       0.16      0.68      0.26      7413

    accuracy                           0.69     92254
   macro avg       0.56      0.69      0.53     92254
weighted avg       0.90      0.69      0.76     92254

