In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [21]:
# Load the dataset (adjust the path as needed), then:
#   1. keep only rows whose FICO Score is at least 300
#   2. drop rows in which every column is NaN
df = (
    pd.read_csv('Data Scientist - Assignment.csv')
    .loc[lambda frame: frame['FICO Score'] >= 300]
    .dropna(how='all')
)

In [22]:
# Sanity check: display the (rows, columns) shape of the filtered frame.
df.shape

(52955, 23)

In [23]:
# Target column the models are trained to predict
target = 'FICO Score'

# Input columns used as model features
features = [
    'Loan To Value',
    'Branch ID',
    'Age',
    'Employment Type',
    'Number of Accounts',
    'Number of Active Accounts',
    'Number of Overdue Accounts',
    'Current Balance Amount',
    'Sanctioned Amount',
    'Disbursed Amount',
    'Instalment Amount',
    'Number of Accounts Opened Last 6 Months',
    'Number of Delinquencies Last 6 Months',
    'Average Account Age',
    'Number of Inquiries',
]

# Split the frame into the feature matrix and the target vector
X = df.loc[:, features]
y = df.loc[:, target]

In [24]:
# Categorical features that need one-hot encoding
categorical_features = ['Employment Type']

# Numeric features: everything in `features` that is not categorical.
# Built with a comprehension instead of a set difference so the column order
# follows `features` and is identical on every kernel start; set iteration
# order over strings varies with hash randomization, which would change the
# column order fed to the models between runs.
numeric_features = [col for col in features if col not in categorical_features]

In [25]:

# Numeric columns: fill missing values with the column mean, then rescale to 0-1
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode (categories unseen during fit are ignored at transform time)
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Preprocessing: route each group of columns through its own sub-pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

In [26]:

# Feature selection: keep the 10 features with the best univariate
# f_regression score against the target.
feature_selector = SelectKBest(score_func=f_regression, k=10)


def build_model_pipeline(regressor):
    """Build a preprocessing -> feature-selection -> regressor pipeline.

    Each pipeline receives its own unfitted copy of `preprocessor` and
    `feature_selector`. Pipeline.fit fits its steps in place, so embedding
    the same instances in several pipelines would make them all share (and
    overwrite) one set of fitted transformers.

    Parameters
    ----------
    regressor : estimator
        The final regression estimator of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline with steps 'preprocessor', 'feature_selection'
        and 'regressor'.
    """
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('feature_selection', clone(feature_selector)),
        ('regressor', regressor),
    ])


# Linear Regression
lr_model = build_model_pipeline(LinearRegression())

# Random Forest with 100 trees
rf_model = build_model_pipeline(RandomForestRegressor(n_estimators=100, random_state=42))

# Gradient Boosting
gb_model = build_model_pipeline(GradientBoostingRegressor(n_estimators=100, random_state=42))

# XGBoost
xgb_model = build_model_pipeline(xgb.XGBRegressor(n_estimators=100, random_state=42))

# LightGBM
lgb_model = build_model_pipeline(lgb.LGBMRegressor(n_estimators=100, random_state=42))

# Neural network (MLPRegressor) with two hidden layers of 100 and 50 units
nn_model = build_model_pipeline(
    MLPRegressor(hidden_layer_sizes=(100,50), max_iter=500, random_state=42)
)


In [None]:
# Split the dataset into training and test sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to compare, keyed by the name used in the printed report
models = {
    'Linear Regression': lr_model,
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model,
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
    'Neural Network': nn_model
}


def regression_metrics(y_true, y_pred):
    """Compute MAE, MSE, RMSE, MAPE (in percent) and R² for one set of predictions.

    MAPE divides by `y_true`, so targets equal to zero would yield inf/NaN.

    Returns
    -------
    dict
        Metric name -> value, with keys 'MAE', 'MSE', 'RMSE', 'MAPE', 'R2'.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100,
        'R2': r2_score(y_true, y_pred),
    }


def print_metrics(split_label, metrics):
    """Print one split's metrics (as returned by `regression_metrics`)."""
    print(f'{split_label}:')
    print(f"  MAE : {metrics['MAE']:.2f}")
    print(f"  MSE : {metrics['MSE']:.2f}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  MAPE: {metrics['MAPE']:.2f}%")
    print(f"  R²  : {metrics['R2']:.2f}")


# Fit every model and keep its predictions on both splits
predictions_train = {}
predictions_test = {}

for name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits right after fitting
    predictions_train[name] = model.predict(X_train)
    predictions_test[name] = model.predict(X_test)

# Report the performance of every model on the train and test sets
for name in models:
    print(f'=== {name} ===')
    print_metrics('Train Set', regression_metrics(y_train, predictions_train[name]))
    print_metrics('Test Set', regression_metrics(y_test, predictions_test[name]))
    print('\n')




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 783
[LightGBM] [Info] Number of data points in the train set: 42364, number of used features: 10
[LightGBM] [Info] Start training from score 655.872297




In [28]:

# Convert a FICO score into its credit category
def categorize_fico(score):
    """Return the credit category for a FICO score.

    Bands (lower bound inclusive): Poor 300-579, Fair 580-669, Good 670-739,
    Very Good 740-799, Excellent 800-850 (850 inclusive). Anything outside
    300-850, including NaN, is reported as 'Excluded'.
    """
    # Guard clause: out-of-range values (and NaN, which fails every comparison)
    if not (300 <= score <= 850):
        return 'Excluded'

    # Walk the bands from the highest lower bound down; first match wins
    bands = (
        (800, 'Excellent'),
        (740, 'Very Good'),
        (670, 'Good'),
        (580, 'Fair'),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return 'Poor'

# Collect the actual and predicted FICO scores on the test set for the
# Linear Regression model. The predictions come from `predictions_test`,
# the dict filled by the training loop; it lines up row-for-row with `y_test`.
lr_test = pd.DataFrame({
    'Actual FICO Score': y_test,  # true FICO Score from the dataset
    'Predicted FICO Score': predictions_test['Linear Regression']  # Linear Regression test-set predictions
})

# Convert the predicted scores into FICO credit categories
lr_test['Predicted Category'] = lr_test['Predicted FICO Score'].apply(categorize_fico)

# Show the first few rows
print(lr_test.head())


       Actual FICO Score  Predicted FICO Score Predicted Category
42289                762            708.189747               Good
25064                511            560.037568               Poor
51376                738            712.271674               Good
4652                 706            711.315032               Good
9536                 825            716.898530               Good
