In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,RidgeCV,LassoCV,ElasticNetCV,BayesianRidge,SGDRegressor
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

In [2]:
X = pd.read_csv("X_Processed.csv")
Y = pd.read_csv("Y_Processed.csv")

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [4]:
X_train = X_train.sort_index(axis=1)
X_test = X_test.sort_index(axis=1)

In [5]:
y_train = y_train.values.ravel()

LINEAR_REGRESSION

In [6]:
LR_model = LinearRegression()
LR_model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [7]:
Y_predict = LR_model.predict(X_test)
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07892576887927068


RIDGE and LASSO REGRESSION

In [8]:
ridge_model = RidgeCV(alphas=np.logspace(-16, 16, 10))
ridge_model.fit(X_train,y_train)

0,1,2
,alphas,array([1.0000...00000000e+16])
,fit_intercept,True
,scoring,
,cv,
,gcv_mode,
,store_cv_results,False
,alpha_per_target,False


In [9]:
joblib.dump(ridge_model,"ridge_model.joblib")

['ridge_model.joblib']

In [10]:
Y_predict = ridge_model.predict(X_test)

In [11]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07892576753010988


In [12]:
lasso_cv = LassoCV(cv=5, random_state=42)
lasso_cv.fit(X_train, y_train)

0,1,2
,eps,0.001
,n_alphas,'deprecated'
,alphas,'warn'
,fit_intercept,True
,precompute,'auto'
,max_iter,1000
,tol,0.0001
,copy_X,True
,cv,5
,verbose,False


In [13]:
joblib.dump(lasso_cv,"lasso_model.joblib")

['lasso_model.joblib']

In [14]:
Y_predict = lasso_cv.predict(X_test)

In [15]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07893370422682808


Elastic Net Regression (Ridge + Lasso)

In [16]:
ER_model = ElasticNetCV(l1_ratio=[.1,.2,.3,.4,.5,.6,.7,.8,.99], cv=5,random_state=42,n_jobs=-1)
ER_model.fit(X_train, y_train)

0,1,2
,l1_ratio,"[0.1, 0.2, ...]"
,eps,0.001
,n_alphas,'deprecated'
,alphas,'warn'
,fit_intercept,True
,precompute,'auto'
,max_iter,1000
,tol,0.0001
,cv,5
,copy_X,True


In [17]:
joblib.dump(ER_model,"ER_model.joblib")

['ER_model.joblib']

In [18]:
Y_predict = ER_model.predict(X_test)

In [19]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07893372972029458


Bayesian Regression Model

In [20]:
BR_model = BayesianRidge()
BR_model.fit(X_train,y_train)

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True


In [21]:
joblib.dump(BR_model,"BR_model.joblib")

['BR_model.joblib']

In [22]:
Y_predict = ER_model.predict(X_test)

In [23]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07893372972029458


Stochastic Gradient Descent Regression

In [24]:
SGD_model = SGDRegressor(loss='epsilon_insensitive', epsilon=0.1)

In [25]:
joblib.dump(SGD_model,"SGD_model.joblib")

['SGD_model.joblib']

In [26]:
param_grid = {
    'alpha': [1e-4, 1e-3, 0.01, 0.1, 1.0],
    'learning_rate': ['adaptive', 'optimal', 'invscaling'],    
    'eta0': [0.001, 0.01, 0.1],
    'penalty': ['l2', 'l1', 'elasticnet']
}
search = GridSearchCV(SGD_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
search.fit(X_train, y_train)
SGD_model = search.best_estimator_

Fitting 5 folds for each of 135 candidates, totalling 675 fits


In [27]:
SGD_model.fit(X_train,y_train)

0,1,2
,loss,'epsilon_insensitive'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [28]:
Y_predict = SGD_model.predict(X_test)

In [29]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.08946307282709454


Decision Tree Regression

In [30]:
DT_model = DecisionTreeRegressor(max_depth=10)
DT_model.fit(X_train,y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [31]:
joblib.dump(DT_model,"DT_model.joblib")

['DT_model.joblib']

In [32]:
Y_predict = DT_model.predict(X_test)

In [33]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07503128942883155


Random Forest regression

In [34]:
RF_model = RandomForestRegressor(n_estimators=250, max_depth=11, random_state=42)
RF_model.fit(X_train,y_train)

0,1,2
,n_estimators,250
,criterion,'squared_error'
,max_depth,11
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
joblib.dump(RF_model,"RF_model.joblib")

['RF_model.joblib']

In [36]:
Y_predict = RF_model.predict(X_test)

In [37]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07364074451390075


In [38]:
print(f"DT Train Score: {DT_model.score(X_train, y_train)}")
print(f"RF Train Score: {RF_model.score(X_train, y_train)}")

DT Train Score: 0.5473440498066204
RF Train Score: 0.5612083818089932


XGBoost Regression

In [39]:
XGB_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.04,
    max_depth=5,
    subsample=0.8,        
    colsample_bytree=1.0, 
    random_state=42
)
XGB_model.fit(
    X_train, 
    y_train, 
    eval_set=[(X_test, y_test)], 
    verbose=False 
)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False


In [40]:
Y_predict = XGB_model.predict(X_test)

In [41]:
joblib.dump(XGB_model,"XGB_model.joblib")

['XGB_model.joblib']

In [42]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07241301983594894


In [43]:
LGB_model=lgb.LGBMRegressor(
    objective='regression',
    num_leaves=35,
    learning_rate=0.08,
    n_estimators=1000,
    random_state=42
)
LGB_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l2',
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1385
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 60
[LightGBM] [Info] Start training from score 0.799024
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[570]	valid_0's l2: 0.0720302


0,1,2
,boosting_type,'gbdt'
,num_leaves,35
,max_depth,-1
,learning_rate,0.08
,n_estimators,1000
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [44]:
Y_predict = LGB_model.predict(X_test)

In [45]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.07203021952795047


In [46]:
joblib.dump(LGB_model,"LGB_model.joblib")

['LGB_model.joblib']

Categorical Boosting

In [48]:
cat_features_indices= ['gender','marital_status','education_level','employment_status','loan_purpose','grade_subgrade']

In [None]:
CAT_model = CatBoostRegressor(
    iterations=2600,
    depth=6,
    learning_rate=0.2,
    loss_function='RMSE',
    verbose=100, 
    random_state=42
)
CAT_model.fit(
    X_train, y_train,
    cat_features=cat_features_indices,
    eval_set=(X_test, y_test),
    early_stopping_rounds=50
)

In [None]:
Y_predict = CAT_model.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, Y_predict)
print(f"Mean Squared Error: {mse}")

#Mean Squared Error: 0.07154084919662529


In [None]:
joblib.dump(CAT_model,"CAT_model.joblib")