# Model Training

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

In [3]:
# Load the cleaned credit-card dataset from the local Data directory.
df=pd.read_csv("./Data/CreditCardCleaned.csv")

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [5]:
# Drop the leftover 'Unnamed: 0' column that came in with the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Features: every column except the last one.
X=df.iloc[:,:-1]

In [7]:
# Display the feature frame (the notebook renders a truncated view).
X

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,...,208365,88004,31237,15980,8500,20000,5003,3047,5000,1000
29996,150000,1,3,2,43,-1,-1,-1,-1,0,...,3502,8979,5190,0,1837,3526,8998,129,0,0
29997,30000,1,2,2,37,4,3,2,-1,0,...,2758,20878,20582,19357,0,0,22000,4200,2000,3100
29998,80000,1,3,1,41,1,-1,0,0,0,...,76304,52774,11855,48944,85900,3409,1178,1926,52964,1804


In [8]:
# Target: the 'Y' column; selecting with a list keeps y a one-column DataFrame.
y=df[['Y']]

In [9]:
# Display the target frame (the notebook renders a truncated view).
y

Unnamed: 0,Y
0,1
1,1
2,0
3,0
4,0
...,...
29995,0
29996,0
29997,1
29998,1


In [17]:
# Split the feature names by column dtype.
# The check must look at each column's dtype: type(col) is the type of the
# column *label* and never equals the string 'object', so the old test put
# every column into `num` no matter what it contained.
cat = [col for col, dtype in X.dtypes.items() if dtype == 'object']
num = [col for col, dtype in X.dtypes.items() if dtype != 'object']

In [19]:
# Inspect the list of numerical column names.
num

['X1',
 'X2',
 'X3',
 'X4',
 'X5',
 'X6',
 'X7',
 'X8',
 'X9',
 'X10',
 'X11',
 'X12',
 'X13',
 'X14',
 'X15',
 'X16',
 'X17',
 'X18',
 'X19',
 'X20',
 'X21',
 'X22',
 'X23']

In [20]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
categorical_cols, numerical_cols = (
    X.select_dtypes(include='object').columns,
    X.select_dtypes(exclude='object').columns,
)

In [21]:
# Inspect both column groups side by side.
categorical_cols, numerical_cols

(Index([], dtype='object'),
 Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11',
        'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21',
        'X22', 'X23'],
       dtype='object'))

In [23]:
# Numerical pipeline: fill missing values with the column median, then
# standardise.
num_pipeline=Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

# Categorical pipeline: fill missing values with the most frequent value,
# ordinal-encode, then standardise.
# Two latent errors are fixed here; they were hidden only because the
# categorical column list is currently empty, so this branch is never fitted:
#   * the imputer strategy is spelled 'most_frequent' (with an underscore);
#   * categories=[] cannot match any non-empty set of columns, so the encoder
#     now infers the categories from the data ('auto').
cat_pipeline=Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder', OrdinalEncoder(categories='auto')),
        ('scaler', StandardScaler())
    ]
)

# Route each column group through its own pipeline. The transformer names
# become the prefixes of the output feature names.
preprocessor=ColumnTransformer([
    ('num_pipeline', num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [24]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=35,
)

In [25]:
# Preview the training features before preprocessing.
X_train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
26265,30000,2,2,1,46,1,2,0,0,0,...,25906,25494,24986,24915,0,1804,1200,500,737,0
23989,500000,1,3,2,30,-1,0,0,0,-1,...,5453,5540,6263,1831,4303,1096,1020,6315,1833,9649
3531,160000,2,3,2,41,-1,2,2,0,0,...,28328,29099,29724,30624,3000,0,1200,1200,1500,2000
27683,30000,2,2,2,22,2,0,0,0,0,...,27032,28070,29402,29854,1420,1762,1800,2089,1059,1123
21834,140000,2,2,1,27,0,0,0,0,0,...,132730,124689,105447,106704,5000,10000,5000,5000,5000,5000


In [26]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to the test split.
# Both results are wrapped in new DataFrames without an explicit index, so their
# row labels restart at 0 and no longer line up with y_train / y_test labels.
train_matrix = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
test_matrix = preprocessor.transform(X_test)

X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

In [27]:
# Preview the training features after preprocessing.
X_train.head()

Unnamed: 0,num_pipeline__X1,num_pipeline__X2,num_pipeline__X3,num_pipeline__X4,num_pipeline__X5,num_pipeline__X6,num_pipeline__X7,num_pipeline__X8,num_pipeline__X9,num_pipeline__X10,...,num_pipeline__X14,num_pipeline__X15,num_pipeline__X16,num_pipeline__X17,num_pipeline__X18,num_pipeline__X19,num_pipeline__X20,num_pipeline__X21,num_pipeline__X22,num_pipeline__X23
0,-1.052491,0.811887,0.183753,-1.059629,1.135714,0.903906,1.776998,0.134715,0.183017,0.232774,...,-0.305562,-0.278671,-0.254652,-0.237762,-0.347559,-0.176429,-0.221052,-0.274411,-0.261381,-0.294612
1,2.554839,-1.231698,1.45435,0.857508,-0.594819,-0.881125,0.106177,0.134715,0.183017,-0.649422,...,-0.59611,-0.583823,-0.559336,-0.620323,-0.087547,-0.206229,-0.230601,0.0845,-0.19232,0.258083
2,-0.054719,0.811887,1.45435,0.857508,0.594922,-0.881125,1.776998,1.807198,0.183017,0.232774,...,-0.271156,-0.223541,-0.177549,-0.143149,-0.166282,-0.25236,-0.221052,-0.231206,-0.213303,-0.180052
3,-1.052491,0.811887,0.183753,0.857508,-1.460086,1.796421,0.106177,0.134715,0.183017,0.232774,...,-0.289566,-0.239277,-0.182789,-0.15591,-0.261754,-0.178197,-0.189224,-0.176336,-0.241091,-0.230287
4,-0.208222,0.811887,0.183753,-1.059629,-0.919294,0.01139,0.106177,0.134715,0.183017,0.232774,...,1.211944,1.238296,1.05471,1.117691,-0.04543,0.168545,-0.019471,0.003336,0.007237,-0.008212


In [16]:
# Fit a plain linear-regression baseline on the preprocessed training features.
# NOTE(review): the Y values shown earlier in this notebook are 0/1, so this
# looks like a classification target being fit with a regressor — confirm this
# is intended.
regression=LinearRegression()
regression.fit(X_train,y_train)

In [17]:
# Coefficients of the fitted linear model.
regression.coef_

array([[-0.01129964, -0.00617859, -0.01228883, -0.0127792 ,  0.01430012,
         0.10856019,  0.02579982,  0.01241877,  0.00631333,  0.00524547,
         0.00046024, -0.04637913,  0.01838447, -0.01045494,  0.00095486,
        -0.00265562,  0.00515394, -0.0123925 , -0.00332424, -0.00050695,
        -0.0013787 , -0.00509286, -0.00358826]])

In [18]:
# Intercept of the fitted linear model.
regression.intercept_

array([0.22004762])

In [19]:
def evaluate_model(true,predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R^2 score, in that order.
    """
    mae=mean_absolute_error(true,predicted)
    # Compute the MSE once and derive the RMSE from it rather than calling
    # mean_squared_error a second time.
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    r2_square=r2_score(true,predicted)
    return mae, rmse, r2_square

In [20]:
# Candidate linear models, all with default hyperparameters.
models={
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'ElasticNet':ElasticNet()
}

# Parallel lists: model names and their R^2 scores, in evaluation order.
model_list=[]
r2_list=[]

# Iterate over name/estimator pairs directly instead of indexing into freshly
# built key/value lists on every pass.
for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Metrics are computed on the held-out test split.
    y_pred=model.predict(X_test)
    mae,rmse,r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)
    # NOTE(review): the label says "Training" but the numbers below are
    # test-set metrics; the text is kept as-is so saved outputs still match.
    print('Model Training Performance')
    print('RMSE:',rmse)
    print('MAE:',mae)
    print('R2 Score:',r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 0.39229943758794383
MAE: 0.30882936961380597
R2 Score: 11.431573525849048


Ridge
Model Training Performance
RMSE: 0.3922992493118276
MAE: 0.30882959660441095
R2 Score: 11.43165853904945


Lasso
Model Training Performance
RMSE: 0.41686617717797775
MAE: 0.3454040740740741
R2 Score: -0.0084916716144523


ElasticNet
Model Training Performance
RMSE: 0.41686617717797775
MAE: 0.3454040740740741
R2 Score: -0.0084916716144523




In [21]:
# Names of the evaluated models, in the order they were appended.
model_list

['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet']