### Loading the data

In [16]:
import pandas as pd
df = pd.read_csv("datasets/cc_approvals.data", header=None)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


### Preprocessing

The data is confusing but we find data with numerical and strings, so first we do some preprocessing

In [17]:
print(df.describe())
print(df.info())

# print(df.tail(20))

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-

###### inspecting the data we find some missing values.

#### Missing Values

In [18]:
import numpy as np
df = df.replace("?",np.NaN)

df.fillna(df.mean(),inplace=True)

print(df.isnull().sum())

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


###### now only the cols with non numeric values have missing data

In [19]:
for col in df:
    if df[col].dtypes=='object':
        df = df.fillna(df[col].value_counts().index[0])
        
        
        
print(df.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


##### converting the non numeric data to numeric

In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in df:
    if df[col].dtype=='object':
        df[col] = le.fit_transform(df[col])
        
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,156,0.0,2,1,13,8,1.25,1,1,1,0,0,68,0,0
1,0,328,4.46,2,1,11,4,3.04,1,1,6,0,0,11,560,0
2,0,89,0.5,2,1,11,4,1.5,1,0,0,0,0,96,824,0
3,1,125,1.54,2,1,13,8,3.75,1,1,5,1,0,31,3,0
4,1,43,5.625,2,1,13,8,1.71,1,0,0,0,2,37,0,0


##### Preprocessing

In [30]:
# df = df.drop([11,13],axis=1)
# df.head()
features = df.drop([15],axis=1)
labels = df[15]

In [31]:
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(features,labels,test_size=33,random_state=42 )

###### since values in the cols vary a lot , so we rescale the data in 0-1

In [32]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
Transformed_train_data = scaler.fit_transform(train_data)
Transformed_test_data = scaler.fit_transform(test_data)

#### Training the model

In [34]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(Transformed_train_data, train_labels)

LogisticRegression()

In [36]:
pred_labels = logreg.predict(Transformed_test_data)
print(logreg.score(Transformed_test_data, test_labels))
from sklearn.metrics import confusion_matrix
confusion_matrix(test_labels,pred_labels)

0.7575757575757576


array([[10,  3],
       [ 5, 15]], dtype=int64)

#### Grid Search CV

In [37]:
from sklearn.model_selection import GridSearchCV
tol = [0.01,0.001,0.0001]
max_iter = [100,150,200]

param_grid = dict({'tol':tol,'max_iter':max_iter})

In [39]:
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

TransformedX = scaler.fit_transform(features)

grid_model_res = grid_model.fit(TransformedX, labels)

best_score, best_params = grid_model_res.best_score_, grid_model_res.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.850725 using {'max_iter': 100, 'tol': 0.01}
