# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression

# Loading Dataset

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/chiraggarg95/Assignment-Datasets/main/haberman.csv')
df

Unnamed: 0,X1,X2,X3,Y
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2


# Preprocessing

In [3]:
df = df.rename(columns={"X1":"age", "X2":"oprn_yr", "X3":"pos_nodes", "Y":"survival_status"})
df

Unnamed: 0,age,oprn_yr,pos_nodes,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2


# Logistic Regression

In [4]:
X = df[[ 'age', 'oprn_yr', 'pos_nodes']]
Y = df[[ 'survival_status']]

In [5]:
def logistic_reg_model(data_x, data_y, split=0.1):
    
    x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=split)

    log_reg = LogisticRegression()
    log_reg.fit(x_train, y_train)

    print("Train-test split: " + str(int(100-split*100)) + ":" + str(int(100*split)))
    print()

    print("Co-efficients of the trained model are: ")
    print(log_reg.coef_)
    print()

    print("Intercepts of the trained model are: ")
    print(log_reg.intercept_)
    print()

    y_pred_test = log_reg.predict(x_test)
    y_pred_test_df = pd.DataFrame(y_pred_test, columns=["survival_status_predicted"])

    y_test_np = np.array(y_test).squeeze()
    class_error = 1-(y_pred_test==y_test_np).sum()/len(y_test_np)

    print("Classification Error: " + str(class_error))
    print()

    return y_pred_test_df

## Train test split: 90:10

In [6]:
log_reg_test_pred_df1 = logistic_reg_model(X, Y, split=0.1)

Train-test split: 90:10

Co-efficients of the trained model are: 
[[ 0.01854579 -0.00813039  0.09429827]]

Intercepts of the trained model are: 
[-1.87919797]

Classification Error: 0.22580645161290325



  y = column_or_1d(y, warn=True)


In [7]:
log_reg_test_pred_df1

Unnamed: 0,survival_status_predicted
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,2


## Train test split: 80:20

In [8]:
log_reg_test_pred_df2 = logistic_reg_model(X, Y, split=0.2)

Train-test split: 80:20

Co-efficients of the trained model are: 
[[0.01442385 0.003457   0.11989674]]

Intercepts of the trained model are: 
[-2.37314813]

Classification Error: 0.17741935483870963



  y = column_or_1d(y, warn=True)


In [9]:
log_reg_test_pred_df2

Unnamed: 0,survival_status_predicted
0,1
1,1
2,1
3,1
4,1
...,...
57,1
58,1
59,1
60,1


## Train test split: 70:30

In [10]:
log_reg_test_pred_df3 = logistic_reg_model(X, Y, split=0.3)

Train-test split: 70:30

Co-efficients of the trained model are: 
[[0.01818574 0.01642648 0.07851337]]

Intercepts of the trained model are: 
[-3.46478824]

Classification Error: 0.2934782608695652



  y = column_or_1d(y, warn=True)


In [11]:
log_reg_test_pred_df3

Unnamed: 0,survival_status_predicted
0,1
1,1
2,1
3,2
4,1
...,...
87,1
88,2
89,1
90,1


## Train test split: 60:40

In [12]:
log_reg_test_pred_df4 = logistic_reg_model(X, Y, split=0.4)

Train-test split: 60:40

Co-efficients of the trained model are: 
[[0.00387179 0.03475478 0.12633636]]

Intercepts of the trained model are: 
[-3.87269371]

Classification Error: 0.24390243902439024



  y = column_or_1d(y, warn=True)


In [13]:
log_reg_test_pred_df4

Unnamed: 0,survival_status_predicted
0,1
1,1
2,1
3,1
4,1
...,...
118,1
119,1
120,1
121,1
