In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from scipy.stats import pearsonr
import seaborn as sns
import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from tabulate import tabulate


import warnings
warnings.filterwarnings('ignore')


### Question 3

In [16]:
Diabetes = pd.read_excel("Diabetes_Data.xlsx")
x_diabetes = Diabetes[["AGE","SEX","BMI","BP","S1","S2","S3","S4","S5","S6"]]#independent
y_diabetes = Diabetes[["Y"]]#dependent  

#correlation 
corr_ = x_diabetes.corr()
#print(corr_)

#heatmap plotting
mask = np.zeros_like(corr_, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr_[mask] = np.nan

(corr_
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(null_color='#f1f1f1')  # Color all Nan values grey
 .set_precision(2))
 


Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6
AGE,,,,,,,,,,
SEX,0.17,,,,,,,,,
BMI,0.19,0.09,,,,,,,,
BP,0.34,0.24,0.4,,,,,,,
S1,0.26,0.04,0.25,0.24,,,,,,
S2,0.22,0.14,0.26,0.19,0.9,,,,,
S3,-0.08,-0.38,-0.37,-0.18,0.05,-0.2,,,,
S4,0.2,0.33,0.41,0.26,0.54,0.66,-0.74,,,
S5,0.27,0.15,0.45,0.39,0.52,0.32,-0.4,0.62,,
S6,0.3,0.21,0.39,0.39,0.33,0.29,-0.27,0.42,0.46,


In [17]:
#multivariate and adding a constant to dataframe..used 1 coz it doesnt change a value.
Diabetes_Df = pd.concat([pd.Series(1, index=Diabetes.index, name='Constant'), Diabetes], axis=1)

#drop the response var column
X = Diabetes_Df.drop(Diabetes_Df.columns[[11]],axis = 1)
Y = Diabetes_Df[["Y"]]

#model creation
model = sm.OLS(Y,X)
model1 = model.fit()

#rsquared adjusted
y_pred = model1.predict(X)

#mean square
msqre_value = mean_squared_error(Y,y_pred)


print(model1.pvalues)
print("Mean squared error is:",msqre_value)
print("Adjusted R squared value is:",model1.rsquared_adj)


Constant    1.016617e-06
AGE         8.670306e-01
SEX         1.041671e-04
BMI         4.296391e-14
BP          1.024278e-06
S1          5.794761e-02
S2          1.603902e-01
S3          6.347233e-01
S4          2.734587e-01
S5          1.555899e-05
S6          3.059895e-01
dtype: float64
Mean squared error is: 2859.69634758675
Adjusted R squared value is: 0.5065592904853231


In [7]:
#forward regression

def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    initial_list = []
    included = list(initial_list)
    while True:
        changed=False
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            print(model.pvalues)
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))
                
        if not changed:
            break

    return included

forward_regression(x_diabetes,y_diabetes,0.05,verbose=True)



const    0.001760
S1       0.000007
dtype: float64
const    4.401020e-04
BP       1.649372e-22
dtype: float64
const    2.710866e-03
S6       7.580083e-17
dtype: float64
const    7.433496e-31
SEX      3.664293e-01
dtype: float64
const    4.437242e-12
AGE      7.055686e-05
dtype: float64
const    1.892632e-05
S4       2.304253e-21
dtype: float64
const    7.126475e-17
S5       8.826459e-39
dtype: float64
const    5.188287e-64
S3       6.162865e-18
dtype: float64
const    4.301065e-12
S2       2.359848e-04
dtype: float64
const    1.751618e-10
BMI      3.466006e-42
dtype: float64
Add  BMI                            with p-value 3.46601e-42
const    3.792195e-10
BMI      2.319276e-38
S1       7.961267e-02
dtype: float64
const    2.020878e-18
BMI      3.047885e-29
BP       1.725601e-09
dtype: float64
const    3.239627e-14
BMI      3.871059e-31
S6       1.165706e-05
dtype: float64
const    4.674330e-09
BMI      6.288631e-42
SEX      8.225591e-01
dtype: float64
const    2.455363e-11
BMI      1.

['BMI', 'S5', 'BP', 'S1', 'SEX', 'S2']

In [18]:
#mse for stepwise model
X2 = Diabetes_Df.drop(columns=["AGE","S3","S4","S6","Y"],axis = 1)
Y2 = Diabetes_Df[["Y"]]
 
model2 = sm.OLS(Y2,X2)
model2 = model2.fit()
y2_pred = model2.predict(X2)
#r_square = model2.rsquared


mse2_value = mean_squared_error(Y2,y2_pred)
#model2 mse and r square values
print("MSE value for useful forward regression variables:", mse2_value)
print("R_square value is:",model2.rsquared)



MSE value for useful forward regression variables: 2876.683251787016
R_square value is: 0.5148837959256445


In [9]:
print("forward selection useful features are:")
print(model2.pvalues)

forward selection useful features are:
Constant    2.750430e-30
SEX         1.758474e-04
BMI         6.687185e-15
BP          2.786882e-07
S1          3.122573e-06
S2          2.723024e-04
S5          1.938635e-21
dtype: float64


### Question 4

In [20]:
#data set
Titanic = pd.read_csv("titanic3.csv")

#4.2 probability of survival for a passenger on the titanic.
survivors = Titanic[Titanic["survived"] ==1].shape[0]
total_passengers = Titanic["sex"].count()

p_surviving = survivors/total_passengers
print("The probability of a passenger surviving is {:0.3f}".format(p_surviving))

The probability of a passenger surviving is 0.382


In [12]:
#4.3 Provide a table giving survival probabilities broken down by passenger class, gender, and age.
#set limits for age
bins= [0,5,10,15,23,100]
labels = ['Infant','Toddler','Kid','Teen','Adult']
Titanic['AgeGroup'] = pd.cut(Titanic['age'], bins=bins, labels=labels, right=False)

#calculating survival proberbilities
survived_sex = pd.pivot_table(Titanic,index = "survived",columns = "sex", values = "ticket",aggfunc="count")
survived_class = pd.pivot_table(Titanic,index = "survived",columns = "pclass", values = "ticket",aggfunc="count")
survived_age = pd.pivot_table(Titanic,index = "survived",columns = "AgeGroup", values = "ticket",aggfunc="count")

print("Number of survival by gender")
print(survived_sex)
print("Number of survival by class")
print(survived_class)
print("Number of survival by age")
print(survived_age)
print("\n")

#probability survival by Age 
PageInf = survived_age["Infant"][1]/survived_age["Infant"].sum()
PageTodd = survived_age["Toddler"][1]/survived_age["Toddler"].sum()
PageKid = survived_age["Kid"][1]/survived_age["Kid"].sum()
PageTeen = survived_age["Teen"][1]/survived_age["Teen"].sum()
PageAd = survived_age["Adult"][1]/survived_age["Adult"].sum()

#probability survival by class
Pclass1 = survived_class[1][1]/ survived_class[1].sum()
Pclass2 = survived_class[2][1]/ survived_class[2].sum()
Pclass3 = survived_class[3][1]/ survived_class[3].sum()

#probability survival by gender
Pfemale = survived_sex["female"][1]/survived_sex["female"].sum()
Pmale = survived_sex["male"][1]/survived_sex["male"].sum()


#tabulate   
head = ["Female", "Male","Clas1 "," Clas2","Clas3","Infant","Toddler","Kid","Teen","Adult"]
stats = [(Pfemale,Pmale,Pclass1,Pclass2,Pclass3,PageInf,PageTodd,PageKid,PageTeen,PageAd)]
print(tabulate(stats, headers=head, tablefmt="grid")) 
print()

Number of survival by gender
sex       female  male
survived              
0            127   682
1            339   161
Number of survival by class
pclass      1    2    3
survived               
0         123  158  528
1         200  119  181
Number of survival by age
AgeGroup  Infant  Toddler  Kid  Teen  Adult
survived                                   
0             18       14   16   141    430
1             33       17   11    84    282


+----------+----------+----------+----------+----------+----------+-----------+----------+----------+----------+
|   Female |     Male |   Clas1  |    Clas2 |    Clas3 |   Infant |   Toddler |      Kid |     Teen |    Adult |
| 0.727468 | 0.190985 | 0.619195 | 0.429603 | 0.255289 | 0.647059 |  0.548387 | 0.407407 | 0.373333 | 0.396067 |
+----------+----------+----------+----------+----------+----------+-----------+----------+----------+----------+



In [22]:
# 4.4 Build a logistic regression model for survival rates based on passenger class, sex, and age.
#function to fill age nan values with mean
def add_age(cols):
    age = cols[0]
    pclass = cols[1]
    if pd.isnull(age):
        return int(Titanic[Titanic["pclass"] == pclass]["age"].mean())
    else:
        return age
Titanic["age"] = Titanic[["age", "pclass"]].apply(add_age,axis=1) 

#change sex values from string to numeric
pd.get_dummies(Titanic["sex"])
sex_new = pd.get_dummies(Titanic["sex"],drop_first=True)
Titanic = pd.concat([Titanic,sex_new],axis=1)

#creating variables
x_survived = Titanic[["pclass","male","age"]]#predictor
y_survived = Titanic["survived"]

#regression line
logmodel = LogisticRegression()
logmodel.fit(x_survived,y_survived)

#predictions
y_predictions = logmodel.predict(x_survived)

#pvalues
print(logmodel.coef_)
print("\n")

#confusion matrix
c_matrix = confusion_matrix(y_survived,y_predictions)
print("Confusion Matrix")
c_matrix


[[-1.11370347 -1.23020583 -1.23020583 -0.03347261]]


Confusion Matrix


array([[687, 122],
       [158, 342]], dtype=int64)

In [23]:
#confusion matrix accuracy
tn, fp, fn, tp = c_matrix.ravel()
accuracy = ((tn+tp)/(tn+fp+fn+tp))*100
print("model accuracy is {:.3f}".format(accuracy),"%")


model accuracy is 78.610 %
