This notebook uses the skin cancer dataset to predict the skin-cancer mortality rate (Mort) of each state from its location (latitude, longitude and an ocean indicator).


In [28]:

# Import the dataset: a whitespace-delimited text file read straight from the URL.

import pandas as pd

# The separator is a regular expression, so it is written as a raw string:
# '\s' is not a valid Python escape sequence and a plain '\s+' literal emits an
# invalid-escape warning on current Python versions.
df = pd.read_table('https://online.stat.psu.edu/onlinecourses/sites/stat501/files/data/skincancer.txt',sep=r'\s+')


In [29]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,State,Lat,Mort,Ocean,Long
0,Alabama,33.0,219,1,87.0
1,Arizona,34.5,160,0,112.0
2,Arkansas,35.0,170,0,92.5
3,California,37.5,182,1,119.5
4,Colorado,39.0,149,0,105.5


In [30]:
# Data description: summary statistics for every column.
# include='all' also reports the non-numeric column (count/unique/top/freq),
# not only the numeric ones.
df.describe(include='all')

Unnamed: 0,State,Lat,Mort,Ocean,Long
count,49,49.0,49.0,49.0,49.0
unique,49,,,,
top,NewHampshire,,,,
freq,1,,,,
mean,,39.532653,152.877551,0.44898,90.936735
std,,4.610874,33.428177,0.502545,14.866485
min,,28.0,86.0,0.0,69.0
25%,,36.0,128.0,0.0,78.5
50%,,39.5,147.0,0.0,89.5
75%,,43.0,178.0,1.0,100.0


We can see that the data has no missing values: every column has a count of 49. The column State holds a unique string value in each row, so it will not be useful for predicting Mort (mortality).

In [7]:
# Pairwise correlation between the numeric columns.
# The frame still contains the text column State, so the numeric columns are
# selected explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr() and raises an error instead.
df.select_dtypes(include='number').corr()


Unnamed: 0,Lat,Mort,Ocean,Long
Lat,1.0,-0.824518,-0.219542,0.098974
Mort,-0.824518,1.0,0.473355,-0.146188
Ocean,-0.219542,0.473355,1.0,-0.38261
Long,0.098974,-0.146188,-0.38261,1.0


In [31]:
# Split the frame into predictors and target.
# X: every column except the state label and the target itself.
# y: the mortality column that the model will predict.
y = df['Mort']
X = df.drop(columns=['State', 'Mort'])

In [33]:
# Fit an ordinary-least-squares model with statsmodels so that the p-value of
# each predictor in X can be inspected.
import statsmodels.api as sm

# statsmodels does not add an intercept on its own; add_constant prepends the
# 'const' column that stands for b0.
X2 = sm.add_constant(X)
ols = sm.OLS(endog=y, exog=X2)
lr = ols.fit()
print(lr.summary())

                            OLS Regression Results                            
Dep. Variable:                   Mort   R-squared:                       0.772
Model:                            OLS   Adj. R-squared:                  0.757
Method:                 Least Squares   F-statistic:                     50.83
Date:                Mon, 19 Apr 2021   Prob (F-statistic):           1.70e-14
Time:                        14:30:41   Log-Likelihood:                -204.75
No. Observations:                  49   AIC:                             417.5
Df Residuals:                      45   BIC:                             425.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        349.2369     27.060     12.906      0.0

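Since the p-value of at least one variable is higher than 0.05 (the significance level), we apply backward feature elimination: the least significant variable is removed and the model is refitted until every remaining variable is significant.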

In [34]:
# Backward feature elimination:
# repeatedly drop the predictor with the largest p-value and refit, until every
# remaining predictor is significant at SIGNIFICANCE_LEVEL.
SIGNIFICANCE_LEVEL = 0.05

while True:
    # Only real predictors are candidates for removal. The intercept ('const')
    # is always kept: dropping it would change the model form and break the
    # later cell that removes the 'const' column by name.
    predictor_pvalues = lr.pvalues.drop('const', errors='ignore')

    # Stop when nothing is left to drop, or when every predictor is significant.
    # (max() of an empty Series is NaN, and NaN > x is False, so the explicit
    # emptiness check is only there for readability.)
    if predictor_pvalues.empty or not (predictor_pvalues.max() > SIGNIFICANCE_LEVEL):
        break

    # Rebind X2 instead of mutating it in place, so the cell has no hidden
    # side effect on any other reference to the design matrix.
    X2 = X2.drop(columns=predictor_pvalues.idxmax())
    ols = sm.OLS(y, X2)
    lr = ols.fit()

print(lr.summary())

                            OLS Regression Results                            
Dep. Variable:                   Mort   R-squared:                       0.770
Model:                            OLS   Adj. R-squared:                  0.760
Method:                 Least Squares   F-statistic:                     76.83
Date:                Mon, 19 Apr 2021   Prob (F-statistic):           2.17e-15
Time:                        14:30:43   Log-Likelihood:                -205.02
No. Observations:                  49   AIC:                             416.0
Df Residuals:                      46   BIC:                             421.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        360.6905     21.498     16.778      0.0

In [35]:
# Inspect which columns survived the elimination step (first five rows).
X2.head(n=5)

Unnamed: 0,const,Lat,Ocean
0,1.0,33.0,1
1,1.0,34.5,0
2,1.0,35.0,0
3,1.0,37.5,1
4,1.0,39.0,0


In [36]:
# scikit-learn fits its own intercept, so the explicit constant column added
# for statsmodels is removed before X is handed to sklearn.
X = X2.drop(columns='const')
X.head(n=5)

Unnamed: 0,Lat,Ocean
0,33.0,1
1,34.5,0
2,35.0,0
3,37.5,1
4,39.0,0


In [38]:
# Hold-out validation: keep 20% of the rows aside as a test set.
# random_state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [39]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# R^2 of the fitted model on the held-out test set.
model.score(X_test, y_test)

0.7123670686295531

In [40]:
import math
from sklearn.metrics import r2_score, mean_squared_error

# Predictions for the held-out rows; the three metrics below all compare
# these against the true test targets.
y_pred = model.predict(X_test)

print(r2_score(y_true=y_test, y_pred=y_pred))  # R^2
print(mean_squared_error(y_true=y_test, y_pred=y_pred))  # MSE
print(math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))  # RMSE

0.7123670686295531
231.73147115860056
15.22272876847645


In [41]:
# k-fold cross-validation: mean score of a fresh linear model over 5 folds
# of the full (X, y) data.
from sklearn.model_selection import cross_val_score

cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=5).mean()

0.6287374432945536

In [42]:
# Dimensions of the test data, used by the adjusted R^2 formula:
#   n = number of test rows, k = number of predictor columns.
n, k = X_test.shape

# Plain R^2 on the test set.
R2 = r2_score(y_test, y_pred)
R2

0.7123670686295531

In [44]:
# Adjusted R^2 is the better score for multiple regression because it
# penalises R^2 for the number of predictors k:
#   Adj_R2 = 1 - (n - 1) * (1 - R2) / (n - k - 1)
# Here n and k come from the test split, so the value is based on the
# held-out rows only.
Adj_R2 = 1 - (
    (n - 1) * (1 - R2)
    / (n - k - 1)
)
print(Adj_R2)

0.6301862310951397
