# $$Life Expectancy Post Thoracic Surgery$$

# $Importing Necessary Libraries$

In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [41]:
# read dataset
df = pd.read_csv(r'ThoraricSurgery.csv')

In [42]:
#Let's print the first 5 records of the data set
df.head()

Unnamed: 0,id,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,1,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,2,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,3,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,4,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,5,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


In [43]:
#Let's print the last 5 records of the dataset 
df.tail()

Unnamed: 0,id,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
465,466,DGN2,3.88,2.12,PRZ1,F,F,F,T,F,OC13,F,F,F,T,F,63,F
466,467,DGN3,3.76,3.12,PRZ0,F,F,F,F,F,OC11,F,F,F,T,F,61,F
467,468,DGN3,3.04,2.08,PRZ1,F,F,F,T,F,OC13,F,F,F,F,F,52,F
468,469,DGN3,1.96,1.68,PRZ1,F,F,F,T,T,OC12,F,F,F,T,F,79,F
469,470,DGN3,4.72,3.56,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F


In [44]:
#Let's print the column name
df.columns

Index(['id', 'DGN', 'PRE4', 'PRE5', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10',
       'PRE11', 'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'AGE',
       'Risk1Yr'],
      dtype='object')

In [45]:
# shape of the dataset
df.shape

(470, 18)

In [46]:
#Let's print the total number of duplicated value
df.duplicated().sum()

0

In [47]:
#Let's print the data types in our data
df.dtypes

id           int64
DGN         object
PRE4       float64
PRE5       float64
PRE6        object
PRE7        object
PRE8        object
PRE9        object
PRE10       object
PRE11       object
PRE14       object
PRE17       object
PRE19       object
PRE25       object
PRE30       object
PRE32       object
AGE          int64
Risk1Yr     object
dtype: object

In [48]:
#Let's get the overall information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 18 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       470 non-null    int64  
 1   DGN      470 non-null    object 
 2   PRE4     470 non-null    float64
 3   PRE5     470 non-null    float64
 4   PRE6     470 non-null    object 
 5   PRE7     470 non-null    object 
 6   PRE8     470 non-null    object 
 7   PRE9     470 non-null    object 
 8   PRE10    470 non-null    object 
 9   PRE11    470 non-null    object 
 10  PRE14    470 non-null    object 
 11  PRE17    470 non-null    object 
 12  PRE19    470 non-null    object 
 13  PRE25    470 non-null    object 
 14  PRE30    470 non-null    object 
 15  PRE32    470 non-null    object 
 16  AGE      470 non-null    int64  
 17  Risk1Yr  470 non-null    object 
dtypes: float64(2), int64(2), object(14)
memory usage: 66.2+ KB


In [49]:
#Let's check the total number of null values present in our dataset 
df.isnull().sum()

id         0
DGN        0
PRE4       0
PRE5       0
PRE6       0
PRE7       0
PRE8       0
PRE9       0
PRE10      0
PRE11      0
PRE14      0
PRE17      0
PRE19      0
PRE25      0
PRE30      0
PRE32      0
AGE        0
Risk1Yr    0
dtype: int64

In [50]:
df.drop('id',axis=1,inplace=True)

In [51]:
df.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


In [52]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [53]:
df['DGN']= le.fit_transform(df['DGN'])
df['PRE6']= le.fit_transform(df['PRE6'])
df['PRE7']= le.fit_transform(df['PRE7'])
df['PRE8']= le.fit_transform(df['PRE8'])
df['PRE9']= le.fit_transform(df['PRE9'])
df['PRE10']= le.fit_transform(df['PRE10'])
df['PRE11']= le.fit_transform(df['PRE11'])
df['PRE14']= le.fit_transform(df['PRE14'])
df['PRE17']= le.fit_transform(df['PRE17'])
df['PRE19']= le.fit_transform(df['PRE19'])
df['PRE25']= le.fit_transform(df['PRE25'])
df['PRE30']= le.fit_transform(df['PRE30'])
df['PRE32']= le.fit_transform(df['PRE32'])
df['Risk1Yr']= le.fit_transform(df['Risk1Yr'])

False(0) - Survive , True(1) - Die

In [54]:
from sklearn.decomposition import PCA
pca = PCA(n_components=16, svd_solver='full')

In [55]:
pca.fit(df)

PCA(n_components=16, svd_solver='full')

In [56]:
x_pca= pca.transform(df)

In [57]:
x_pca.shape

(470, 16)

In [58]:
x= pd.DataFrame(x_pca, columns=['DGN', 'PRE4', 'PRE5', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10','PRE11', 'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'AGE'])

In [59]:
x.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE
0,-1.922676,2.908989,-0.938489,2.270796,0.038364,0.653114,-0.22073,-0.020698,0.849264,-0.323541,-0.239473,-0.189482,-0.03517,0.194481,-0.000185,-0.010601
1,-0.554785,11.842449,-0.125392,-0.178787,-0.500854,0.729909,-0.300656,0.04771,0.109144,-0.248172,0.054296,-0.024746,0.035639,0.022767,-0.010488,0.001341
2,-1.817516,3.907796,-0.602812,-0.532573,0.178868,-0.593252,-0.162315,-0.302252,-0.200566,0.042966,0.031327,-0.013652,-0.042849,-0.014516,-0.012748,-0.000821
3,0.04909,8.695786,0.263822,-0.938065,-0.814852,0.177817,0.552689,-0.028258,0.270219,0.242121,-0.047877,-0.095419,-0.058862,-0.013325,0.009732,0.009392
4,-5.45603,-9.685874,-0.699427,-0.122267,0.747668,-1.153229,0.23908,1.16662,-0.065484,-0.014716,0.04743,-0.212456,-0.07995,-0.376574,-0.063673,-0.004872


In [60]:
x

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE
0,-1.922676,2.908989,-0.938489,2.270796,0.038364,0.653114,-0.220730,-0.020698,0.849264,-0.323541,-0.239473,-0.189482,-0.035170,0.194481,-0.000185,-0.010601
1,-0.554785,11.842449,-0.125392,-0.178787,-0.500854,0.729909,-0.300656,0.047710,0.109144,-0.248172,0.054296,-0.024746,0.035639,0.022767,-0.010488,0.001341
2,-1.817516,3.907796,-0.602812,-0.532573,0.178868,-0.593252,-0.162315,-0.302252,-0.200566,0.042966,0.031327,-0.013652,-0.042849,-0.014516,-0.012748,-0.000821
3,0.049090,8.695786,0.263822,-0.938065,-0.814852,0.177817,0.552689,-0.028258,0.270219,0.242121,-0.047877,-0.095419,-0.058862,-0.013325,0.009732,0.009392
4,-5.456030,-9.685874,-0.699427,-0.122267,0.747668,-1.153229,0.239080,1.166620,-0.065484,-0.014716,0.047430,-0.212456,-0.079950,-0.376574,-0.063673,-0.004872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,-2.495212,0.005890,0.142066,1.717809,-0.517275,0.043708,-0.127761,-0.356985,0.017022,-0.018769,0.014979,0.003469,-0.055083,-0.092717,0.000145,-0.001808
466,-1.138010,1.800918,0.506393,-0.809599,-0.932342,0.040711,-0.288554,0.094939,-0.016001,-0.117461,0.045637,-0.009224,0.007937,0.012179,-0.021074,0.001158
467,-0.548319,10.795330,-0.573364,0.929757,0.529893,0.718453,0.662082,-0.533730,0.296948,0.230341,-0.068798,-0.122115,-0.078430,-0.088261,0.031360,0.007505
468,-5.825589,-15.712128,-0.881297,0.137951,0.194345,0.005005,-0.254861,-0.024133,0.575542,-0.177747,-0.221454,-0.227730,-0.036123,0.233801,-0.026374,-0.006957


In [61]:
y= df['Risk1Yr']
y

0      0
1      0
2      0
3      0
4      1
      ..
465    0
466    0
467    0
468    0
469    0
Name: Risk1Yr, Length: 470, dtype: int32

In [62]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [63]:
x_train

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE
192,0.382045,13.684594,-0.896063,-0.107481,-0.414661,-0.935383,0.721502,-0.368041,0.109486,0.484799,-0.065633,-0.062157,-0.148909,-0.052853,0.021292,0.006324
249,-4.618597,-9.810582,-0.445873,0.214117,0.057067,0.084206,-0.181774,-0.414138,-0.130230,0.002220,0.041258,-0.051709,-0.032897,-0.050094,-0.015289,0.002209
26,-1.011898,2.758121,-0.130950,-0.411166,0.071721,-0.691282,-0.156254,-0.285018,-0.199293,0.053995,0.018792,-0.000764,-0.048524,-0.032294,-0.011713,-0.002030
7,-2.669549,-3.030210,-0.378276,0.133568,-0.636672,-1.002598,-0.260936,0.422817,-0.454323,0.795204,0.045080,-0.112335,-0.133281,-0.127840,0.975006,0.003718
428,-2.320205,0.952306,-0.256066,0.364450,0.358390,-0.239307,-0.225636,0.077115,0.619952,-0.224779,-0.254514,-0.175470,-0.032492,0.230815,-0.013697,-0.012482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-1.096660,7.829955,-1.002167,0.140223,0.465961,0.078629,-0.180042,-0.365118,-0.086321,-0.062822,0.038170,-0.033075,-0.022724,-0.021184,-0.004759,-0.000076
270,-4.341639,-7.840659,-0.589695,0.322158,0.389139,-0.040006,-0.374999,0.689159,0.368081,0.447567,-0.213309,-0.319660,-0.173440,0.136600,-0.024589,-0.007343
348,3.289483,23.339853,-0.141557,0.087237,-0.453663,-1.074692,-0.276182,1.286808,0.664630,0.335404,0.295633,-0.075969,-0.134121,-0.135230,-0.009016,-0.009966
435,-1.314650,6.869684,-0.496624,-0.494134,0.208532,-0.634674,-0.156712,-0.284447,-0.193059,0.034381,0.025744,-0.004634,-0.041839,-0.014371,-0.010592,-0.001779


In [64]:
y_train

192    0
249    0
26     0
7      1
428    0
      ..
106    0
270    1
348    1
435    0
102    0
Name: Risk1Yr, Length: 329, dtype: int32

In [65]:
x_test

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE
55,-2.618108,-2.035360,0.351972,-0.101878,0.808610,0.329192,-0.200497,-0.390487,-0.122534,-0.104769,0.013481,-0.044790,0.004844,-0.051595,-0.009081,-0.001313
73,4.088866,17.169323,2.801082,-0.795992,-0.381187,0.043390,0.569086,0.080811,0.289225,0.156860,-0.124145,-0.033059,-0.034490,-0.067495,0.021692,0.000487
33,-4.111222,-4.845631,-1.285907,-0.072307,0.281271,0.384948,0.659678,-0.475256,0.216157,0.507254,-0.320694,0.737709,0.177269,-0.074074,-0.006689,0.011593
446,2.013722,13.428782,2.880899,-1.695818,1.992554,2.078358,0.432364,-0.134445,0.335299,-0.194834,-0.116503,-0.153741,0.102154,-0.043434,0.021410,0.003327
425,0.989370,15.596513,-0.565264,0.285218,0.515925,-0.068972,-0.164830,-0.311147,-0.063909,-0.082326,0.018362,-0.003900,-0.023008,-0.028371,0.001800,-0.003114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,-0.161404,11.770869,-0.062111,0.678307,-0.252560,1.397917,-0.447155,0.655243,-0.012281,0.313370,0.075876,-0.154152,-0.089012,-0.097891,-0.007860,0.003781
110,1.008153,7.531010,0.640812,0.565502,-1.485459,0.114896,-0.253004,0.112579,0.122500,-0.129396,0.034058,0.025042,-0.016218,-0.026056,-0.008801,-0.000705
299,-1.778158,4.954462,0.061664,-0.156983,-0.654103,0.737073,-0.302745,0.027070,0.092412,-0.222937,0.056077,-0.032771,0.031379,0.011557,-0.014560,0.002308
316,-0.897342,5.797170,0.112430,0.328991,0.171147,-0.010014,0.683342,-0.462008,0.182379,0.344014,-0.082818,-0.095668,-0.102043,-0.092509,0.023717,0.006114


In [66]:
y_test

55     0
73     0
33     0
446    0
425    0
      ..
60     1
110    0
299    0
316    0
29     0
Name: Risk1Yr, Length: 141, dtype: int32

# $Model Building$

# $Decision tree$

In [67]:
dt = DecisionTreeClassifier()
dt = dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)
acc=accuracy_score(y_test,y_pred)
acc

0.9787234042553191

# $Random Forest$

In [68]:
rf = RandomForestClassifier(random_state=87)
rf = rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
acc=accuracy_score(y_test,y_pred)
acc

0.9574468085106383

# $XGBoost$

In [73]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb = xgb.fit(x_train,y_train)
y_pred = xgb.predict(x_test)
acc=accuracy_score(y_test,y_pred)
acc

0.9645390070921985

# $Hybrid Classifier(MLP)$

In [72]:
from mlxtend.classifier import StackingClassifier
model1 =RandomForestClassifier(random_state=58)
model3 = DecisionTreeClassifier(random_state=5)

gnb = RandomForestClassifier()
clf_stack = StackingClassifier(classifiers=[model1,model3], meta_classifier=gnb, use_probas=True,
                                         use_features_in_secondary=True)
model_stack = clf_stack.fit(x_train, y_train)
pred_stack = model_stack.predict(x_test)
acc_stack = accuracy_score(y_test, pred_stack)
acc_stack

0.9787234042553191

# $Prediction$

In [74]:
abc=[0.382045,	13.684594,	-0.896063,	-0.107481,	-0.414661,	-0.935383,	0.721502,	-0.368041,	0.109486,	0.484799,	-0.065633,	-0.062157,	-0.148909,	-0.052853,	0.021292,	0.006324]
result=rf.predict([abc])
result=result[0]
if result==0:
    print("The Person will Survive")
else:
    print("The Person will Die")

The Person will Survive




In [75]:
abc=[-2.669549,	-3.030210,	-0.378276,	0.133568,	-0.636672,	-1.002598,	-0.260936,	0.422817,	-0.454323,	0.795204,	0.045080,	-0.112335,	-0.133281,	-0.127840,	0.975006,	0.003718]
result=rf.predict([abc])
result=result[0]
if result==0:
    print("The Person will Survive")
else:
    print("The Person will Die")

The Person will Die


