In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix
import itertools
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the thoracic-surgery CSV into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs on that machine — consider a configurable relative path.
df=pd.read_csv(r"C:\Users\cc\Downloads\ThoracicSurgery.csv")

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Preview the last rows of the table.
df.tail()

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# Display the DataFrame itself.
# NOTE(review): overlaps with the head()/tail() previews in the cells above.
df

In [None]:
# Descriptive statistics per column (pandas defaults).
df.describe()

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove fully duplicated rows; rebinding keeps the cell chain-friendly.
df = df.drop_duplicates()

In [None]:
# Split the cohort by the one-year outcome flag.
live = df[df['Death_1yr'] == 0]
death = df[df['Death_1yr'] == 1]

# Attributes whose group means are compared.
cond = ['FVC', 'FEV1', 'Performance', 'Pain', 'Haemoptysis',
        'Dyspnoea', 'Cough', 'Weakness', 'Tumor_Size', 'Diabetes_Mellitus',
        'MI_6mo', 'PAD', 'Smoking', 'Asthma', 'Age']
l = [np.mean(live[c]) for c in cond]
d = [np.mean(death[c]) for c in cond]

# One row per attribute with the mean in each outcome group.
ld = pd.DataFrame(data={'Attribute': cond, 'Live 1yr Mean': l, 'Death 1yr Mean': d})
ld = ld.set_index('Attribute')

print('Death: {:d}'.format(len(death)))
print('Live: {:d}'.format(len(live)))
# Report the actual number of rows instead of a hard-coded 454, which is
# wrong as soon as duplicates are dropped or the data file changes.
print("1 year death: {:.2f}% out of {:d} patients".format(np.mean(df.Death_1yr) * 100, len(df)))
ld

In [None]:
# Percentage difference in attribute means: death-within-1yr group vs live group.
d = np.array(d)
l = np.array(l)
p_diff = (d - l) / l * 100

fig, axes = plt.subplots(2, 1, figsize=(12, 18))
axes[0].bar(cond, p_diff)
axes[0].set_title('Mean Difference % between Dead and Live 1yr', fontsize=18)
axes[0].set_xticks(cond)
axes[0].set_xticklabels(cond, rotation=90)
axes[0].set_ylabel('Percent', fontsize=13)

# Share of all patients for whom each true/false condition column is set.
tf_col = ['Pain', 'Haemoptysis', 'Dyspnoea', 'Cough', 'Weakness', 'Diabetes_Mellitus', 'MI_6mo', 'PAD', 'Smoking', 'Asthma']
# Divide by the real row count rather than a fixed 454 so the proportions
# stay correct after de-duplication or when the data file changes.
n_patients = len(df)
tf_sum = [df[col].sum() / n_patients for col in tf_col]

axes[1].bar(tf_col, tf_sum)
axes[1].set_xticks(tf_col)
axes[1].set_xticklabels(tf_col, rotation=90)
axes[1].set_ylabel('Proportion of Total Patients', fontsize=13)
axes[1].set_title('Proportion of Patient Conditions before Surgery', fontsize=18)

plt.tight_layout()

In [None]:
#Categorical Data(Diagnosis,Tumor_Size,Performance)
import matplotlib.pyplot as plt
import seaborn as sns

# One stacked panel per categorical column, bars split by the Death_1yr flag.
fig, axes = plt.subplots(3, 1, figsize=(10, 15))

for ax, col in zip(axes, ('Diagnosis', 'Tumor_Size', 'Performance')):
    sns.countplot(x=col, hue='Death_1yr', data=df, palette='Blues_d', ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()


In [None]:
def permutation_sample(data1,data2):
    """Generate a permutation sample from two data sets.

    The two inputs are pooled, shuffled, and split back into two samples
    with the same sizes as the inputs.

    Parameters
    ----------
    data1, data2 : array-like
        One-dimensional samples to pool.

    Returns
    -------
    tuple of numpy.ndarray
        ``(perm_sample_1, perm_sample_2)`` with ``len(data1)`` and
        ``len(data2)`` elements respectively.
    """
    data=np.concatenate((data1,data2))
    permuted_data=np.random.permutation(data)

    # Both slices pivot on len(data1); slicing the second sample from
    # len(data2) would overlap or drop elements when the sizes differ.
    split_at=len(data1)
    perm_sample_1=permuted_data[:split_at]
    perm_sample_2=permuted_data[split_at:]

    return perm_sample_1,perm_sample_2

In [None]:
# Columns for which the `death` and `live` groups are compared in the loop at the end of this cell.
condition=['FVC', 'FEV1', 'Performance', 'Pain', 'Haemoptysis',
       'Dyspnoea', 'Cough', 'Weakness', 'Tumor_Size', 'Diabetes_Mellitus',
       'MI_6mo', 'PAD', 'Smoking', 'Asthma', 'Age']
import numpy as np

def diff_of_means(data1, data2):
    """Return mean(data1) minus mean(data2)."""
    mean_first = np.mean(data1)
    mean_second = np.mean(data2)
    return mean_first - mean_second

def permutation_sample(data1, data2):
    """Pool both samples, shuffle, and split back into the original sizes.

    Returns a tuple ``(sample_1, sample_2)`` holding ``len(data1)`` and
    ``len(data2)`` shuffled values respectively.
    """
    pooled = np.concatenate((data1, data2))
    shuffled = np.random.permutation(pooled)
    cut = len(data1)  # first `cut` values go to sample 1, the rest to sample 2
    return shuffled[:cut], shuffled[cut:]

def draw_perm_reps(data1, data2, func, size=1):
    """Compute `size` permutation replicates of a two-sample statistic.

    Each replicate is ``func`` applied to one fresh permutation sample of
    ``data1`` and ``data2``. Returns a float array of length ``size``.
    """
    replicates = np.empty(size)
    for idx in range(size):
        # Draw a new shuffled pair and evaluate the statistic on it.
        replicates[idx] = func(*permutation_sample(data1, data2))
    return replicates

# Permutation test per attribute, using the `death` / `live` frames and the
# column names in `condition`.
for c in condition:
    empirical_diff_means = diff_of_means(death[c], live[c])
    perm_replicates = draw_perm_reps(death[c], live[c], diff_of_means, size=10000)
    # One-sided tail taken in the direction of the observed difference.
    if empirical_diff_means > 0:
        tail = perm_replicates >= empirical_diff_means
    else:
        tail = perm_replicates <= empirical_diff_means
    p = np.sum(tail) / len(perm_replicates)
    print(f"p-value for {c}: {p}")

        

In [None]:
condition = [
    'FVC', 'FEV1', 'Performance', 'Pain', 'Haemoptysis',
    'Dyspnoea', 'Cough', 'Weakness', 'Tumor_Size', 'Diabetes_Mellitus',
    'MI_6mo', 'PAD', 'Smoking', 'Asthma', 'Age',
]

# Collect one permutation-test p-value per attribute, in `condition` order.
p_val = []
for c in condition:
    empirical_diff_means = diff_of_means(death[c], live[c])
    perm_replicates = draw_perm_reps(death[c], live[c], diff_of_means, size=10000)
    # One-sided tail taken in the direction of the observed difference.
    if empirical_diff_means > 0:
        extreme = perm_replicates >= empirical_diff_means
    else:
        extreme = perm_replicates <= empirical_diff_means
    p = np.sum(extreme) / len(perm_replicates)
    p_val.append(p)
print(list(zip(condition, p_val)))

In [None]:
#Numerical data(Age,FVC,FEV1)
import matplotlib.pyplot as plt

# Two side-by-side scatter panels built from the FVC, FEV1 and Age columns.
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
lung_ax, age_ax = axes

# Left panel: FVC against FEV1
lung_ax.plot(df['FVC'], df['FEV1'], marker='.', linestyle='none')
lung_ax.set_xlabel('FVC', fontsize=13)
lung_ax.set_ylabel('FEV1', fontsize=13)
lung_ax.set_title('FVC vs FEV1', fontsize=16)

# Right panel: each lung measurement against Age
for measure in ('FEV1', 'FVC'):
    age_ax.plot(df['Age'], df[measure], marker='.', linestyle='none', label=measure)
age_ax.set_xlabel('Age', fontsize=13)
age_ax.set_ylabel('FEV1, FVC', fontsize=13)
age_ax.legend()
age_ax.set_title('Age vs FEV1, FVC', fontsize=16)

plt.tight_layout()
plt.show()


In [None]:
# Pearson correlation coefficient between FVC and FEV1
# (the off-diagonal entry of the 2x2 matrix returned by np.corrcoef).
np.corrcoef(df.FVC,df.FEV1)[0,1]

In [None]:
# Pearson correlation coefficient between Age and FVC
# (the off-diagonal entry of the 2x2 matrix returned by np.corrcoef).
np.corrcoef(df.Age,df.FVC)[0,1]

In [None]:
# Pearson correlation coefficient between Age and FEV1
# (the off-diagonal entry of the 2x2 matrix returned by np.corrcoef).
np.corrcoef(df.Age,df.FEV1)[0,1]

In [None]:
#Helper to compute the empirical cumulative distribution function (ECDF) of a sample
def ecdf(data):
    """Return the x and y coordinates of the empirical CDF of `data`.

    x is the sorted sample; y[i] is the fraction (i + 1) / n of the n
    observations.
    """
    sorted_values = np.sort(data)
    count = len(data)
    cumulative = np.arange(1, count + 1) / count
    return sorted_values, cumulative

In [None]:
#ECDF of FVC,FEV1,Age
x_fvc, y_fvc = ecdf(df.FVC)
x_fev1, y_fev1 = ecdf(df.FEV1)
x_age, y_age = ecdf(df.Age)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
lung_ax, age_ax = axes

# Left panel: both lung-function ECDFs share one axis.
for xs, ys, name in ((x_fvc, y_fvc, 'FVC'), (x_fev1, y_fev1, 'FEV1')):
    lung_ax.plot(xs, ys, marker='.', linestyle='none', label=name)
lung_ax.set_xlabel('Numerical Value', fontsize=13)
lung_ax.set_ylabel('ECDF', fontsize=13)
lung_ax.legend(loc='upper left')
lung_ax.set_title('ECDF of FVC & FEV1', fontsize=16)

# Right panel: ECDF of Age.
age_ax.plot(x_age, y_age, marker='.', linestyle='none', label='Age')
age_ax.set_xlabel('Years Old', fontsize=13)
age_ax.set_ylabel('ECDF', fontsize=13)
age_ax.legend(loc='upper left')
age_ax.set_title('ECDF of Age', fontsize=16)
plt.tight_layout()

In [None]:
# Positional split into a feature matrix (first 15 columns) and a
# single-column target array (column 15).
# NOTE(review): presumably columns 0-14 are predictors and column 15 the
# outcome — verify against df.columns, since selection is by position.
x = df.iloc[:, :15].to_numpy()
y = df.iloc[:, 15:16].to_numpy()

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [None]:
# Report the shape of each train/test array.
shape_reports = [
    ('Shape of x_train {}', x_train),
    ('Shape of y_train {}', y_train),
    ('Shape of x_test {}', x_test),
    ('Shape of y_test {}', y_test),
]
for template, split in shape_reports:
    print(template.format(split.shape))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Pairwise correlations between the numeric/boolean columns. Selecting the
# dtypes explicitly keeps this working when the frame holds text columns,
# which DataFrame.corr() rejects in recent pandas releases.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Import necessary libraries
import itertools
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Silence library warnings
warnings.filterwarnings('ignore')

# Reload the dataset from disk
df = pd.read_csv(r'C:\Users\cc\Downloads\ThoracicSurgery.csv')

# Predictor columns and the outcome column
target = 'Death_1yr'
features = [
    'FVC', 'FEV1', 'Performance', 'Pain', 'Haemoptysis', 'Dyspnoea',
    'Cough', 'Weakness', 'Tumor_Size', 'Diabetes_Mellitus', 'MI_6mo',
    'PAD', 'Smoking', 'Asthma', 'Age',
]

X, y = df[features], df[target]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate models, keyed by display name
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

# Fit each model on the training split and score it on the test split
results = []
for name, clf in classifiers.items():
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    results.append({'Classifier': name, 'Accuracy': accuracy, 'F1 Score': f1})

    # One print call, one item per line, in the same order as before.
    print(
        f'Classifier: {name}',
        f'Accuracy: {accuracy}',
        f'F1 Score: {f1}',
        'Classification Report:',
        classification_report(y_test, y_pred),
        'Confusion Matrix:',
        cm,
        sep='\n',
    )

# Summary table: one row per classifier
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.preprocessing import StandardScaler
import pickle

# Ignore warnings
warnings.filterwarnings('ignore')

# Load your dataset
df = pd.read_csv(r'C:\Users\cc\Downloads\ThoracicSurgery.csv')

# Feature selection
features = ['FVC', 'FEV1', 'Performance', 'Pain', 'Haemoptysis', 'Dyspnoea',
            'Cough', 'Weakness', 'Tumor_Size', 'Diabetes_Mellitus', 'MI_6mo',
            'PAD', 'Smoking', 'Asthma', 'Age']
target = 'Death_1yr'

# Prepare the data
X = df[features]
y = df[target]

# Split BEFORE scaling so the held-out rows never influence the scaler
# (fitting the scaler on the full data leaks test-set statistics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning.
# 'auto' is intentionally omitted from max_features: for a classifier it was
# an alias of 'sqrt' (duplicate work) and newer scikit-learn rejects it.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False]
}

# Initialize GridSearchCV (5-fold CV, all cores, accuracy as selection metric)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Perform grid search
grid_search.fit(X_train, y_train)

# Print best parameters and best cross-validated score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

# Get the best estimator
best_rf = grid_search.best_estimator_

# Save the model and the fitted scaler to pickle files
with open('best_random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Make predictions with the best estimator
y_pred = best_rf.predict(X_test)

# Evaluate the model on the held-out split
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(cm)


In [None]:
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    # pickle.load can execute arbitrary code; only open files this notebook wrote.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the persisted model and scaler
loaded_model = _load_pickle('best_random_forest_model.pkl')
loaded_scaler = _load_pickle('scaler.pkl')

# Example new data for prediction (replace with actual data)
patient = [2.88, 2.16, 1, 0, 0, 0, 1, 1, 4, 0, 0, 0, 1, 0, 60]
new_data = np.array([patient])

# Apply the stored scaling, then predict with the stored model
new_data_normalized = loaded_scaler.transform(new_data)
new_prediction = loaded_model.predict(new_data_normalized)

print(f'Prediction for new data: {new_prediction}')


In [None]:
# A second single-row example, run through the already-loaded scaler and model.
patient = [2.44, 0.96, 2, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 73]
new_data = np.array([patient])

# Apply the stored scaling, then predict with the stored model
new_data_normalized = loaded_scaler.transform(new_data)
new_prediction = loaded_model.predict(new_data_normalized)

print(f'Prediction for new data: {new_prediction}')
