<a href="https://colab.research.google.com/github/cjvilla/enose_predictions/blob/main/exploratory_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Modules**

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
raw_data = pd.read_csv('../content/drive/MyDrive/eNose/sensor_data.csv')

In [None]:
raw_data.columns

Index(['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11',
       'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21',
       'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 'S31',
       'S32', 'real_idn', 'group'],
      dtype='object')

# **ETL/Data Cleaning**

In [None]:
demographics = pd.read_csv("/content/drive/MyDrive/eNose/demographics.csv")

In [None]:
# Define a function to map values in the 'Sex' column to 'Sex_Description'
def map_sex_to_description(sex_value):
    if sex_value == 1:
        return 'Female'
    elif sex_value == 2:
        return 'Male'
    else:
        return 'Unknown'  # You can specify a default value for other cases if needed

# Update the 'Sex_Description' column based on the 'Sex' column
demographics['Sex_Description'] = np.vectorize(map_sex_to_description)(demographics['Sex'])


In [None]:
# Define a function to map values in the 'Race' column to 'Race_Description'
def map_race_to_description(race_value):
    if race_value == 1:
        return 'Asian'
    elif race_value == 2:
        return 'Black'
    elif race_value == 3:
        return 'Hispanic'
    elif race_value == 4:
        return 'White'
    else:
        return 'Unknown'  # You can specify a default value for other cases if needed

# Apply the mapping function to create the new column
demographics['Race_Description'] = np.vectorize(map_race_to_description)(demographics['Race'])


In [None]:
# Define a function to map values in the 'Group' column to 'Group_Diagnosis'
def map_group_to_diagnosis(group_value):
    if group_value == 0:
        return 'Healthy Control'
    elif group_value == 1:
        return 'Alcoholic Liver Disease'
    elif group_value == 2:
        return "Parkinson's"
    else:
        return 'Unknown'  # You can specify a default value for other cases if needed

# Apply the mapping function to create the new column
demographics['Group_Diagnosis'] = np.vectorize(map_group_to_diagnosis)(demographics['group'])


In [None]:
# Define the desired column order
desired_column_order = [
    'real_idn', 'Age', 'Sex', 'Sex_Description', 'Race', 'Race_Description', 'BMI',
    'LPS_Endotoxin', 'Perm_sucralose', 'group', 'Group_Diagnosis', 'S1_mean', 'S2_mean',
    'S3_mean', 'S4_mean', 'S5_mean', 'S6_mean', 'S7_mean', 'S8_mean', 'S9_mean',
    'S10_mean', 'S11_mean', 'S12_mean', 'S13_mean', 'S14_mean', 'S15_mean', 'S16_mean',
    'S17_mean', 'S18_mean', 'S19_mean', 'S20_mean', 'S21_mean', 'S22_mean', 'S23_mean',
    'S24_mean', 'S25_mean', 'S26_mean', 'S27_mean', 'S28_mean', 'S29_mean', 'S30_mean',
    'S31_mean', 'S32_mean'
]

# Reorder the columns
demographics = demographics[desired_column_order]


In [None]:
merged_df = pd.merge(demographics, raw_data, on=['group', 'real_idn'], how='inner')


In [None]:

# Get a list of columns to drop based on the condition
columns_to_drop = [col for col in merged_df.columns if '_mean' in col]

# Drop the columns from the DataFrame
merged_df = merged_df.drop(columns=columns_to_drop)

In [None]:
merged_df.columns

Index(['real_idn', 'Age', 'Sex', 'Sex_Description', 'Race', 'Race_Description',
       'BMI', 'LPS_Endotoxin', 'Perm_sucralose', 'group', 'Group_Diagnosis',
       'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11',
       'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21',
       'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 'S31',
       'S32'],
      dtype='object')

# **Testing and Training**

# Random Forest

## Parkinson's vs. Alcoholic Liver Disease

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Define features (X) and target variables (y) for Parkinson's and Alcoholic Liver Disease
X = merged_df[['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10',
          'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20',
          'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 'S31', 'S32']]

# Target variable for Parkinson's
y_parkinson = (merged_df['group'] == 2).astype(int)

# Target variable for Alcoholic Liver Disease
y_ald = (merged_df['group'] == 0).astype(int)

# Split the dataset into training and testing sets for both diagnoses
X_train_parkinson, X_test_parkinson, y_train_parkinson, y_test_parkinson = train_test_split(X, y_parkinson, test_size=0.2, random_state=42)
X_train_ald, X_test_ald, y_train_ald, y_test_ald = train_test_split(X, y_ald, test_size=0.2, random_state=42)

# Create Random Forest Classifier models for both diagnoses
rf_parkinson = RandomForestClassifier(n_estimators=500, random_state=42)
rf_ald = RandomForestClassifier(n_estimators=500, random_state=42)

# Train the models
rf_parkinson.fit(X_train_parkinson, y_train_parkinson)
rf_ald.fit(X_train_ald, y_train_ald)

# Get feature importances for both models
feature_importances_parkinson = rf_parkinson.feature_importances_
feature_importances_ald = rf_ald.feature_importances_

# Create a DataFrame to display feature importances
sensor_importance_df_parkinson = pd.DataFrame({'Sensor': X.columns, 'Importance': feature_importances_parkinson})
sensor_importance_df_ald = pd.DataFrame({'Sensor': X.columns, 'Importance': feature_importances_ald})

# Sort the sensors by importance
sensor_importance_df_parkinson = sensor_importance_df_parkinson.sort_values(by='Importance', ascending=False)
sensor_importance_df_ald = sensor_importance_df_ald.sort_values(by='Importance', ascending=False)

# Display the sorted feature importances
print("\nFeature Importances for Parkinson's Disease:")
sensor_importance_df_parkinson.head()



Feature Importances for Parkinson's Disease:


Unnamed: 0,Sensor,Importance
23,S24,0.082932
7,S8,0.062937
26,S27,0.050462
3,S4,0.04922
2,S3,0.048071


In [None]:
print("\nFeature Importances for Alcoholic Liver Disease:")
sensor_importance_df_ald.head()



Feature Importances for Alcoholic Liver Disease:


Unnamed: 0,Sensor,Importance
23,S24,0.091465
3,S4,0.062128
2,S3,0.059667
9,S10,0.053952
7,S8,0.045868


In [None]:
from sklearn.metrics import accuracy_score

# For Parkinson's Disease
y_parkinson_pred = rf_parkinson.predict(X_test_parkinson)
accuracy_parkinson = accuracy_score(y_test_parkinson, y_parkinson_pred)
print(f"Accuracy for Parkinson's Disease: {accuracy_parkinson * 100:.2f}%")

# For Alcoholic Liver Disease
y_ald_pred = rf_ald.predict(X_test_ald)
accuracy_ald = accuracy_score(y_test_ald, y_ald_pred)
print(f"Accuracy for Alcoholic Liver Disease: {accuracy_ald * 100:.2f}%")


Accuracy for Parkinson's Disease: 86.77%
Accuracy for Alcoholic Liver Disease: 91.48%


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Initialize empty lists to store accuracy scores
parkinson_accuracy_scores = []
ald_accuracy_scores = []

# For Parkinson's Disease
y_parkinson_pred = rf_parkinson.predict(X_test_parkinson)
accuracy_parkinson = accuracy_score(y_test_parkinson, y_parkinson_pred)

# For Alcoholic Liver Disease
y_ald_pred = rf_ald.predict(X_test_ald)
accuracy_ald = accuracy_score(y_test_ald, y_ald_pred)

# Loop through each sensor
for sensor in X.columns:
    # Create a copy of the test data for Parkinson's Disease
    X_test_sensor_parkinson = X_test_parkinson.copy()
    X_test_sensor_parkinson[sensor] = X_test_sensor_parkinson[sensor].sample(frac=1).values  # Shuffle the values of the sensor
    y_sensor_pred_parkinson = rf_parkinson.predict(X_test_sensor_parkinson)
    accuracy_sensor_parkinson = accuracy_score(y_test_parkinson, y_sensor_pred_parkinson)
    parkinson_accuracy_scores.append(accuracy_sensor_parkinson)

    # Create a copy of the test data for Alcoholic Liver Disease
    X_test_sensor_ald = X_test_ald.copy()
    X_test_sensor_ald[sensor] = X_test_sensor_ald[sensor].sample(frac=1).values  # Shuffle the values of the sensor
    y_sensor_pred_ald = rf_ald.predict(X_test_sensor_ald)
    accuracy_sensor_ald = accuracy_score(y_test_ald, y_sensor_pred_ald)
    ald_accuracy_scores.append(accuracy_sensor_ald)

# Create DataFrames for accuracy scores
parkinson_accuracy_df = pd.DataFrame({'Sensor': X.columns, 'Accuracy_Parkinson': parkinson_accuracy_scores})
ald_accuracy_df = pd.DataFrame({'Sensor': X.columns, 'Accuracy_ALD': ald_accuracy_scores})

# Merge the DataFrames on the 'Sensor' column
merged_accuracy_df = pd.merge(parkinson_accuracy_df, ald_accuracy_df, on='Sensor')

# Print the merged DataFrame
print("Merged Accuracy Scores by Sensor:")
merged_accuracy_df.head()


Merged Accuracy Scores by Sensor:


Unnamed: 0,Sensor,Accuracy_Parkinson,Accuracy_ALD
0,S1,0.869318,0.873377
1,S2,0.86526,0.871753
2,S3,0.867695,0.882305
3,S4,0.870942,0.874188
4,S5,0.867695,0.874188


In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Define a function to calculate sensitivity and specificity
def calculate_sensitivity_specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    return sensitivity, specificity

# Initialize lists to store sensitivity and specificity for each sensor
sensitivity_parkinson_list = []
specificity_parkinson_list = []
sensitivity_ald_list = []
specificity_ald_list = []

# Calculate sensitivity and specificity for each sensor for Parkinson's Disease
for sensor in X.columns:
    # For Parkinson's Disease
    X_test_sensor_parkinson = X_test_parkinson.copy()
    X_test_sensor_parkinson[sensor] = X_test_sensor_parkinson[sensor].sample(frac=1).values  # Shuffle the values of the sensor
    y_sensor_pred_parkinson = rf_parkinson.predict(X_test_sensor_parkinson)
    sensitivity_parkinson, specificity_parkinson = calculate_sensitivity_specificity(y_test_parkinson, y_sensor_pred_parkinson)
    sensitivity_parkinson_list.append(sensitivity_parkinson)
    specificity_parkinson_list.append(specificity_parkinson)

    # For Alcoholic Liver Disease
    X_test_sensor_ald = X_test_ald.copy()
    X_test_sensor_ald[sensor] = X_test_sensor_ald[sensor].sample(frac=1).values  # Shuffle the values of the sensor
    y_sensor_pred_ald = rf_ald.predict(X_test_sensor_ald)
    sensitivity_ald, specificity_ald = calculate_sensitivity_specificity(y_test_ald, y_sensor_pred_ald)
    sensitivity_ald_list.append(sensitivity_ald)
    specificity_ald_list.append(specificity_ald)

# Create DataFrames to display sensitivity and specificity by sensor
sensitivity_specificity_df = pd.DataFrame({
    'Sensor': X.columns,
    'Sensitivity (Parkinson)': sensitivity_parkinson_list,
    'Specificity (Parkinson)': specificity_parkinson_list,
    'Sensitivity (Alcoholic Liver Disease)': sensitivity_ald_list,
    'Specificity (Alcoholic Liver Disease)': specificity_ald_list
})

# Display sensitivity and specificity for each sensor in one DataFrame
print("Sensitivity and Specificity by Sensor:")
sensitivity_specificity_df.head()


Sensitivity and Specificity by Sensor:


Unnamed: 0,Sensor,Sensitivity (Parkinson),Specificity (Parkinson),Sensitivity (Alcoholic Liver Disease),Specificity (Alcoholic Liver Disease)
0,S1,0.695652,0.928189,0.810298,0.967555
1,S2,0.692308,0.919614,0.799458,0.967555
2,S3,0.625418,0.951768,0.821138,0.955968
3,S4,0.675585,0.930332,0.796748,0.965238
4,S5,0.70903,0.920686,0.807588,0.96292


In [None]:
# Find the row with the highest sensitivity score for Parkinson's Disease
highest_sensitivity_parkinson_row = sensitivity_specificity_df[
    sensitivity_specificity_df['Sensitivity (Parkinson)'] == sensitivity_specificity_df['Sensitivity (Parkinson)'].max()
]

# Display the row with the highest sensitivity score
print("Row with Highest Sensitivity (Parkinson):")
highest_sensitivity_parkinson_row.head(10)

Row with Highest Sensitivity (Parkinson):


Unnamed: 0,Sensor,Sensitivity (Parkinson),Specificity (Parkinson),Sensitivity (Alcoholic Liver Disease),Specificity (Alcoholic Liver Disease)
4,S5,0.70903,0.920686,0.807588,0.96292
14,S15,0.70903,0.918542,0.796748,0.959444
15,S16,0.70903,0.920686,0.815718,0.960603
22,S23,0.70903,0.924973,0.810298,0.959444


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load your dataset (replace 'your_dataset.csv' with the actual file path)
# Define features (X) and target variables (y) for Parkinson's and Alcoholic Liver Disease
X = merged_df[['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10',
              'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20',
              'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 'S31', 'S32']]

# Target variable for Parkinson's
y_parkinson = (merged_df['group'] == 2).astype(int)

# Target variable for Alcoholic Liver Disease
y_ald = (merged_df['group'] == 0).astype(int)

# Split the dataset into training and testing sets for both diagnoses
X_train_parkinson, X_test_parkinson, y_train_parkinson, y_test_parkinson = train_test_split(X, y_parkinson, test_size=0.2, random_state=42)
X_train_ald, X_test_ald, y_train_ald, y_test_ald = train_test_split(X, y_ald, test_size=0.2, random_state=42)

# Create Random Forest Classifier models for both diagnoses
rf_parkinson = RandomForestClassifier(n_estimators=500, random_state=42)
rf_ald = RandomForestClassifier(n_estimators=500, random_state=42)

# Train the models
rf_parkinson.fit(X_train_parkinson, y_train_parkinson)
rf_ald.fit(X_train_ald, y_train_ald)

# Initialize a dictionary to store the ranges for each sensor
sensor_ranges = {'Sensor': [], 'Range for Parkinson\'s': [], 'Range for Alcoholic Liver Disease': []}

# Calculate the range for each sensor for Parkinson's Disease
for sensor in X.columns:
    sensor_readings = X_test_parkinson[sensor].values
    predictions = rf_parkinson.predict(X_test_parkinson)
    range_parkinson = (sensor_readings[predictions == 1].min(), sensor_readings[predictions == 1].max())
    sensor_ranges['Sensor'].append(sensor)
    sensor_ranges['Range for Parkinson\'s'].append(range_parkinson)

# Calculate the range for each sensor for Alcoholic Liver Disease
for sensor in X.columns:
    sensor_readings = X_test_ald[sensor].values
    predictions = rf_ald.predict(X_test_ald)
    range_ald = (sensor_readings[predictions == 1].min(), sensor_readings[predictions == 1].max())
    index = sensor_ranges['Sensor'].index(sensor)
    sensor_ranges['Range for Alcoholic Liver Disease'].append(range_ald)

# Create a DataFrame to display the ranges for each sensor
sensor_range_df = pd.DataFrame(sensor_ranges)

# Display the sensor range DataFrame
sensor_range_df.head()


Unnamed: 0,Sensor,Range for Parkinson's,Range for Alcoholic Liver Disease
0,S1,"(3.72975466, 4.18227461)","(3.75790252, 4.17818314)"
1,S2,"(4.56670693, 5.06402495)","(4.58438789, 5.0609467)"
2,S3,"(1.44411178, 1.59334391)","(1.44549878, 1.59253064)"
3,S4,"(4.28410552, 4.73577816)","(4.29128269, 4.7343823)"
4,S5,"(0.74791354, 0.81336697)","(0.75213684, 0.81579502)"


# XGBoost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load your dataset (replace 'your_dataset.csv' with the actual file path)
# Define features (X) and target variables (y) for Parkinson's and Alcoholic Liver Disease
X = merged_df[['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10',
          'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20',
          'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 'S31', 'S32']]

# Target variable for Parkinson's
y_parkinson = (merged_df['group'] == 0).astype(int)

# Target variable for Alcoholic Liver Disease
y_ald = (merged_df['group'] == 1).astype(int)

# Split the dataset into training and testing sets for both diagnoses
X_train_parkinson, X_test_parkinson, y_train_parkinson, y_test_parkinson = train_test_split(X, y_parkinson, test_size=0.2, random_state=42)
X_train_ald, X_test_ald, y_train_ald, y_test_ald = train_test_split(X, y_ald, test_size=0.2, random_state=42)

# Create XGBoost models for both diagnoses
xgb_parkinson = XGBClassifier(objective='binary:logistic', random_state=42)
xgb_ald = XGBClassifier(objective='binary:logistic', random_state=42)

# Train the models
xgb_parkinson.fit(X_train_parkinson, y_train_parkinson)
xgb_ald.fit(X_train_ald, y_train_ald)

# Initialize a dictionary to store accuracy scores for each sensor
accuracy_dict = {'Sensor': [], 'Accuracy (Parkinson)': [], 'Accuracy (Alcoholic Liver Disease)': []}

# Calculate accuracy for each sensor for Parkinson's Disease
for sensor in X.columns:
    X_test_sensor = X_test_parkinson.copy()
    X_test_sensor[sensor] = X_test_sensor[sensor].sample(frac=1).values  # Shuffle the values of the sensor
    y_sensor_pred = xgb_parkinson.predict(X_test_sensor)
    accuracy_sensor = accuracy_score(y_test_parkinson, y_sensor_pred)
    accuracy_dict['Sensor'].append(sensor)
    accuracy_dict['Accuracy (Parkinson)'].append(accuracy_sensor)
    accuracy_dict['Accuracy (Alcoholic Liver Disease)'].append(0)  # Placeholder for now

# Calculate accuracy for each sensor for Alcoholic Liver Disease
for sensor in X.columns:
    X_test_sensor = X_test_ald.copy()
    X_test_sensor[sensor] = X_test_sensor[sensor].sample(frac=1).values  # Shuffle the values of the sensor
    y_sensor_pred = xgb_ald.predict(X_test_sensor)
    accuracy_sensor = accuracy_score(y_test_ald, y_sensor_pred)
    index = accuracy_dict['Sensor'].index(sensor)
    accuracy_dict['Accuracy (Alcoholic Liver Disease)'][index] = accuracy_sensor

# Create a DataFrame to display accuracy scores for each sensor
accuracy_df = pd.DataFrame(accuracy_dict)

# Display the accuracy DataFrame
accuracy_df.head(32)


Unnamed: 0,Sensor,Accuracy (Parkinson),Accuracy (Alcoholic Liver Disease)
0,S1,0.874188,0.924513
1,S2,0.863636,0.92289
2,S3,0.774351,0.904221
3,S4,0.878247,0.913149
4,S5,0.877435,0.918019
5,S6,0.881494,0.924513
6,S7,0.875812,0.921266
7,S8,0.820617,0.843344
8,S9,0.875812,0.923701
9,S10,0.869318,0.895292


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

# Load your dataset (replace 'your_dataset.csv' with the actual file path)
# Define features (X) and target variables (y) for Parkinson's and Alcoholic Liver Disease
X = merged_df[['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10',
          'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20',
          'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 'S31', 'S32']]

# Target variable for Parkinson's
y_parkinson = (merged_df['group'] == 0).astype(int)

# Target variable for Alcoholic Liver Disease
y_ald = (merged_df['group'] == 1).astype(int)

# Split the dataset into training and testing sets for both diagnoses
X_train_parkinson, X_test_parkinson, y_train_parkinson, y_test_parkinson = train_test_split(X, y_parkinson, test_size=0.2, random_state=42)
X_train_ald, X_test_ald, y_train_ald, y_test_ald = train_test_split(X, y_ald, test_size=0.2, random_state=42)

# Create XGBoost models for both diagnoses
xgb_parkinson = XGBClassifier(objective='binary:logistic', random_state=42)
xgb_ald = XGBClassifier(objective='binary:logistic', random_state=42)

# Train the models
xgb_parkinson.fit(X_train_parkinson, y_train_parkinson)
xgb_ald.fit(X_train_ald, y_train_ald)

# Define a function to calculate sensitivity and specificity
def calculate_sensitivity_specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    return sensitivity, specificity

# Initialize a dictionary to store sensitivity and specificity by sensor
sensitivity_specificity_dict = {'Sensor': [], 'Sensitivity (Parkinson)': [], 'Specificity (Parkinson)': [],
                                'Sensitivity (Alcoholic Liver Disease)': [], 'Specificity (Alcoholic Liver Disease)': []}

# Calculate sensitivity and specificity by sensor for Parkinson's Disease
for sensor in X.columns:
    X_test_sensor = X_test_parkinson.copy()
    X_test_sensor[sensor] = X_test_sensor[sensor].sample(frac=1).values  # Shuffle the values of the sensor
    y_sensor_pred = xgb_parkinson.predict(X_test_sensor)
    sensitivity, specificity = calculate_sensitivity_specificity(y_test_parkinson, y_sensor_pred)
    sensitivity_specificity_dict['Sensor'].append(sensor)
    sensitivity_specificity_dict['Sensitivity (Parkinson)'].append(sensitivity)
    sensitivity_specificity_dict['Specificity (Parkinson)'].append(specificity)
    sensitivity_specificity_dict['Sensitivity (Alcoholic Liver Disease)'].append(0)  # Placeholder for now
    sensitivity_specificity_dict['Specificity (Alcoholic Liver Disease)'].append(0)  # Placeholder for now

# Calculate sensitivity and specificity by sensor for Alcoholic Liver Disease
for sensor in X.columns:
    X_test_sensor = X_test_ald.copy()
    X_test_sensor[sensor] = X_test_sensor[sensor].sample(frac=1).values  # Shuffle the values of the sensor
    y_sensor_pred = xgb_ald.predict(X_test_sensor)
    sensitivity, specificity = calculate_sensitivity_specificity(y_test_ald, y_sensor_pred)
    index = sensitivity_specificity_dict['Sensor'].index(sensor)
    sensitivity_specificity_dict['Sensitivity (Alcoholic Liver Disease)'][index] = sensitivity
    sensitivity_specificity_dict['Specificity (Alcoholic Liver Disease)'][index] = specificity

# Create a DataFrame to display sensitivity and specificity by sensor
sensitivity_specificity_df = pd.DataFrame(sensitivity_specificity_dict)

# Display the sensitivity and specificity DataFrame
print("Sensitivity and Specificity by Sensor:")
sensitivity_specificity_df.head()


Sensitivity and Specificity by Sensor:


Unnamed: 0,Sensor,Sensitivity (Parkinson),Specificity (Parkinson),Sensitivity (Alcoholic Liver Disease),Specificity (Alcoholic Liver Disease)
0,S1,0.87766,0.866766,0.810298,0.964079
1,S2,0.858156,0.875749,0.823848,0.964079
2,S3,0.657801,0.827844,0.794038,0.945539
3,S4,0.87766,0.865269,0.796748,0.969873
4,S5,0.882979,0.878743,0.834688,0.961761


In [None]:
# Filter for Parkinson's Disease
highest_sensitivity_parkinson = sensitivity_specificity_df[sensitivity_specificity_df['Sensitivity (Parkinson)'] == sensitivity_specificity_df['Sensitivity (Parkinson)'].max()]

# Filter for Alcoholic Liver Disease
highest_sensitivity_ald = sensitivity_specificity_df[sensitivity_specificity_df['Sensitivity (Alcoholic Liver Disease)'] == sensitivity_specificity_df['Sensitivity (Alcoholic Liver Disease)'].max()]


In [None]:
# Display the highest sensitivity for Parkinson's and Alcoholic Liver Disease
print("Highest Sensitivity for Parkinson's Disease:")
highest_sensitivity_parkinson.head()

Highest Sensitivity for Parkinson's Disease:


Unnamed: 0,Sensor,Sensitivity (Parkinson),Specificity (Parkinson),Sensitivity (Alcoholic Liver Disease),Specificity (Alcoholic Liver Disease)
19,S20,0.904255,0.82485,0.780488,0.97219


In [None]:
print("\nHighest Sensitivity for Alcoholic Liver Disease:")
highest_sensitivity_ald.head()


Highest Sensitivity for Alcoholic Liver Disease:


Unnamed: 0,Sensor,Sensitivity (Parkinson),Specificity (Parkinson),Sensitivity (Alcoholic Liver Disease),Specificity (Alcoholic Liver Disease)
30,S31,0.886525,0.865269,0.859079,0.951333


In [None]:
import pandas as pd

# range of senosr readings that are above the specified threshold for both parkinson's and adl. But, does not indicate a diagnosis.
# threshold is used as a boundari to determine the range of sensor readings associated with a positive proediction for each condition

# Create an empty DataFrame to store the range of readings for each sensor
sensor_range_df = pd.DataFrame(columns=['Sensor', 'Range for Parkinson\'s', 'Range for Alcoholic Liver Disease'])

# Define a fixed threshold (you can adjust this threshold)
threshold = 0.5

# Iterate through each sensor
for sensor in X_val:  # Replace X_val with your validation data
    # Get the readings for the current sensor
    sensor_readings = merged_df[sensor]

    # Calculate the range of readings for Parkinson's (above threshold -> positive, below threshold -> negative)
    range_parkinson = (sensor_readings[sensor_readings >= threshold].min(), sensor_readings[sensor_readings >= threshold].max())

    # Calculate the range of readings for Alcoholic Liver Disease (above threshold -> positive, below threshold -> negative)
    range_ald = (sensor_readings[sensor_readings >= threshold].min(), sensor_readings[sensor_readings >= threshold].max())

    # Store the results in the DataFrame
    sensor_range_df = sensor_range_df.append({
        'Sensor': sensor,
        'Range for Parkinson\'s': range_parkinson,
        'Range for Alcoholic Liver Disease': range_ald
    }, ignore_index=True)

# Display the range of readings for each sensor
sensor_range_df.head(32)


  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_range_df = sensor_range_df.append({
  sensor_r

Unnamed: 0,Sensor,Range for Parkinson's,Range for Alcoholic Liver Disease
0,S1,"(3.72675694, 4.18307259)","(3.72675694, 4.18307259)"
1,S2,"(4.563774, 5.06459154)","(4.563774, 5.06459154)"
2,S3,"(1.44346583, 1.59355499)","(1.44346583, 1.59355499)"
3,S4,"(4.28180985, 4.73647565)","(4.28180985, 4.73647565)"
4,S5,"(0.7456443, 0.81675735)","(0.7456443, 0.81675735)"
5,S6,"(0.66766578, 0.73939857)","(0.66766578, 0.73939857)"
6,S7,"(0.93760953, 0.96031516)","(0.93760953, 0.96031516)"
7,S8,"(3.11496136, 3.17808477)","(3.11496136, 3.17808477)"
8,S9,"(0.80069346, 0.8508976)","(0.80069346, 0.8508976)"
9,S10,"(1.15093171, 1.18956868)","(1.15093171, 1.18956868)"
