# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Step 1: Read the raw CSV into a DataFrame.
df = pd.read_csv("D:/Summer2023/APT3025A/coronavirusdataset.csv")

# Step 2: Remove every column listed below; none of them is used as a
# feature or as the target further down in this script.
irrelevant_columns = [
    'test_name', 'swab_type', 'age', 'high_risk_interactions',
    'temperature', 'pulse', 'sys', 'dia', 'rr', 'sats',
    'rapid_flu_results', 'rapid_strep_results', 'ctab',
    'labored_respiration', 'rhonchi', 'wheezes',
    'days_since_symptom_onset', 'cough_severity', 'fever',
    'sob_severity', 'cxr_findings', 'cxr_impression',
    'cxr_label', 'cxr_link',
]
# Rebinding `df` to the trimmed frame is equivalent to the in-place drop;
# a KeyError is still raised if any listed column is absent.
df = df.drop(columns=irrelevant_columns)

# Step 3: Convert binary categorical variables to 0/1
binary_columns = ['high_risk_exposure_occupation', 'diabetes', 'chd', 'htn', 'cancer',
                  'asthma', 'copd', 'autoimmune_dis', 'smoker', 'cough', 'sob', 'diarrhea',
                  'fatigue', 'headache', 'loss_of_smell', 'loss_of_taste',
                  'runny_nose', 'muscle_sore', 'sore_throat']

# pandas.read_csv turns TRUE/FALSE cells into real booleans (object dtype when
# the column also contains blanks), so a mapping keyed only on the strings
# "TRUE"/"FALSE" matches nothing and every feature silently becomes 0 after
# the fillna below. Normalising each value to an upper-case string first makes
# the lookup work for booleans, strings in any case, and 0/1 numerics alike.
_binary_value_map = {"TRUE": 1, "FALSE": 0, "1": 1, "0": 0, "1.0": 1, "0.0": 0}

for col in binary_columns:
    _normalized = df[col].astype(str).str.strip().str.upper()
    # Anything not in the lookup (including missing cells, which stringify to
    # "NAN") becomes NaN here and is filled with 0 in Step 4, as before.
    df[col] = _normalized.map(_binary_value_map)

# Map 'Positive' and 'Negative' to 1 and 0, respectively, in the target variable
df['covid19_test_results'] = df['covid19_test_results'].map({"Positive": 1, "Negative": 0})

# Rows whose test result is missing or unrecognised have no usable label.
# Drop them before the blanket fill so they are not relabelled as Negative.
df.dropna(subset=['covid19_test_results'], inplace=True)

# Step 4: Fill missing values with 0 (missing symptom/condition flags are
# treated as "not present")
df.fillna(0, inplace=True)

# Choose the model inputs and the label column. `features` is the same list
# object as `binary_columns`, not a copy.
target = 'covid19_test_results'
features = binary_columns

# Step 5: Hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Steps 6-8: Fit the three classifiers on the training split. Each is seeded
# with random_state=42, and `fit` returns the estimator itself, so
# construction and training can be chained.

# Step 6: Random Forest
random_forest_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Step 7: Gradient Boosting
gradient_boosted_classifier = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Step 8: Single decision tree (called the "regression tree" in this script)
regression_tree_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Steps 9-10: For each fitted model, predict on the held-out split and score
# the predictions against the true labels with plain accuracy.

# Random Forest
y_pred_random_forest = random_forest_classifier.predict(X_test)
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)

# Gradient Boosting
y_pred_gradient_boosted = gradient_boosted_classifier.predict(X_test)
accuracy_gradient_boosted = accuracy_score(y_test, y_pred_gradient_boosted)

# Decision tree
y_pred_regression_tree = regression_tree_classifier.predict(X_test)
accuracy_regression_tree = accuracy_score(y_test, y_pred_regression_tree)

# Report the three scores in the same order the models were trained.
print("Accuracy of Random Forest Classifier:", accuracy_random_forest)
print("Accuracy of Gradient Boosted Classifier:", accuracy_gradient_boosted)
print("Accuracy of Regression Tree Classifier:", accuracy_regression_tree)

# A single hand-written sample to classify. Keys are listed in the same order
# as the training feature columns; 1 means the flag is set, 0 means it is not.
new_data = dict(
    high_risk_exposure_occupation=0,
    diabetes=1,
    chd=0,
    htn=1,
    cancer=0,
    asthma=0,
    copd=0,
    autoimmune_dis=0,
    smoker=1,
    cough=1,
    sob=0,
    diarrhea=0,
    fatigue=1,
    headache=1,
    loss_of_smell=0,
    loss_of_taste=0,
    runny_nose=0,
    muscle_sore=1,
    sore_throat=0,
)

# Wrap the sample in a one-row DataFrame so it can be passed to `predict`.
new_data_df = pd.DataFrame([new_data])


def _result_label(predictions):
    """Return "Positive" if the first predicted class equals 1, else "Negative"."""
    if predictions[0] == 1:
        return "Positive"
    return "Negative"


# Ask each trained classifier for its prediction on the sample.
prediction_rf = random_forest_classifier.predict(new_data_df)
prediction_gb = gradient_boosted_classifier.predict(new_data_df)
prediction_rt = regression_tree_classifier.predict(new_data_df)

# Convert the numeric class into a readable label.
chance_of_getting_covid_rf = _result_label(prediction_rf)
chance_of_getting_covid_gb = _result_label(prediction_gb)
chance_of_getting_covid_rt = _result_label(prediction_rt)

# Print the predictions
print("\nNew Data Prediction using Random Forest Classifier:", chance_of_getting_covid_rf)
print("New Data Prediction using Gradient Boosted Classifier:", chance_of_getting_covid_gb)
print("New Data Prediction using Regression Tree Classifier:", chance_of_getting_covid_rt)
