# Naive Bayes Model Implementation

Here I use a very basic gaussian classifier from sklearn to see how well it preforms on the data with minimal cleaning. 

In [1]:
import pandas as pd
import numpy as np

%reload_ext autoreload

from sklearn.naive_bayes import GaussianNB

Here is import Samantha's code for splitting the data properly

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
#importing S.F.'s split functions
def standardize_data(df):
    """
    Given

    df (Dataframe) :      input data table

    Returns:
    Data table standadized so that each column has a mean of 0 and a standard deviation of 1
    """
    std_scaler = StandardScaler()

    df_scaled = std_scaler.fit_transform(df.to_numpy())
    df_scaled = pd.DataFrame(df_scaled, columns=df.columns.values.tolist())

    return df_scaled
def split_train_test_data(input_data, test_ratio, standardize = False):
    """
    Given input data table, splits it into training and test
    data sets, with the size of the test data set being set
    by the input 'test_ratio', if the boolean input of
    standardize is set to True, also standardizes the X
    data before returning it as test and train sets

    input_data (Dataframe) :        data table of patient record data
    test_ratio (float):             decimal input that specifies what ratio of the data should be allocated for the test data set
    standardize (Boolean):          boolean input that specifies whether X data should be standardized using standard scaler before being returned

    Returns:
    Histogram plot and statistical breakdown of patient ages at first or second stroke-incident for a specific stroke type
    """
    y = input_data.outcome
    input_data.drop(['outcome'], axis=1, inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(input_data, y, test_size=test_ratio, random_state=42)

    if standardize:
        X_train = standardize_data(X_train)
        X_test = standardize_data(X_test)

    return X_train, X_test, y_train, y_test

In [38]:
# patient_data = pd.read_csv('/content/data01.csv')
patient_data = pd.read_csv('/content/drive/Shareddrives/DSCI 303 Semester Project Fall 2021/Data Exploration/data01.csv')

# drop identifying data
patient_data.drop(['group', 'ID'], axis=1, inplace=True)

# drop rows where the outcome column is null
patient_data = patient_data[patient_data['outcome'].notna()]

print('Data size before clean up : ',patient_data.shape)

# fill in missing column entries with median value for each column 
patient_data = patient_data.fillna(patient_data.median())

print('Data size after clean up : ',patient_data.shape)


Data size before clean up :  (1176, 49)
Data size after clean up :  (1176, 49)


In [25]:
X_train, X_test, y_train, y_test = split_train_test_data(patient_data, 0.3, True)

In [26]:
#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = gnb.predict(X_test)
y_pred_train = gnb.predict(X_train)
print("Training accuracy", metrics.accuracy_score(y_train, y_pred_train))

Training accuracy 0.8554070473876063


More work needs to be done to understand how to better use this model and what feature engineering might assist with improving the test accuracy.

In [27]:
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

Test Accuracy: 0.8470254957507082


In [39]:
from sklearn.cluster import KMeans
def model_cluster(df, num_clusters):
    """
    Given

    X_train (Dataframe) :        data table input training data for the model
    X_test (Dataframe):          data table input testing data for the model

    Returns:

    """
    # Convert DataFrame to matrix
    mat = df.values
    # Using sklearn
    km = KMeans(n_clusters = num_clusters)
    km.fit(mat)
    # Get cluster assignment labels
    labels = km.labels_
    # Format results as a DataFrame
    df['cluster'] = labels
    
    return df

def optimize_cluster(df):
    """
    Given

    X_train (Dataframe) :        data table input training data for the model
    X_test (Dataframe):          data table input testing data for the model

    Returns:

    """
    X_train, X_test, y_train, y_test = split_train_test_data(df, 0.3, True)
    optimize_random_forest(X_train, X_test, y_train, y_test)
    optimize_gradient_booster(X_train, X_test, y_train, y_test)
    
    return 
cluster_patient_data = model_cluster(patient_data, 2)

In [40]:
first_cluster = cluster_patient_data.loc[lambda cluster_patient_data: cluster_patient_data['cluster'] == 0, :]
second_cluster = cluster_patient_data.loc[lambda cluster_patient_data: cluster_patient_data['cluster'] == 1, :]
#third_cluster = cluster_patient_data.loc[lambda cluster_patient_data: cluster_patient_data['cluster'] == 2, :]

clusters = [first_cluster, second_cluster] #, third_cluster]

In [41]:
for cluster in clusters:
  X_train, X_test, y_train, y_test = split_train_test_data(cluster, 0.3, True)
  
  gnb = GaussianNB()
  gnb.fit(X_train, y_train)
  y_pred = gnb.predict(X_test)
  y_pred_train = gnb.predict(X_train)
  print("Training accuracy", metrics.accuracy_score(y_train, y_pred_train))
  print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

Training accuracy 0.8770614692653673
Test Accuracy: 0.8566433566433567
Training accuracy 0.8782051282051282
Test Accuracy: 0.7910447761194029


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
