In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from pprint import pprint
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix

Mounted at /content/drive


In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/DS5500 PROJECT/processed_train.csv', low_memory=False)
test_df = pd.read_csv('/content/drive/MyDrive/DS5500 PROJECT/processed_test.csv', low_memory=False)
pprint(f'Train dataset shape: {train_df.shape}')
pprint(f'Test dataset shape: {test_df.shape}')

'Train dataset shape: (1025602, 221)'
'Test dataset shape: (134529, 221)'


In [5]:
columns_remove = ["Have you noticed a diffuse (widespread) redness in one or both eyes?",
         "Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?",
         "Have you had any vaginal discharge?"]


In [6]:
train_df = train_df.drop(columns=columns_remove)
test_df = test_df.drop(columns=columns_remove)

In [7]:
train_df = train_df.dropna()
test_df = test_df.dropna()
pprint(f'Train dataset shape: {train_df.shape}')
pprint(f'Test dataset shape: {test_df.shape}')

'Train dataset shape: (982224, 218)'
'Test dataset shape: (128726, 218)'


In [8]:
# Prepare training and test variables
X_columns = [col for col in train_df.columns if col not in ['Unnamed: 0', 'PATHOLOGY']]
X_train = train_df[X_columns]
y_train = train_df['PATHOLOGY']
X_test = test_df[X_columns]
y_test = test_df['PATHOLOGY']
pprint(f'X_train shape: {X_train.shape}')
pprint(f'y_train shape: {y_train.shape}')
pprint(f'X_test shape: {X_test.shape}')
pprint(f'y_test shape: {y_test.shape}')

'X_train shape: (982224, 216)'
'y_train shape: (982224,)'
'X_test shape: (128726, 216)'
'y_test shape: (128726,)'


In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
categorical = X_train.select_dtypes("object").columns

In [11]:
categorical

Index(['SEX', 'Characterize your pain:', 'Do you feel pain somewhere?',
       'Does the pain radiate to another location?',
       'Have you traveled out of the country in the last 4 weeks?',
       'What color is the rash?', 'Do your lesions peel off?',
       'Where is the affected region located?',
       'Is the lesion (or are the lesions) larger than 1cm?',
       'Where is the swelling located?'],
      dtype='object')

In [12]:
categorical = [col for col in X_train.columns if X_train[col].dtype == 'O']

In [13]:
len(categorical)

10

In [14]:
label_encoder = LabelEncoder()

In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
for column in categorical:
    X_train[column] = label_encoder.fit_transform(X_train[column])

In [17]:
for column in categorical:
    X_test[column] = label_encoder.fit_transform(X_test[column])

In [18]:
len(X_columns)

216

### Logistic regression model

In [19]:
logistic_regression = LogisticRegression()

#fit model on training data

logistic_regression.fit(X_train,y_train )

#predictions on testing data

y_pred = logistic_regression.predict(X_test)

#evaluate model

accuracy = accuracy_score(y_test, y_pred)
print(f'The accuracy score is {accuracy * 100}%')

The accuracy score is 94.82388950173237%


In [20]:
from sklearn.preprocessing import LabelBinarizer

In [21]:
lb = LabelBinarizer()

In [22]:
y_pred_encoded = lb.fit_transform(y_pred)
y_test_encoded = lb.fit_transform(y_test)

In [23]:
y_test_encoded

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
import sklearn.metrics as metrics

In [26]:
#calculate AUC of model
auc = metrics.roc_auc_score(y_test_encoded, y_pred_encoded)

#print AUC score
print(auc)


0.6175314292504236


In [27]:
print("Classification Report")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Classification Report
                                          precision    recall  f1-score   support

     Acute COPD exacerbation / infection       1.00      1.00      1.00      2153
                Acute dystonic reactions       1.00      1.00      1.00      3302
                        Acute laryngitis       0.97      0.70      0.81      3214
                      Acute otitis media       0.95      0.99      0.97      3509
                   Acute pulmonary edema       0.92      0.97      0.95      2598
                    Acute rhinosinusitis       0.99      0.39      0.56      1829
                      Allergic sinusitis       1.00      1.00      1.00      2411
                             Anaphylaxis       1.00      0.98      0.99      3798
                                  Anemia       1.00      0.99      0.99      6770
                     Atrial fibrillation       1.00      1.00      1.00      2831
                               Boerhaave       0.98      0.94      0.96    

In [31]:
from sklearn.model_selection import cross_val_score

In [29]:
X = X_train
y = y_train

In [32]:
# Took 17n minutes
k = 5
scores = cross_val_score(logistic_regression, X, y, cv=k, scoring='accuracy')

In [34]:
print(f"Mean Accuracy: {np.mean(scores)}")

Mean Accuracy: 0.9501977148093573
