In [1]:
import os
import sys

# Make the project's ``src`` directory importable so the local modules
# imported further down this cell (etl, visualize_data) can be found.
# The entry is relative, so it is resolved against the kernel's working
# directory. The membership guard keeps the cell idempotent: re-running it
# does not append the same entry again.
if '../src' not in sys.path:
    sys.path.append('../src')

# Importing libraries
import pandas as pd
import numpy as np

# Libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Libraries for plotting curves
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from itertools import cycle

# Importing script
import etl as etl
import visualize_data as visualize_data

import warnings
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so it would also hide any warnings
# raised while fitting the models below (e.g. convergence) — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Load the simulated dataset from disk and preview its first rows.
# NOTE(review): the CSV was presumably produced by the etl.simulate_data call
# kept (commented out) below — confirm before relying on it.
# simulated_gwas_fp = '../testdata/gwas/gwas_simulate.csv'
# etl.simulate_data('.', simulated_gwas_fp, 5000)
simulated_data = pd.read_csv('../data/out/simulated_data.csv')
simulated_data.head()

In [None]:
# Restrict the simulated data to the variants listed in the model GWAS file,
# plus the 'Class' target column.
model_gwas_fp = '../testdata/gwas/gwas_model.csv'
model_data = pd.read_csv(model_gwas_fp)

# Variants present both as columns of simulated_data and as variant_id values
subset = set(simulated_data.columns).intersection(model_data['variant_id'].unique())

# Keep the columns in simulated_data's own order. Iterating a set of strings
# gives an order that can change between kernel sessions (string hashing is
# randomised per process), which would shuffle the feature columns from run
# to run and make the downstream fits non-reproducible.
new_columns = [col for col in simulated_data.columns if col in subset] + ['Class']

data = simulated_data[new_columns]

In [None]:
# Separate the target column from the feature columns, then hold out 25% of
# the rows for evaluation (fixed seed so the split is repeatable).
y = data['Class']
X = data.drop(columns=['Class'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

In [None]:
# get proportion of each class
# normalize=True makes value_counts return the fraction of rows per class
# instead of raw counts; the bare name on the last line displays the result.
prop_per_class = y.value_counts(normalize=True)
prop_per_class

## Logistic Regression - Curves

In [None]:
# Logistic regression with the best parameters; fit() returns the estimator
# itself, so construction and training are chained into one statement.
lr_best = LogisticRegression(C=10, tol=0.0001).fit(X_train, y_train)

# Multiclass precision-recall curves on the held-out split
lr_pr = visualize_data.plot_precision_recall(
    'Logistic Regression',
    lr_best,
    X_test,
    y_test,
    n_classes=3,
    figsize=(16, 10),
)

In [None]:
# Exploratory check: show the type of the object returned by
# visualize_data.plot_precision_recall.
type(lr_pr)

In [None]:
# Logistic regression with the best parameters; fit() returns the estimator
# itself, so construction and training are chained into one statement.
lr_best = LogisticRegression(C=10, tol=0.0001).fit(X_train, y_train)

# Multiclass ROC curves on the held-out split
lr_roc = visualize_data.plot_multiclass_roc(
    'Logistic Regression',
    lr_best,
    X_test,
    y_test,
    n_classes=3,
    figsize=(16, 10),
)

## K Nearest Neighbors - Curves

In [None]:
# fit a model with the best parameters
knn_best = KNeighborsClassifier(n_neighbors=3, p=3)
knn_best.fit(X_train, y_train)

# plot multiclass P-R curve
knn_pr = visualize_data.plot_precision_recall(
    'K Nearest Neighbors', knn_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

In [None]:
# fit a model with the best parameters
knn_best = KNeighborsClassifier(n_neighbors=3, p=3)
knn_best.fit(X_train, y_train)

# plot multiclass ROC curve
knn_roc = visualize_data.plot_multiclass_roc(
    'K Nearest Neighbors', knn_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

## SVM - Curves

In [None]:
# fit a model with the best parameters
svc_best = SVC(C=10, tol=0.1)
svc_best.fit(X_train, y_train)

# plot multiclass P-R curve
svc_pr = visualize_data.plot_precision_recall(
    'Support Vector Machine', svc_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

In [None]:
# fit a model with the best parameters
svc_best = SVC(C=10, tol=0.1)
svc_best.fit(X_train, y_train)

# plot multiclass ROC curve
svc_roc = visualize_data.plot_multiclass_roc(
    'Support Vector Machine', svc_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

## Gaussian NB - Curves

In [None]:
# fit a model with the best parameters
gnb_best = GaussianNB(priors=[0.333, 0.333, 0.334], var_smoothing=0.1)
gnb_best.fit(X_train, y_train)

# plot multiclass P-R curve
gnb_pr = visualize_data.plot_precision_recall(
    'Naive Bayes', gnb_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

In [None]:
# fit a model with the best parameters
gnb_best = GaussianNB(priors=[0.333, 0.333, 0.334], var_smoothing=0.1)
gnb_best.fit(X_train, y_train)

# plot multiclass ROC curve
gnb_roc = visualize_data.plot_multiclass_roc(
    'Naive Bayes', gnb_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

## Random Forest - Curves

In [None]:
# fit a model with the best parameters
rf_best = RandomForestClassifier(class_weight={0: 0.55, 1: 0.3, 2: 0.15}, 
                                 n_estimators=200)
rf_best.fit(X_train, y_train)

# plot multiclass P-R curve
rf_pr = visualize_data.plot_precision_recall(
    'Random Forest', rf_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

In [None]:
# fit a model with the best parameters
rf_best = RandomForestClassifier(class_weight={0: 0.55, 1: 0.3, 2: 0.15}, 
                                 n_estimators=200)
rf_best.fit(X_train, y_train)

# plot multiclass ROC curve
rf_roc = visualize_data.plot_multiclass_roc(
    'Random Forest', rf_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

## Decision Tree - Curves

In [None]:
# fit a model with the best parameters
dt_best = DecisionTreeClassifier()
dt_best.fit(X_train, y_train)

# plot multiclass P-R curve
dt_pr = visualize_data.plot_precision_recall(
    'Decision Tree', dt_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

In [None]:
# fit a model with the best parameters
dt_best = DecisionTreeClassifier()
dt_best.fit(X_train, y_train)

# plot multiclass ROC curve
dt_roc = visualize_data.plot_multiclass_roc(
    'Decision Tree', dt_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

## Change working directory to the project directory (instead of the notebooks directory)

In [2]:
# Display the module search path before the working directory is changed.
sys.path

['C:\\Users\\micha\\ucsd\\PredictingDisease\\notebooks',
 'C:\\Users\\micha\\Anaconda3\\python37.zip',
 'C:\\Users\\micha\\Anaconda3\\DLLs',
 'C:\\Users\\micha\\Anaconda3\\lib',
 'C:\\Users\\micha\\Anaconda3',
 '',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages\\win32',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages\\win32\\lib',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages\\Pythonwin',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\micha\\.ipython']

In [3]:
# Show the current working directory. os.getcwd() is used instead of the
# shell command `pwd`, which does not exist in the Windows command shell
# this notebook was run from.
os.getcwd()


In [4]:
# Move from the notebooks directory up to the project directory.
# Guarded so the cell is idempotent: once the working directory already
# contains run.py (the project entry point invoked at the end of this
# notebook) the cell does nothing, instead of climbing one more level on
# every re-run.
# Earlier cells use paths relative to the notebooks directory
# (e.g. '../data/out/simulated_data.csv'), so they must run before this one.
if not os.path.isfile('run.py'):
    os.chdir('../')

In [5]:
# Confirm the working directory after the change. os.getcwd() is used
# instead of the shell command `pwd`, which does not exist in the Windows
# command shell this notebook was run from.
os.getcwd()


In [6]:
# Display the module search path again. os.chdir only changes the working
# directory, so the list is the same as before the change.
sys.path

['C:\\Users\\micha\\ucsd\\PredictingDisease\\notebooks',
 'C:\\Users\\micha\\Anaconda3\\python37.zip',
 'C:\\Users\\micha\\Anaconda3\\DLLs',
 'C:\\Users\\micha\\Anaconda3\\lib',
 'C:\\Users\\micha\\Anaconda3',
 '',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages\\win32',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages\\win32\\lib',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages\\Pythonwin',
 'C:\\Users\\micha\\Anaconda3\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\micha\\.ipython']

## Import visualize_data.py to test code

In [7]:
# Import visualize_data through the src package. This rebinds the name that
# the first cell bound with `import visualize_data as visualize_data`.
from src import visualize_data

In [8]:
# test function from visualize_data.py
fp = './data/out/simulated_data.csv'
visualize_data.plot_polygenic_risk_scores(fp)

FileNotFoundError: File b'./data/out/simulated_data.csv' does not exist

In [None]:
# test function from visualize_data.py
fp = './data/out/simulated_data.csv'
visualize_data.plot_risk_across_classes(fp)

## Command line to run project

In [None]:
# command line to run project
# run.py is given as a relative path, so the kernel's working directory must
# be the directory that contains it when this cell runs.
# NOTE(review): `python` is resolved by the shell, which may not be the same
# interpreter/environment as the notebook kernel — confirm.
!python run.py test