In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV,Ridge, Lasso,LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [7]:
new_df= pd.read_csv("../data/new_df")

In [8]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,patient_id,age,num_visits,avg_time_between_visits,avg_a1c,avg_control_level,num_obs
0,0,72,56,10,99.777778,7.833333,1,6
1,1,254,64,54,20.584906,8.842857,0,7
2,2,255,64,8,146.857143,10.233333,0,2
3,3,304,70,5,54.5,7.0,1,1
4,4,384,66,9,105.25,8.5,0,4


In [9]:
new_df.drop(columns=["Unnamed: 0"],inplace=True)

In [10]:
new_df.dtypes

patient_id                   int64
age                          int64
num_visits                   int64
avg_time_between_visits    float64
avg_a1c                    float64
avg_control_level            int64
num_obs                      int64
dtype: object

In [11]:
new_df["avg_control_level"].value_counts(normalize=True)
#baseline score 

0    0.606005
1    0.393995
Name: avg_control_level, dtype: float64

In [12]:
new_df.isnull().sum()

patient_id                  0
age                         0
num_visits                  0
avg_time_between_visits    31
avg_a1c                     0
avg_control_level           0
num_obs                     0
dtype: int64

In [13]:
new_df.dropna(inplace=True)

In [14]:
new_df.isnull().sum()

patient_id                 0
age                        0
num_visits                 0
avg_time_between_visits    0
avg_a1c                    0
avg_control_level          0
num_obs                    0
dtype: int64

## Linear Regression

In [15]:
features=['avg_time_between_visits',"age","num_visits","num_obs"]
X = new_df[features] # X = what we use to predict y
y = new_df['avg_a1c'] # y = what we want to predict

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
lr = LinearRegression()
lr.fit(X_train, y_train)

print(f'Score on the training set: {lr.score(X_train, y_train)}')
print(f'Score on the test set: {lr.score(X_test, y_test)}')

Score on the training set: 0.07655737557026987
Score on the test set: 0.08884825793752982


## Logistic Regression

In [17]:
features=['avg_time_between_visits',"age","num_visits","num_obs"]
X = new_df[features]
y = new_df['avg_control_level']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=39)

In [18]:
logreg = LogisticRegression()

In [19]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
print(f'Score on the training set: {logreg.score(X_train, y_train)}')
print(f'Score on the test set: {logreg.score(X_test, y_test)}')

Score on the training set: 0.6754057428214731
Score on the test set: 0.6367041198501873


In [21]:
print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [-1.38070144]
Logistic Regression Coefficient: [[ 0.00251095  0.02228041  0.04501979 -0.36030959]]


In [22]:
np.exp(logreg.coef_)

array([[1.00251411, 1.02253047, 1.04604856, 0.69746037]])

- As average time between visits increases by 1, someone is about 1.002 times as likely to be controlled.
- As age increases by 1, someone is about 1.002 times as likely to be controlled.
- As num visits increases by 1, someone is about 1.04 times as likely to be controlled.
- As person has more bw obs done, they are less likely to be in the positive class/ controlled (.36 times as likely to be negative class) 

** making inferences 

## Decision Tree Classifier

In [23]:
grid = GridSearchCV(estimator = DecisionTreeClassifier(),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [24]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:    2.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7],
                  

In [25]:
grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [26]:
grid.best_score_

0.6229712858926342

In [27]:
dt = grid.best_estimator_

# Fit model.
dt.fit(X_train, y_train)

# Evaluate model.
print(f'Score on training set: {dt.score(X_train, y_train)}')
print(f'Score on testing set: {dt.score(X_test, y_test)}')

Score on training set: 0.6666666666666666
Score on testing set: 0.5692883895131086


In [28]:
preds = dt.predict(X_test)

In [29]:
tn, fp, fn, tp = confusion_matrix(y_test,
                                  preds).ravel()

print(confusion_matrix(y_test,
                       preds))

[[110  42]
 [ 73  42]]


In [30]:
# Calculate sensitivity.

sens = tp / (tp + fn)

print(f'Sensitivity: {round(sens, 4)}')
# tell it to use specificity insteead of accuracy 

Sensitivity: 0.3652


In [31]:
# Calculate specificity.

spec = tn / (tn + fp)

print(f'Specificity: {round(spec, 4)}')

Specificity: 0.7237


In [None]:
#extra trees 
#try random forest again
#look for how to optimize for sensitivty 