# Logisitic Regression, Linear SVC, Gaussian Naive Bayes

### Reading in the Data

In [20]:
import pipeline as pl
import imp

import pandas as pd
import numpy as np
import random
import time

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
imp.reload(pl)
claims = pl.read_all()

In [22]:
claims.head()

Unnamed: 0.1,Unnamed: 0,reimb2010,reimb2008,reimb2009,age2010,male,race,heart.failure,kidney,cancer,...,OfficeVisit,EyeExam,EKG,xray,CTScan,PhysicalTherapy,Ambulance,acuity,costTrend,monthsWithClaims
0,0,390,320.0,360,97,0,White,0,0,0,...,4,1,0,1,0,0,0,0.235294,-0.018856,9
1,1,970,58800.0,2740,79,0,White,1,1,0,...,12,0,2,8,1,0,1,0.853591,-0.027265,21
2,2,5630,510.0,1580,87,0,White,1,0,0,...,10,0,1,2,1,1,0,0.175115,0.496742,20
3,3,3480,2930.0,49330,79,0,White,1,1,0,...,14,1,1,6,4,3,3,0.709147,0.214955,19
4,4,920,1500.0,1650,85,1,White,1,0,1,...,16,2,1,3,0,1,0,0.174603,0.231568,21


### Exploring Data

In [23]:
pl.stats(claims)

Shape of the dataframe is (1023948, 29)


Unnamed: 0.1,Unnamed: 0,reimb2010,reimb2008,reimb2009,age2010,male,heart.failure,kidney,cancer,copd,...,OfficeVisit,EyeExam,EKG,xray,CTScan,PhysicalTherapy,Ambulance,acuity,costTrend,monthsWithClaims
count,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,...,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0,1023948.0
mean,511975.5,2843.425,4648.829,4919.452,73.18483,0.4325093,0.5044504,0.3157953,0.1277887,0.2612935,...,9.863529,1.130978,1.410392,2.454312,0.9224609,3.446885,1.223081,0.3751325,0.08071224,14.4909
std,295589.9,5959.795,10430.17,8954.888,12.37868,0.4954243,0.4999804,0.4648321,0.3338546,0.4393397,...,8.518056,1.650594,1.859028,3.023727,1.449471,5.442233,3.178825,0.2586138,0.207368,8.0703
min,0.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04166667,-0.7627773,0.0
25%,255987.8,240.0,280.0,570.0,68.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1841004,-0.04075997,9.0
50%,511975.5,1120.0,1350.0,2020.0,74.0,0.0,1.0,0.0,0.0,0.0,...,9.0,0.0,1.0,1.0,0.0,1.0,0.0,0.3049738,0.07135935,17.0
75%,767963.2,2620.0,3800.0,4970.0,81.0,1.0,1.0,1.0,0.0,1.0,...,15.0,2.0,2.0,4.0,1.0,5.0,1.0,0.5272727,0.2390716,21.0
max,1023951.0,160500.0,230770.0,187070.0,100.0,1.0,1.0,1.0,1.0,1.0,...,61.0,23.0,28.0,41.0,21.0,133.0,126.0,1.0,0.8179267,24.0


In [24]:
claims.columns

Index(['Unnamed: 0', 'reimb2010', 'reimb2008', 'reimb2009', 'age2010', 'male',
       'race', 'heart.failure', 'kidney', 'cancer', 'copd', 'depression',
       'diabetes', 'ihd', 'osteoporosis', 'arthritis', 'stroke',
       'InpatientClaims', 'OutpatientClaims', 'OfficeVisit', 'EyeExam', 'EKG',
       'xray', 'CTScan', 'PhysicalTherapy', 'Ambulance', 'acuity', 'costTrend',
       'monthsWithClaims'],
      dtype='object')

### Pre-Processing Data

We will be trying OfficeVisit, costTrend, InpatientClaims, OutpatientClaims, age2010, reimb2009, reimb2008, heart.failure, and kidney as features to predict the outcome variable, which is reimb2010.

In [25]:
df = claims[['reimb2010', 'OfficeVisit', 'costTrend', 'InpatientClaims', 'OutpatientClaims', 'age2010', 
            'reimb2009', 'reimb2008', 'heart.failure', 'kidney']]

In [26]:
#fill missing columns
pl.fill_all_missing(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [31]:
train, test = pl.train_test(df, 0.2)

In [32]:
#normalize feature columns
train_df, test_df = pl.normalize(train, test, ['OfficeVisit', 'costTrend', 'InpatientClaims', 'OutpatientClaims', 'age2010', 
            'reimb2009', 'reimb2008', 'heart.failure', 'kidney'])

In [41]:
# Config: Dictionaries of models and hyperparameters - taken from class assignment
# MODELS = {
#     'LogisticRegression': LogisticRegression(), 
#     'LinearSVC': LinearSVC(), 
#     'GaussianNB': GaussianNB()
# }

# GRID = {
#     'LogisticRegression': [{'penalty': x, 'C': y, 'random_state': 0} 
#                            for x in ('l2', 'none') \
#                            for y in (0.01, 0.1, 1, 10, 100)],
#     'GaussianNB': [{'priors': None}],
#     'LinearSVC': [{'C': x, 'random_state': 0} \
#                   for x in (0.01, 0.1, 1, 10, 100)]
# }
MODELS = {
    'LogisticRegression': LogisticRegression(), 
    'LinearSVC': LinearSVC(), 
    'GaussianNB': GaussianNB()
}

GRID = {
    'LogisticRegression': [{'penalty': 'l2', 'C': 0.1, 'random_state': 0}],
    'GaussianNB': [{'priors': None}],
    'LinearSVC': [{'C': 1, 'random_state': 0}]
}

In [None]:
results = pl.grid_search(train_df, test_df, MODELS, GRID, 'reimb2010') #kills the kernel running this

Training model: LogisticRegression | {'penalty': 'l2', 'C': 0.1, 'random_state': 0}
