In [1]:
import pandas as pd
import os

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Apply seaborn's styling and make (11, 4) the default figure size for later plots.
sns.set(rc={'figure.figsize':(11, 4)})

## Reading the Data

In [39]:
def read_data(data_dir="Data", n_files=5):
    """Load the numbered claims CSV files and stack them into one DataFrame.

    Reads ``claims1.csv`` through ``claims{n_files}.csv`` from ``data_dir``
    and concatenates them in that order with a single ``pd.concat`` call.
    ``DataFrame.append`` (used previously) was removed in pandas 2.0 and
    re-copied the accumulated frame on every iteration.

    Parameters
    ----------
    data_dir : str, default "Data"
        Directory that contains the ``claims{i}.csv`` files.
    n_files : int, default 5
        Number of consecutive files to read, starting at ``claims1.csv``.

    Returns
    -------
    pandas.DataFrame
        All rows of all files. Each file keeps its own row labels (no
        ``ignore_index``), exactly as the earlier ``append`` loop did, so
        index labels repeat across files.

    Raises
    ------
    FileNotFoundError
        If one of the expected files is missing.
    ValueError
        If ``n_files`` is smaller than 1 (nothing to concatenate).
    """
    paths = [os.path.join(data_dir, f"claims{i}.csv") for i in range(1, n_files + 1)]
    return pd.concat([pd.read_csv(path) for path in paths])

In [40]:
# Load the claims CSV files and combine them into a single DataFrame.
claims = read_data()

## Bivariate Linear Regressions

In [47]:
# "Unnamed: 0" is presumably the row index that was saved into the CSV files
# (TODO confirm); it is not used as a feature. errors="ignore" keeps this cell
# safe to re-run: without it a second execution raises KeyError because the
# column has already been removed from `claims`.
claims = claims.drop(columns=["Unnamed: 0"], errors="ignore")

In [48]:
claims

Unnamed: 0,reimb2010,reimb2008,reimb2009,age2010,male,race,heart.failure,kidney,cancer,copd,...,OfficeVisit,EyeExam,EKG,xray,CTScan,PhysicalTherapy,Ambulance,acuity,costTrend,monthsWithClaims
0,390,320.0,360,97,0,White,0,0,0,0,...,4,1,0,1,0,0,0,0.235294,-0.018856,9
1,970,58800.0,2740,79,0,White,1,1,0,1,...,12,0,2,8,1,0,1,0.853591,-0.027265,21
2,5630,510.0,1580,87,0,White,1,0,0,0,...,10,0,1,2,1,1,0,0.175115,0.496742,20
3,3480,2930.0,49330,79,0,White,1,1,0,1,...,14,1,1,6,4,3,3,0.709147,0.214955,19
4,920,1500.0,1650,85,1,White,1,0,1,0,...,16,2,1,3,0,1,0,0.174603,0.231568,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204783,6620,2090.0,4070,54,1,Block,1,0,0,1,...,12,3,3,5,0,0,3,0.194805,0.333441,22
204784,490,12450.0,1560,70,1,White,1,1,0,0,...,6,0,0,2,0,0,1,0.722341,-0.129291,15
204785,690,1370.0,2440,75,0,Other,1,1,0,1,...,12,1,2,6,1,2,2,0.207349,0.183677,19
204786,1140,6530.0,9220,78,0,White,0,1,1,0,...,20,0,4,2,2,1,0,0.121905,0.117923,23


In [9]:
import numpy as np
import warnings
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.model_selection import train_test_split

In [13]:
def normalize(df, scaler=None):
    """Scale every column of ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to scale; all columns are passed to the scaler.
    scaler : object with a ``transform`` method, optional
        ``None`` (training-set case): a new ``StandardScaler`` is created and
        fitted on ``df``. Otherwise (test-set case) the given, already fitted
        scaler is applied unchanged, so the test data is scaled with the
        training set's statistics.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The scaled values with the same index and column labels as ``df``,
        and the scaler that was used (newly fitted or the one passed in).
    """
    if scaler is None:
        # Training set: learn the scaling parameters from this data.
        scaler = StandardScaler()
        normalized_features = scaler.fit_transform(df)
    else:
        # Test set: reuse the parameters learned on the training set.
        normalized_features = scaler.transform(df)

    normalized_df = pd.DataFrame(normalized_features)

    # Recover the original row labels and column names.
    normalized_df.index = df.index
    normalized_df.columns = df.columns

    return normalized_df, scaler

In [76]:
# Hold out 20% of the claims rows as a test set; the fixed random_state makes
# the split reproducible across runs.
train, test = train_test_split(claims, test_size=0.20, random_state=505)

In [49]:
claims

Unnamed: 0,reimb2010,reimb2008,reimb2009,age2010,male,race,heart.failure,kidney,cancer,copd,...,OfficeVisit,EyeExam,EKG,xray,CTScan,PhysicalTherapy,Ambulance,acuity,costTrend,monthsWithClaims
0,390,320.0,360,97,0,White,0,0,0,0,...,4,1,0,1,0,0,0,0.235294,-0.018856,9
1,970,58800.0,2740,79,0,White,1,1,0,1,...,12,0,2,8,1,0,1,0.853591,-0.027265,21
2,5630,510.0,1580,87,0,White,1,0,0,0,...,10,0,1,2,1,1,0,0.175115,0.496742,20
3,3480,2930.0,49330,79,0,White,1,1,0,1,...,14,1,1,6,4,3,3,0.709147,0.214955,19
4,920,1500.0,1650,85,1,White,1,0,1,0,...,16,2,1,3,0,1,0,0.174603,0.231568,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204783,6620,2090.0,4070,54,1,Block,1,0,0,1,...,12,3,3,5,0,0,3,0.194805,0.333441,22
204784,490,12450.0,1560,70,1,White,1,1,0,0,...,6,0,0,2,0,0,1,0.722341,-0.129291,15
204785,690,1370.0,2440,75,0,Other,1,1,0,1,...,12,1,2,6,1,2,2,0.207349,0.183677,19
204786,1140,6530.0,9220,78,0,White,0,1,1,0,...,20,0,4,2,2,1,0,0.121905,0.117923,23


In [77]:
# Columns left out of the feature set that gets normalized: the reimbursement
# amounts, the `race` label, and the per-condition / sex indicator columns
# (the "binary variables").
columns_drop = [
    "reimb2010",
    "reimb2008",
    "reimb2009",
    "race",
    "heart.failure",
    "kidney",
    "cancer",
    "copd",
    "depression",
    "diabetes",
    "ihd",
    "osteoporosis",
    "arthritis",
    "stroke",
    "male",
]

In [78]:
# Fit the scaler on the training features only, then apply that same scaler
# to the test features.
df_train, scaler = normalize(df=train.drop(columns=columns_drop))
df_test, _ = normalize(df=test.drop(columns=columns_drop), scaler=scaler)

In [79]:
def LinearRegression(indep_var, dep_var, normalized_train=None,
                     normalized_test=None,
                     nn_train=None, nn_test=None):
    """Fit a one-feature linear regression and predict on the test set.

    Parameters
    ----------
    indep_var : str
        Name of the single feature column, looked up in the normalized frames.
    dep_var : str
        Name of the target column, looked up in the non-normalized frames.
    normalized_train, normalized_test : pandas.DataFrame, optional
        Normalized features. Default to the notebook's current ``df_train``
        and ``df_test``.
    nn_train, nn_test : pandas.DataFrame, optional
        Non-normalized frames holding the target. Default to the notebook's
        current ``train`` and ``test``.

    Returns
    -------
    (test_targets, predictions)
        The test-set target column and the model's predictions for the
        test-set feature values.
    """
    # Resolve the defaults when the function is called, not when it is
    # defined: default argument values are evaluated once at `def` time, so
    # binding df_train/df_test/train/test in the signature would keep using
    # stale frames after the split or normalization cells are re-run.
    if normalized_train is None:
        normalized_train = df_train
    if normalized_test is None:
        normalized_test = df_test
    if nn_train is None:
        nn_train = train
    if nn_test is None:
        nn_test = test

    # scikit-learn expects a 2-D feature matrix, hence the single-column reshape.
    train_features = np.asarray(normalized_train.loc[:, indep_var]).reshape(-1, 1)
    test_features = np.asarray(normalized_test.loc[:, indep_var]).reshape(-1, 1)
    train_targets = nn_train.loc[:, dep_var]
    test_targets = nn_test.loc[:, dep_var]

    # Fit on the training data, then predict for the (normalized) test feature.
    regr = linear_model.LinearRegression()
    regr.fit(train_features, train_targets)
    predictions = regr.predict(test_features)
    return (test_targets, predictions)

In [80]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [98]:
def Regression_Evaluation(indep_var, dep_var):
    """Score the single-feature regression of ``dep_var`` on ``indep_var``.

    Runs ``LinearRegression(indep_var, dep_var)`` and compares the returned
    test targets with the predictions.

    Returns
    -------
    list
        ``[indep_var, R2, MSE, MAE]`` in that order.
    """
    y_true, y_pred = LinearRegression(indep_var, dep_var)
    scorers = (r2_score, mean_squared_error, mean_absolute_error)
    return [indep_var] + [score(y_true, y_pred) for score in scorers]

In [99]:
# Evaluate one bivariate regression of reimb2010 per normalized feature column.
lst_metrics = [
    Regression_Evaluation(feature, "reimb2010")
    for feature in df_train.columns
]

In [100]:
# Tabulate the per-feature scores: one row per feature, one column per metric.
metric = pd.DataFrame(
    data=lst_metrics,
    columns=["Feature", "R2", "MSE", "MAE"],
)

In [101]:
metric

Unnamed: 0,Feature,R2,MSE,MAE
0,age2010,0.00032,37001880.0,3057.700643
1,InpatientClaims,0.01806,36345230.0,2993.615372
2,OutpatientClaims,0.05637,34927250.0,2838.386384
3,OfficeVisit,0.062708,34692670.0,2780.202617
4,EyeExam,0.010974,36607540.0,3013.135827
5,EKG,0.041932,35461650.0,2907.673966
6,xray,0.054552,34994530.0,2855.367959
7,CTScan,0.033818,35762000.0,2934.404686
8,PhysicalTherapy,0.014418,36480060.0,2989.414739
9,Ambulance,0.051912,35092270.0,2914.401276
