In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoLars
data = pd.read_csv('data/train_large.csv')

In [2]:
data.columns

Index(['Unnamed: 0', 'Health Service Area', 'Hospital County',
       'Operating Certificate Number', 'Facility Id', 'Facility Name',
       'Age Group', 'Zip Code - 3 digits', 'Gender', 'Race', 'Ethnicity',
       'Length of Stay', 'Type of Admission', 'Patient Disposition',
       'CCS Diagnosis Code', 'CCS Diagnosis Description', 'CCS Procedure Code',
       'CCS Procedure Description', 'APR DRG Code', 'APR DRG Description',
       'APR MDC Code', 'APR MDC Description', 'APR Severity of Illness Code',
       'APR Severity of Illness Description', 'APR Risk of Mortality',
       'APR Medical Surgical Description', 'Payment Typology 1',
       'Payment Typology 2', 'Payment Typology 3', 'Birth Weight',
       'Emergency Department Indicator', 'Total Costs'],
      dtype='object')

In [None]:
to_drop = [  
        "Facility Id",
        "CCS Procedure Code",
        "CCS Diagnosis Code",
        "APR DRG Code",
        "APR MDC Code",
        "APR Severity of Illness Code",
        "Unnamed: 0"
]

In [None]:
data.drop(to_drop,axis=1,inplace=True)
X = data.drop(["Total Costs"],axis=1)
y = data['Total Costs']

In [None]:
poly = PolynomialFeatures(degree=2,include_bias=False)
X_poly = poly.fit_transform(X)

In [None]:
sampling_set = np.random.choice(X.shape[0],size=int(X.shape[0]*0.3),replace=False)
lars_X = X_poly[sampling_set]
lars_y = y[sampling_set]
model = LassoLars(alpha=0.1).fit(lars_X,lars_y)

In [None]:
active_X = X_poly[:,model.active_]

In [3]:
from scipy.stats import chi2_contingency
def cramers_V(var1,var2) :
  crosstab =np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building
  stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test
  obs = np.sum(crosstab) # Number of observations
  mini = min(crosstab.shape)-1 # Take the minimum value between the columns and the rows of the cross table
  return (stat/(obs*mini))

In [4]:
to_comp = ['Health Service Area', 'Hospital County','Facility Id', 'Facility Name',
       'Age Group', 'Zip Code - 3 digits', 'Gender', 'Race', 'Ethnicity','Type of Admission', 'Patient Disposition',
       'CCS Diagnosis Code', 'CCS Diagnosis Description', 'CCS Procedure Code',
       'CCS Procedure Description', 'APR DRG Code', 'APR DRG Description',
       'APR MDC Code', 'APR MDC Description', 'APR Severity of Illness Code',
       'APR Severity of Illness Description', 'APR Risk of Mortality',
       'APR Medical Surgical Description', 'Payment Typology 1',
       'Payment Typology 2', 'Payment Typology 3',
       'Emergency Department Indicator']
data_encoded = data.get(to_comp)
rows= []
for var1 in data_encoded:
  col = []
  for var2 in data_encoded :
    cramers =cramers_V(data_encoded[var1], data_encoded[var2]) # Cramer's V test
    col.append(cramers) # Keeping of the rounded value of the Cramer's V  
  rows.append(col)
cramers_results = np.array(rows)
df = pd.DataFrame(cramers_results, columns = data_encoded.columns, index =data_encoded.columns)
df

Unnamed: 0,Health Service Area,Hospital County,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,Type of Admission,...,APR MDC Code,APR MDC Description,APR Severity of Illness Code,APR Severity of Illness Description,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Emergency Department Indicator
Health Service Area,1.0,1.0,1.0,1.0,0.003624,0.811583,6.2e-05,0.060777,0.021173,0.003078,...,0.002875,0.002875,0.001004,0.001004,0.001965,0.002636,0.010555,0.019451,0.012117,0.004783
Hospital County,1.0,1.0,1.0,1.0,0.006264,0.324564,0.000451,0.105596,0.05113,0.015728,...,0.003031,0.003031,0.003008,0.003008,0.00323,0.024265,0.026145,0.041796,0.033962,0.04443
Facility Id,1.0,1.0,1.0,1.0,0.044879,0.387115,0.006372,0.230153,0.163295,0.068559,...,0.033214,0.033214,0.019723,0.019723,0.022688,0.067803,0.076152,0.148217,0.166058,0.149258
Facility Name,1.0,1.0,1.0,1.0,0.044922,0.387116,0.006387,0.230155,0.163296,0.068625,...,0.033233,0.033233,0.019726,0.019726,0.022699,0.068199,0.076221,0.148226,0.16606,0.149331
Age Group,0.003624,0.006264,0.044879,0.044922,1.0,0.005034,0.01463,0.014745,0.006575,0.163212,...,0.262584,0.262584,0.060162,0.060162,0.116747,0.041737,0.141611,0.079753,0.017229,0.123692
Zip Code - 3 digits,0.811583,0.324564,0.387115,0.387116,0.005034,1.0,0.000682,0.105736,0.042755,0.008003,...,0.001604,0.001604,0.001974,0.001974,0.00232,0.011194,0.019477,0.02497,0.019536,0.016722
Gender,6.2e-05,0.000451,0.006372,0.006387,0.01463,0.000682,1.0,9.5e-05,3.6e-05,0.007295,...,0.061826,0.061826,0.002247,0.002247,0.003555,0.001083,0.002385,0.000789,0.000398,0.00445
Race,0.060777,0.105596,0.230153,0.230155,0.014745,0.105736,9.5e-05,1.0,0.063863,0.006529,...,0.011239,0.011239,0.002438,0.002438,0.003952,0.004539,0.031317,0.027001,0.011208,0.005195
Ethnicity,0.021173,0.05113,0.163295,0.163296,0.006575,0.042755,3.6e-05,0.063863,1.0,0.004394,...,0.005559,0.005559,0.001417,0.001417,0.002245,0.001726,0.012986,0.008692,0.003564,0.002278
Type of Admission,0.003078,0.015728,0.068559,0.068625,0.163212,0.008003,0.007295,0.006529,0.004394,1.0,...,0.255451,0.255451,0.046788,0.046788,0.044005,0.220751,0.02565,0.011341,0.003775,0.713928


In [None]:
X = data.drop(['Total Costs'],axis=1)
y = data['Total Costs']
from sklearn.linear_model import LassoLars
model = LassoLars(alpha=0.01,eps=1e-10).fit(X,y)

In [None]:
cos = [
       'APR DRG Code', 'APR MDC Code', 'APR Medical Surgical Description','APR Risk of Mortality', 'APR Severity of Illness Code', 
       'CCS Diagnosis Code', 'CCS Procedure Code',

       'Emergency Department Indicator','Health Service Area','Facility Name','Type of Admission',
       
       'Age Group','Ethnicity','Gender','Race', 'Patient Disposition',
       'Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3',
       'Zip Code - 3 digits','Hospital County'
]
for col in cos:
    fig = plt.figure()
    fig.figsize = (5,5)
    ax = fig.add_subplot(1,1,1)
    plt.tight_layout()
    ax.set_title(col)
    data.plot(kind="scatter", x=col, y="Total Costs",ax = ax)
    

In [None]:
for col in cols:
    print(data[col].value_counts())

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
X = data.drop(["Total Costs"],axis=1)
y = data["Total Costs"]
X_cont = X.select_dtypes(include=["float64"])
X_dis = X.select_dtypes(exclude=["float64"])
X_dis_new = SelectKBest(mutual_info_regression,k=20).fit_transform(X_dis,y)
X_dis_new.shape


In [None]:
test_x_new = best_model.transform(test_x)
pred = model.predict(test_x)

from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(test_y, pred)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3,interaction_only=True,include_bias=False)
xtrain_apr = data.get(['APR DRG Code', 'APR MDC Code','APR Severity of Illness Code', 'APR Risk of Mortality','APR Medical Surgical Description'])
xtrain_payment = data.get(['Payment Typology 1','Payment Typology 2', 'Payment Typology 3'])
xtrain_ccs = data.get(['CCS Diagnosis Code','CCS Procedure Code'])