In [1]:
import numpy as np
from numpy.linalg.linalg import norm
import scipy as sp
import sys
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoLars
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [None]:
def moore_penrose_pseudoinverse(X,y,lamb):
    """Return the ridge-regularised least-squares weights for ``X w ~= y``.

    Solves ``(X^T X + lamb * I) w = X^T y``. Despite the name this is the
    ridge (Tikhonov) closed form, not a plain pseudoinverse.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix.
    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values.
    lamb : float
        Penalty added to every diagonal entry of ``X^T X``. The penalty is
        applied to all columns, including a constant column if X has one.

    Returns
    -------
    numpy.ndarray
        Weights of shape (n_features,) or (n_features, n_targets).

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X + lamb * I`` is singular.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    gram = np.dot(X.T, X) + lamb * np.identity(X.shape[1])
    # Solve the linear system directly instead of forming the explicit
    # inverse: cheaper and less sensitive to ill-conditioning.
    return np.linalg.solve(gram, np.dot(X.T, y))

In [None]:
# Locations of the training data and of the rows we must predict for.
train_path = "data/train_large.csv"
predict_data ="data/test.csv"
data = pd.read_csv(train_path)
# NOTE: the column filter below is disabled, so `to_drop` is NOT defined by
# this cell and every column except the target is used as a feature.
# to_drop = [  
#     "Facility Id",
#     "CCS Procedure Code",
#     "CCS Diagnosis Code",
#     "APR DRG Code",
#     "APR MDC Code",
#     "APR Severity of Illness Code",
#     "Unnamed: 0"
# ]
# data.drop(to_drop,axis=1,inplace=True)
X = data.drop(['Total Costs'],axis = 1)
y = data['Total Costs']

# Expand to all degree-2 terms (squares and pairwise products), no constant
# column; the intercept is added explicitly when building active_X.
poly = PolynomialFeatures(degree=2,include_bias=False)
X_poly = poly.fit_transform(X)

# Fit LassoLars on a 30% subsample to pick the active polynomial features.
# The sampler is seeded so the selected feature set is the same on every run.
sampling_rng = np.random.RandomState(42)
sampling_set = sampling_rng.choice(X.shape[0],size=int(X.shape[0]*0.3),replace=False)
lars_X = X_poly[sampling_set]
# sampling_set holds row positions, so index positionally rather than by label.
lars_y = y.iloc[sampling_set]
model = LassoLars(alpha=0.1).fit(lars_X,lars_y)

# Design matrix for the ridge fit: intercept column + features LassoLars kept.
active_X = np.c_[np.ones(X_poly.shape[0]),X_poly[:,model.active_]]

In [None]:
#CROSS VALIDATION
# Candidate ridge penalties to evaluate.
lambdas = [0.001,0.003,0.01,0.03,0.1,0.3,1,3,10,30,100,300,1000]
# NOTE(review): test_size=0.7 means only 30% of the rows are used for the
# lambda search; test_x / test_y are not used by the visible cells.
train_x,test_x,train_y,test_y = train_test_split(active_X,y,test_size=0.7,random_state=42)

print("Start cv")
# Assign every CV row to one of 10 folds. Seeded (like the split above) so the
# fold assignment, and therefore the chosen lambda, is reproducible.
fold = np.random.RandomState(42).randint(0,10,size = train_x.shape[0])
# Collected results: one entry per evaluated lambda.
d = {'regularization':[], 'accuracy':[]}

In [None]:
# Display the shapes of the CV feature matrix and target side by side.
train_x.shape,train_y.shape

In [None]:
def cv(r,cv_x,cv_y,folds,fold_ids=None):
    """Mean held-out R^2 of the ridge fit with penalty ``r`` over ``folds`` folds.

    Parameters
    ----------
    r : float
        Regularization strength passed to ``moore_penrose_pseudoinverse``.
    cv_x : array-like of shape (n_samples, n_features)
        Design matrix.
    cv_y : array-like of shape (n_samples,)
        Targets; converted to a NumPy array so masking is positional.
    folds : int
        Number of folds; fold labels ``0 .. folds-1`` are evaluated.
    fold_ids : array-like of shape (n_samples,), optional
        Fold label for every row. When omitted, the module-level ``fold``
        array is used, which keeps existing four-argument calls working.

    Returns
    -------
    float
        Average of the per-fold ``r2_score`` values.

    Raises
    ------
    ValueError
        If the fold labels do not match the number of rows, or a fold leaves
        either the held-out part or the training part empty.
    """
    if fold_ids is None:
        fold_ids = fold
    features = np.asarray(cv_x)
    targets = np.asarray(cv_y)
    fold_ids = np.asarray(fold_ids)
    n_rows = features.shape[0]
    if fold_ids.shape[0] != n_rows or targets.shape[0] != n_rows:
        raise ValueError(
            "cv_x, cv_y and the fold assignment must have the same number of rows"
        )

    acc = np.zeros(folds)
    for i in range(folds):
        held_out = fold_ids == i
        # Fail with a clear message instead of an opaque error further down.
        if not held_out.any() or held_out.all():
            raise ValueError(f"fold {i} has no held-out rows or no training rows")
        w = moore_penrose_pseudoinverse(features[~held_out],targets[~held_out],r)
        acc[i] = r2_score(targets[held_out],np.dot(features[held_out],w))
    return np.mean(acc)
    
# Smoke test: score a single lambda before running the full sweep.
cv(10,train_x,train_y,10)

In [None]:
# Start from empty result lists so re-running this cell does not append
# duplicate entries to a previous sweep.
d = {'regularization':[], 'accuracy':[]}
for r in lambdas:
    # Score first, record afterwards: if cv() raises, both lists stay the
    # same length.
    v = cv(r,train_x,train_y,10)
    d['regularization'].append(r)
    d['accuracy'].append(v)
    print(f"{r} {v}")
print("Done CV")

In [None]:
# Choose the penalty with the highest mean cross-validated score.
# (Despite the "mini" prefix, this picks the maximum of d['accuracy'].)
cv_scores = np.asarray(d['accuracy'])
mini = cv_scores.argmax()
mini_r = d['regularization'][mini]
print(mini_r)


In [None]:
# Refit on every training row with the selected penalty.
w = moore_penrose_pseudoinverse(active_X,y,mini_r)
pred_data = pd.read_csv(predict_data)
# Use exactly the training feature columns, in training order. The training
# cell never defines `to_drop`, so dropping by that name raised NameError;
# selecting by X.columns also fails loudly if a feature is missing.
pred_features = pred_data[X.columns]
# Reuse the fitted expansion; do not refit the transformer on prediction data.
poly_pred = poly.transform(pred_features)
pred_X = np.c_[np.ones(poly_pred.shape[0]),poly_pred[:,model.active_]]
pred_y = np.dot(pred_X,w)
# sys.argv[4] is only present when enough command-line arguments were given;
# in a notebook kernel it is usually absent, so fall back to a local file.
output_path = sys.argv[4] if len(sys.argv) > 4 else "predictions.txt"
np.savetxt(output_path,pred_y)