In [79]:
%matplotlib inline  
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random
import json
import time

In [227]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn import linear_model as lm
from sklearn.utils import shuffle

In [272]:
def model_cv(X_train, y_train, alphas, cv_type, k=5):
    """Evaluate a Lasso regressor for each candidate regularisation strength.

    Parameters
    ----------
    X_train, y_train : training features and targets, forwarded unchanged to
        the cross-validation routine.
    alphas : iterable of Lasso ``alpha`` values to try.
    cv_type : ``"k"`` selects ``k_fold_cv``; any other value selects
        ``leave_one_out``.
    k : number of folds, only used by the k-fold routine.

    Returns
    -------
    (cv_scores, train_scores) : two lists aligned with ``alphas`` holding the
        cross-validation score and the training score for every alpha.
    """
    cv_scores, train_scores = [], []

    for alpha in alphas:
        # A fresh estimator per alpha; the large max_iter gives coordinate
        # descent room to converge for very small alphas.
        estimator = lm.Lasso(alpha=alpha, max_iter=1000000)

        if cv_type != "k":
            cv_score, train_score = leave_one_out(estimator, X_train, y_train)
        else:
            cv_score, train_score = k_fold_cv(estimator, X_train, y_train, k)

        cv_scores.append(cv_score)
        train_scores.append(train_score)

    return cv_scores, train_scores

def leave_one_out(model, X_train, y_train):
    """Leave-one-out cross-validation of ``model``.

    Every sample is held out once: the model is refit on the remaining rows,
    its training score is recorded with ``scorer`` and the held-out sample is
    predicted. The pooled held-out predictions are then turned into a single
    score ``1 - SS_res / SS_tot`` computed against all targets.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X_train : array indexable by row position (``X_train[i]`` is one sample).
    y_train : object exposing ``.values`` (e.g. a pandas Series).

    Returns
    -------
    (cv_score, mean_train_score)
    """
    targets = y_train.values
    train_scores = []
    held_out_predictions = []

    for idx in range(len(targets)):
        # Fit on everything except row `idx`.
        X_fit = np.delete(X_train, [idx], 0)
        y_fit = np.delete(targets, [idx], 0)
        # predict() expects a 2-D array, so the single row becomes shape (1, n_features).
        X_held_out = np.reshape(X_train[idx], (1, -1))

        model.fit(X_fit, y_fit)
        train_scores.append(scorer(model, X_fit, y_fit))
        held_out_predictions.append(model.predict(X_held_out)[0])

    residual_ss = np.sum((held_out_predictions - targets) ** 2)
    total_ss = np.sum((targets.mean() - targets) ** 2)
    cv_score = 1 - residual_ss / total_ss

    return np.mean(cv_score), np.mean(train_scores)

def k_fold_cv(model, X_train, y_train, k):
    """Manual k-fold cross-validation over contiguous blocks of rows.

    The data is cut into ``k`` consecutive folds of ``len(y_train) // k`` rows;
    the last fold additionally absorbs the remainder rows. For each fold the
    model is fit on the other rows and scored with ``scorer`` on both the
    training rows and the held-out rows.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X_train : object exposing ``.copy()`` and positional row slicing.
    y_train : object exposing ``.values`` and ``.copy()`` (e.g. a pandas Series).
    k : number of folds.

    Returns
    -------
    (mean_cv_score, mean_train_score)
    """
    fold_size = len(y_train) // k
    train_scores = []
    cv_scores = []

    # Folds 0 .. k-2: fixed-size blocks, targets handled as a raw array.
    for fold in range(k - 1):
        features = X_train.copy()
        targets = y_train.values
        lo = fold * fold_size
        hi = (fold + 1) * fold_size
        held_out_rows = list(range(lo, hi))

        X_fit = np.delete(features, held_out_rows, 0)
        y_fit = np.delete(targets, held_out_rows, 0)
        X_val = features[lo:hi]
        y_val = targets[lo:hi]

        model.fit(X_fit, y_fit)
        train_scores.append(scorer(model, X_fit, y_fit))
        cv_scores.append(scorer(model, X_val, y_val))

    # Last fold: everything from (k-1)*fold_size onward is held out, so rows
    # left over by the integer division are still evaluated exactly once.
    features = X_train.copy()
    targets = y_train.copy()
    cut = (k - 1) * fold_size

    X_fit, X_val = features[:cut], features[cut:]
    y_fit, y_val = targets[:cut], targets[cut:]

    model.fit(X_fit, y_fit)
    train_scores.append(scorer(model, X_fit, y_fit))
    cv_scores.append(scorer(model, X_val, y_val))

    return np.mean(cv_scores), np.mean(train_scores)

def k_fold_test(model, X_test, y_test, k=2):
    """Score an already-fitted model on ``k`` consecutive chunks of the test set.

    The test rows are cut into ``k`` contiguous chunks of ``len(y_test) // k``
    rows (the final chunk also takes the remainder). ``scorer`` is evaluated on
    each chunk and the chunk scores are averaged. The model is not refit.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test, y_test : test features / targets supporting positional slicing.
    k : number of chunks (default 2).

    Returns
    -------
    Mean of the per-chunk scores.
    """
    chunk = len(y_test) // k

    # Chunks 0 .. k-2 have exactly `chunk` rows each.
    scores = [
        scorer(model, X_test[lo:lo + chunk], y_test[lo:lo + chunk])
        for lo in (fold * chunk for fold in range(k - 1))
    ]

    # Final chunk runs to the end so no row is dropped.
    tail_start = (k - 1) * chunk
    scores.append(scorer(model, X_test[tail_start:], y_test[tail_start:]))

    return np.mean(scores)

In [160]:
def error(model, X, y):
    """Mean squared error of ``model``'s predictions on ``X`` against ``y``.

    Computed as the sum of squared residuals divided by ``len(y)``.
    """
    residuals = model.predict(X) - y
    squared = residuals ** 2
    return np.sum(squared) / len(y)

def scorer(model, X, y):
    """R^2-style score of ``model`` on ``(X, y)``.

    Returns ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    prediction residuals and ``SS_tot`` is the sum of squared deviations of
    ``y`` from its own mean. ``y`` must expose ``.mean()``.
    """
    residual_ss = np.sum((model.predict(X) - y) ** 2)
    total_ss = np.sum((y.mean() - y) ** 2)
    return 1 - residual_ss / total_ss

In [161]:
# Load the feature/target matrix; the first CSV column is used as the index.
# NOTE(review): in the visible notebook this frame is overwritten by a later
# cell (matrix_df_163.csv) before any visible call consumes it -- confirm this load is still needed.
matrix_df=pd.read_csv("../data/matrix/matrix_df_159.csv", index_col=0)

In [163]:
def preprocess(matrix_df, l=["prevalence_c", "prevalence_a"]):
    """Split the raw matrix into a feature frame and the two target columns.

    Parameters
    ----------
    matrix_df : DataFrame containing at least the columns ``"prevalence_c"``
        and ``"prevalence_a"``. It is not modified.
    l : column labels to remove from the feature frame (the default drops
        only the two targets).

    Returns
    -------
    (X, y_a, y_c) : the feature frame with missing values replaced by each
        column's mean, then the ``"prevalence_a"`` and ``"prevalence_c"``
        columns (in that order).
    """
    target_c = matrix_df["prevalence_c"]
    target_a = matrix_df["prevalence_a"]

    features = matrix_df.drop(columns=l)
    # Impute gaps with the per-column mean of the remaining features.
    column_means = features.mean()
    features = features.fillna(column_means)

    return features, target_a, target_c

def split_scale(X, y, test_size, r, cv_type):
    """Split (or shuffle) the data and robust-scale the features.

    Parameters
    ----------
    X, y : features and targets with matching row order.
    test_size : fraction of rows held out as a test set (only used when
        ``cv_type == "k"``).
    r : random state used for the split / shuffle, so a given ``r`` always
        produces the same partition.
    cv_type : ``"k"`` produces a train/test split; any other value keeps all
        rows for training (intended for leave-one-out) and returns empty
        test containers.

    Returns
    -------
    (X_train, X_test, y_train, y_test) : ``X_train`` (and ``X_test`` in the
        ``"k"`` case) are scaled arrays; the scaler is fit on the training
        rows only. In the non-``"k"`` case ``X_test`` and ``y_test`` are
        empty lists.
    """
    scaler = RobustScaler()

    if cv_type == "k":
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=r
        )
        # Fit once on the training rows and reuse the same statistics for the
        # test rows; the training data is not scaled a second time.
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    else:
        # Shuffle X and y together: shuffling them in separate calls would
        # apply two unrelated permutations and break the row/target pairing.
        X_train, y_train = shuffle(X, y, random_state=r)
        X_train = scaler.fit_transform(X_train)
        X_test = []
        y_test = []

    return X_train, X_test, y_train, y_test

In [273]:
def run_lasso(X_train, X_test, y_train, y_test, cv_type, k):
    """Select the Lasso ``alpha`` by cross-validation and fit the final model.

    A fixed grid of alphas is scored with ``model_cv``; the alpha with the
    highest cross-validation score is used to fit a Lasso on the full
    training set.

    Parameters
    ----------
    X_train, y_train : training data.
    X_test, y_test : held-out data, only used when ``cv_type == "k"``.
    cv_type : ``"k"`` for k-fold CV plus a test-set evaluation; any other
        value for leave-one-out CV with no test evaluation.
    k : number of folds for k-fold CV.

    Returns
    -------
    (model, R_t, R_cv, R_test) in that order for every ``cv_type``:
        the fitted Lasso, the per-alpha training scores, the per-alpha CV
        scores, and the test score from ``k_fold_test``. ``R_test`` is
        ``nan`` when there is no test set (non-``"k"`` mode).
    """
    alphas = [0.00001, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 1.0]
    R_cv, R_t = model_cv(X_train, y_train, alphas, cv_type=cv_type, k=k)
    alpha = alphas[np.argmax(R_cv)]

    # Fit in both modes so the caller always receives a usable estimator.
    model = lm.Lasso(alpha=alpha, max_iter=1000000)
    model.fit(X_train, y_train)

    if cv_type == "k":
        R_test = k_fold_test(model, X_test, y_test)
    else:
        # No held-out set in leave-one-out mode; nan keeps downstream numeric
        # aggregation working instead of referencing an undefined name.
        R_test = np.nan

    return model, R_t, R_cv, R_test

def main_lasso(matrix_df, l=["prevalence_c", "prevalence_a", "income", "insurance"], test_size=0.3, flag="c", n=20, cv_type="k", k=5):
    """Repeat the Lasso selection ``n`` times on random splits and average.

    Parameters
    ----------
    matrix_df : raw matrix passed to ``preprocess``.
    l : columns removed from the feature frame.
    test_size : test fraction forwarded to ``split_scale``.
    flag : ``"c"`` selects the ``prevalence_c`` target; any other value
        selects ``prevalence_a``.
    n : number of repetitions.
    cv_type, k : cross-validation mode and fold count forwarded to
        ``split_scale`` / ``run_lasso``.

    Returns
    -------
    (best_alpha, r) : the arithmetic mean of the alpha chosen in each
        repetition and the mean of the per-repetition test scores.
    """
    X, y_a, y_c = preprocess(matrix_df, l)
    y = y_c if flag == "c" else y_a

    runs = []
    for _ in range(n):
        # A new random state (0..99) per repetition gives a different partition.
        seed = np.random.randint(100)
        X_train, X_test, y_train, y_test = split_scale(
            X, y, test_size=test_size, r=seed, cv_type=cv_type
        )
        model, r_train, r_cv, r_test = run_lasso(
            X_train, X_test, y_train, y_test, cv_type, k
        )
        runs.append([model.alpha, r_test])

    # Column 0: selected alpha, column 1: test score.
    runs = np.array(runs)
    mean_alpha = np.mean(runs[:, 0])
    mean_score = np.mean(runs[:, 1])
    return mean_alpha, mean_score

# model selection

In [282]:
def routine_grid(model, parameters, X_train, y_train, X_test, y_test):
    '''
    Tune ``model`` with a 5-fold grid search and evaluate the best estimator.

    Parameters
    ----------
    model : scikit-learn estimator to tune.
    parameters : parameter grid passed to ``GridSearchCV``.
    X_train, y_train : data used for the grid search.
    X_test, y_test : held-out data scored with ``k_fold_test``.

    Returns
    -------
    (model, best_score, test_score) :
        the best estimator found by the search, its ``score`` on the
        *training* data (note: this is not the cross-validated
        ``grid.best_score_``), and its averaged score on the test data.
    '''
    grid = GridSearchCV(model, parameters, scoring="r2", cv=5)
    grid.fit(X_train, y_train)

    # With the default refit=True, GridSearchCV has already refit
    # best_estimator_ on all of X_train, so no additional fit is needed here.
    model = grid.best_estimator_
    best_score = model.score(X_train, y_train)
    test_score = k_fold_test(model, X_test, y_test)

    return model, best_score, test_score

In [283]:
def main_grid(matrix_df):
    '''
    Regression model selection on the ``prevalence_c`` target.

    Preprocesses ``matrix_df``, draws one random 90/10 train/test split, then
    grid-searches Lasso, ElasticNet and Ridge (in that order) with
    ``routine_grid``. A kernel-ridge candidate is configured as well but is
    not part of the evaluated list.

    Returns a list with one ``[model, best_score, test_score]`` entry per
    evaluated model, in evaluation order. The name of each model is printed
    once it has been processed.
    '''
    # name -> (estimator, hyper-parameter grid)
    candidates = {
        "lasso": (
            lm.Lasso(max_iter=100000),
            {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0]},
        ),
        "elasticNet": (
            lm.ElasticNet(max_iter=100000),
            {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
             "l1_ratio": [0.1, 0.5, 0.7, 0.8, 0.9, 1.0]},
        ),
        "ridge": (
            lm.Ridge(max_iter=100000),
            {"alpha": [0.01, 0.1, 1.0, 10.0, 50.0, 100.0]},
        ),
        "kernelRidge": (
            KernelRidge(kernel='rbf'),
            {"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)},
        ),
    }

    X, y_a, y_c = preprocess(matrix_df)
    X_train, X_test, y_train, y_test = split_scale(
        X, y_c, 0.1, r=np.random.randint(100), cv_type="k"
    )

    res = []
    for name in ("lasso", "elasticNet", "ridge"):
        estimator, grid = candidates[name]
        fitted, best_score, test_score = routine_grid(
            estimator, grid, X_train, y_train, X_test, y_test
        )
        res.append([fitted, best_score, test_score])
        print(name)

    return res

In [284]:
# Load the matrix used by the model-selection runs below (first CSV column is the index).
# This rebinds `matrix_df`, replacing any frame loaded by an earlier cell.
matrix_df=pd.read_csv("../data/matrix/matrix_df_163.csv", index_col=0)

In [292]:
# Test scores per model, one entry per repetition of main_grid.
r_lasso, r_ridge, r_elastic = [], [], []

# main_grid returns its entries in the order lasso, elasticNet, ridge;
# index 2 of each entry is the test score.
for i in range(10):
    res = main_grid(matrix_df)
    r_lasso += [res[0][2]]
    r_elastic += [res[1][2]]
    r_ridge += [res[2][2]]

lasso
elasticNet
ridge
lasso
elasticNet
ridge
lasso
elasticNet
ridge
lasso
elasticNet
ridge
lasso
elasticNet
ridge
lasso
elasticNet
ridge
lasso
elasticNet
ridge
lasso
elasticNet
ridge
lasso
elasticNet
ridge
lasso
elasticNet
ridge


In [294]:
# Average test score of the tuned Lasso over the repetitions collected above.
np.mean(r_lasso)

0.77167360573471044

In [297]:
# Average test score of the tuned ElasticNet over the repetitions collected above.
np.mean(r_elastic)

0.76067111036787738

In [296]:
# Average test score of the tuned Ridge over the repetitions collected above.
np.mean(r_ridge)

0.77259076404230764

In [171]:
#matrix_spatial_average=pd.DataFrame(index=matrix_spatial_df.index)

In [255]:
# for c in cols:
#     temp=matrix_spatial_df[c]
#     temp=np.mean(temp, axis=1)
#     temp=pd.DataFrame(temp, columns=[c], index=matrix_spatial_df.index)
#     matrix_spatial_average=pd.concat([matrix_spatial_average, temp], axis=1)