In [38]:
%matplotlib inline  
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

In [66]:
matrix_df=pd.read_csv("../data/matrix/matrix_df_107.csv", index_col=0)

In [105]:
X = matrix_df[list(matrix_df.columns[:-4])+list(matrix_df.columns[-2:])]


In [100]:
matrix_df.columns[-2:]

Index(['income', 'insurance'], dtype='object')

In [106]:
X = X.fillna(X.mean())
y_c = matrix_df["prevalence_c"]
y_a = matrix_df["prevalence_a"]

In [133]:
def routine_grid(model, parameters, X_train,y_train,X_test,y_test):
    grid = GridSearchCV(model, parameters, scoring="r2", cv=3)
    grid.fit(X_train,y_train)
    
    model = grid.best_estimator_
    model.fit(X_train, y_train)  
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    
    return model, best_score, test_score
def scorer(model, X, y):
    e = 1 - np.sum( (model.predict(X) - y)**2) / np.sum((y.mean() - y)**2 )
    return e
def k_fold(model, X_test, y_test, k=3):
    n = len(y_test) // k    
    R=[]
    #tot=0
    for i in range(0,k-1):
        
        X_sample=X_test[(i*n):((i+1)*n)]
        y_sample=y_test[(i*n):((i+1)*n)]
        r=scorer(model, X_sample, y_sample)
        R.append(r)
        #tot = tot + len(y_sample)
    
    X_sample=X_test[((k-1)*n):]
    y_sample=y_test[((k-1)*n):]
    r=scorer(model, X_sample, y_sample)
    R.append(r)
    #tot = tot + len(y_sample)
    #print(tot)
    return np.mean(R)

### with y_c

In [134]:
models = []
train_scores = []
test_scores =[]
for i in range(30): 
    X_train, X_test, y_train, y_test = train_test_split(X, y_c, test_size=0.3, random_state=np.random.randint(100))
    model = ElasticNet(random_state=0, fit_intercept = True, normalize = True, max_iter= 10000)
    par = {"alpha": np.arange(0.0005, 0.0007, 0.0001),
                 "l1_ratio": np.arange(0.85, 1, 0.01)}
    model_best, best_score, test_score = routine_grid(model,par,X_train,y_train,X_test,y_test)
    models.append(model_best)
    train_scores.append(best_score)
    test_scores.append(test_score)
    

In [138]:
np.max(test_scores)

0.81559935202910805

In [136]:
np.mean(test_scores)

0.75576041644981684

### with y_a

In [139]:
models= []
train_scores = []
test_scores =[]
for i in range(30): 
    X_train, X_test, y_train, y_test = train_test_split(X, y_a, test_size=0.3, random_state=np.random.randint(100))
    model = ElasticNet(random_state=0, fit_intercept = True, normalize = True, max_iter= 10000)
    par = {"alpha": np.arange(0.0005, 0.0007, 0.0001),
                 "l1_ratio": np.arange(0.85, 1, 0.01)}
    model_best, best_score, test_score = routine_grid(model,par,X_train,y_train,X_test,y_test)
    models.append(model_best)
    train_scores.append(best_score)
    test_scores.append(test_score)

In [140]:
np.max(test_scores)

0.84454845924393096

In [141]:
np.mean(test_scores)

0.7746951576858585

### spatial data

In [174]:
matrix_df_s=pd.read_csv("../data/matrix_spatial_average.csv", index_col=0)
matrix_df_s.head()


Unnamed: 0,safety,adults,functioning,meme,clinic,pregnancy,robin,ebola,icd,ashwagandha,...,oils,pills,cancer,nursing,memory,medication,todd,party,postpartum,state
Alabama,73.0,90.0,60.833333,77.666667,46.333333,83.333333,66.0,56.166667,73.666667,47.833333,...,56.166667,78.0,91.833333,88.5,82.0,85.5,72.833333,83.5,76.5,66.333333
Alaska,100.0,81.0,69.666667,94.333333,53.333333,85.666667,78.833333,63.5,51.5,59.0,...,75.5,55.333333,71.0,51.0,74.666667,74.833333,59.5,62.833333,95.666667,69.833333
Arizona,58.666667,78.666667,58.666667,95.166667,38.0,74.333333,73.833333,53.333333,69.5,71.333333,...,46.666667,61.666667,82.333333,57.833333,82.5,82.166667,61.666667,82.166667,71.833333,39.0
Arkansas,64.5,97.166667,72.166667,81.0,72.0,89.333333,64.0,63.333333,66.666667,47.0,...,51.666667,74.0,85.5,86.5,82.333333,85.166667,68.5,77.166667,74.833333,59.166667
California,56.166667,70.833333,53.833333,86.5,25.833333,69.833333,69.333333,51.166667,45.0,58.0,...,30.833333,60.666667,75.5,42.5,85.0,65.166667,57.666667,90.166667,69.166667,38.333333


In [175]:
y_a = matrix_df_s["prevalence_a"]
y_c = matrix_df_s["prevalence_c"]
X = matrix_df_c.drop(["prevalence_a","prevalence_c"], axis =1)

In [179]:
models= []
train_scores = []
test_scores =[]
for i in range(10): 
    X_train, X_test, y_train, y_test = train_test_split(X, y_a, test_size=0.3, random_state=np.random.randint(100))
    model = ElasticNet(random_state=0, fit_intercept = True, normalize = True, max_iter= 100000)
    par = {"alpha": np.arange(0.001, 0.9, 0.002),
                 "l1_ratio": np.arange(0.1, 1, 0.05)}
    model_best, best_score, test_score = routine_grid(model,par,X_train,y_train,X_test,y_test)
    models.append(model_best)
    train_scores.append(best_score)
    test_scores.append(test_score)
    print(test_score)

0.0455116134547
0.43718698451
0.193577222525


KeyboardInterrupt: 

In [178]:
np.arange(0.1, 1, 0.1)

array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9])

run 6 times 