In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from sklearn.datasets import load_boston
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso

def pretty_print_coefs(coefs, names=None, index=None, sort=False):
    """Tabulate coefficients next to their feature names.

    Args:
        coefs: Sequence of coefficient values.
        names: Optional sequence of names paired positionally with ``coefs``.
            Defaults to ``"X0", "X1", ...``.
        index: Optional row labels for the resulting frame. Defaults to the
            strings ``"0", "1", ...``.
        sort: When true, rows are ordered by descending absolute coefficient.
            Row labels are applied after sorting, so they follow the row
            position rather than the coefficient they originally matched.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Coef`` and ``Name``.
    """
    # Identity checks on purpose: `== None` against a NumPy array or pandas
    # Index compares element-wise and makes the `if` raise
    # "truth value of an array is ambiguous".
    if names is None:
        names = ["X%s" % i for i in range(len(coefs))]
    if index is None:
        index = ["%s" % i for i in range(len(coefs))]

    pairs = list(zip(coefs, names))
    if sort:
        # list.sort is stable, so ties keep their input order.
        pairs.sort(key=lambda pair: -np.abs(pair[0]))
    return pd.DataFrame(pairs, index=index, columns=['Coef', 'Name'])

def end_result(coefs1, coefs2, names, threshold=0.7):
    """Keep the entries of ``coefs2`` whose ratio to ``coefs1`` meets a threshold.

    Every pair ``coefs1[i] coefs2[i]`` is printed, followed by a newline
    print. An entry is kept when ``coefs2[i] / coefs1[i] >= threshold``.
    The number of kept entries is printed before returning.

    Args:
        coefs1: Reference coefficients (the denominators of the ratio).
        coefs2: Coefficients compared against ``coefs1``; must be at least
            as long as ``coefs1``.
        names: Names indexed like ``coefs1``.
        threshold: Minimum ratio required to keep an entry. Defaults to 0.7.

    Returns:
        The frame built by ``pretty_print_coefs`` from the kept ``coefs2``
        values and their names, labelled with their original positions.
    """
    kept_coefs = []
    kept_names = []
    kept_positions = []
    for pos, reference in enumerate(coefs1):
        candidate = coefs2[pos]
        print(str(reference) + " " + str(candidate))
        print("\n")
        # The ratio is undefined for a zero reference: plain floats would
        # raise ZeroDivisionError and NumPy floats would give inf/nan, so
        # such entries are skipped rather than compared.
        if reference == 0:
            continue
        if candidate / reference >= threshold:
            kept_coefs.append(candidate)
            kept_names.append(names[pos])
            kept_positions.append(pos)
    print(len(kept_coefs))
    return pretty_print_coefs(kept_coefs, kept_names, kept_positions)

def main():
    """Fit OLS and ridge models on ``National_112.csv`` and report coefficients.

    Steps, in order:
      1. Read the CSV, replace missing values with 0, take ``MRR_2010`` as
         the target and drop it plus the identifier columns from the
         features.
      2. Standardise the features and split 70/30 into train and test sets.
      3. Fit an OLS model and a ridge model with ``alpha=0``.
      4. Plot the test-set MSE of the ridge sweep against the OLS MSE.
      5. Fit a ridge model with ``alpha=640`` and print the table returned
         by ``end_result`` for the two ridge coefficient vectors.

    The numbered ``print`` calls are progress markers and are kept so the
    console output stays the same.
    """
    data = pd.read_csv("National_112.csv", encoding='latin-1')
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    data.replace(np.nan, 0, inplace=True)
    print(1)
    temp = data['MRR_2010']
    # Remove the target and the identifier columns from the feature set.
    data = data.drop(
        columns=[
            "MRR_2010",
            "Tract",
            "County",
            "GIDTR",
            "State",
            "State_name",
            "County_name",
        ]
    )
    dataArr = preprocessing.scale(data)
    print(2)
    X_train, X_test, y_train, y_test = train_test_split(
        dataArr, temp, test_size=0.3, random_state=10
    )
    ols_reg = LinearRegression()
    ols_reg.fit(X_train, y_train)
    ols_pred = ols_reg.predict(X_test)
    print(3)

    # Ridge with alpha=0: the unpenalised reference fit.
    ridge_reg = Ridge(alpha=0)
    print(4)
    ridge_reg.fit(X_train, y_train)
    # Captured here so the identical model does not have to be refit later.
    coef1 = ridge_reg.coef_
    print(5)
    print(6)
    # Suffix each column name with its position.
    col_list = [name + str(pos) for pos, name in enumerate(data.columns)]
    print(len(ridge_reg.coef_))
    print(len(col_list))
    ridge_df = pd.DataFrame({'variable': col_list, 'estimate': ridge_reg.coef_})
    print(7)
    ridge_train_pred = []
    ridge_test_pred = []
    print(8)

    # The alpha sweep that fills ridge_df and the two prediction lists is
    # currently disabled, so both lists stay empty and the MSE plot below
    # contains only the OLS reference line. The disabled sweep was:
    #     for alpha in np.arange(550, 650, 1):
    #         model = Ridge(alpha=alpha)
    #         model.fit(X_train, y_train)
    #         ridge_df['estimate' + str(alpha)] = model.coef_
    #         ridge_train_pred.append(model.predict(X_train))
    #         ridge_test_pred.append(model.predict(X_test))

    print(9)
    # One row per estimate, one column per variable. `axis` must be passed
    # by keyword: pandas 2.0 made it keyword-only.
    ridge_df = (
        ridge_df.set_index('variable')
        .T.rename_axis('estimate')
        .rename_axis(None, axis=1)
        .reset_index()
    )

    # MSE of Ridge and OLS
    print(10)
    ridge_mse_test = [mean_squared_error(y_test, p) for p in ridge_test_pred]
    print(11)
    ols_mse = mean_squared_error(y_test, ols_pred)
    print(12)
    # plot mse
    plt.plot(ridge_mse_test[:200], 'ro')
    plt.axhline(y=ols_mse, color='g', linestyle='--')
    plt.title("Ridge Test Set MSE", fontsize=16)
    # Raw string: "\l" is not a valid escape sequence in a normal literal.
    plt.xlabel(r"Model Simplicity$\longrightarrow$")
    plt.ylabel("MSE")
    print('MSE Scores belows')

    # Penalised fit whose coefficients are compared with the alpha=0 fit.
    ridge_reg = Ridge(alpha=640)
    ridge_reg.fit(X_train, y_train)
    coef2 = ridge_reg.coef_
    print("")
    print("FINAL:")
    print(end_result(coef1, coef2, names=col_list))
    
# Run the analysis only when this file is executed directly (as a script or
# a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()
    


1
2
3
4
5
6
112
112
7
8
9
10
11
12
MSE Scores belows

FINAL:
2.58132292823 2.47653758187


0.576397212475 0.460211997617


0.266476554699 0.200838895619


-19.8695392268 0.127896900466


287.534685548 -0.107721059774


-20.1376202844 0.233837728005


292.46347925 0.107462111689


7.06256647489 -0.540821191017


367.891596533 -0.233761271233


17.3814503611 -0.0249237597138


9.27962647302 0.106839132778


27.2434563905 -0.128867437731


13.9037227035 -0.0652485831394


24.3446555231 -0.13874369186


-42.4443965443 0.117178627247


21.6997644282 -0.0351545541196


-34.8978928987 0.0550180662563


25.6242779202 0.786677365508


-43.249283615 -0.0859759533705


-130.816982357 0.00995034041257


83.4141093719 -0.0432051979643


100.25057235 0.0497376820189


-2.4735858493 -0.635204239788


1.46237741619 0.590732506575


-1.76833217489 0.340773591235


0.801711114453 0.212424974596


-0.925423352805 0.343087660207


-0.343560910991 -0.537498392284


-1.89749970992 -1.55002832033


-1.160840