In [1]:
%matplotlib inline 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import scale 
from sklearn import cross_validation 
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from scipy import stats
from scipy.stats import zscore



### Load and preprocess data

In [39]:
### Import data ###
# `pd.DataFrame.from_csv` is deprecated (and removed in pandas 1.0).
# `pd.read_csv(..., index_col=0)` keeps the behaviour this notebook relies on:
# the first CSV column (participant IDs such as 'IN 001') becomes the row index.
df = pd.read_csv('INCA_data.csv', index_col=0)

####################

### Z-score cols ###
# NOTE(review): the trailing space in 'Clarity ' presumably mirrors the CSV
# header exactly -- confirm before "tidying" it, otherwise the lookup fails.
zcolumns = ['Age', 'Sex', 'Race', 'Nonaccept', 'Goals', 'Impulse',
            'Awareness', 'Strategies', 'Clarity ']
# Standardise every listed column, overwriting the raw values in `df`.
for column in zcolumns:
    df[column] = zscore(df[column])

####################

### Drop subj w/ extremely high IN val- Y/N ? ###
# Left disabled, as before. The toggle now sits ahead of the X / y
# construction: X and y are derived from `df`, so dropping the row after they
# are built would leave them unchanged.
# NOTE(review): the z-scores above would still include this subject; move the
# drop above the z-scoring step if that is not intended.
# df = df.drop('IN 011')

####################

### Set static variables ###
# Everything from column position 8 onward. In the recorded X preview these
# are Age, Sex, Race, Nonaccept, Goals, Impulse, Awareness, Strategies, Clarity.
static_vars = df.iloc[:, 8:]

### Set X and y ###
# Predict arousal from one valence measure plus the static variables.
# Swap the active `val` line to try a different valence measure.
val = df[['V_m']]
# val = df[['V_b']]
# val = df[['V_err']]
# val = df[['V_acc']]
X = val.join(static_vars)  # join valence and static vars
# The 1:2 slice keeps y as a one-column DataFrame rather than a Series.
y = df.iloc[:, 1:2]  # A_m

# Alternative setup: predict valence from arousal instead.
# arous = df[['A_m']]
# # arous = df[['A_b']]
# # arous = df[['A_err']]
# # arous = df[['A_acc']]
# X = arous.join(static_vars) # join arousal and static vars
# y = df.iloc[:,0:1] # V_m

####################

Unnamed: 0_level_0,V_m,Age,Sex,Race,Nonaccept,Goals,Impulse,Awareness,Strategies,Clarity
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
IN 001,0.0845,0.172695,-1.183216,-0.34641,-0.507803,-1.330429,-0.765384,-1.066436,-0.647978,-0.21179
IN 002,0.0942,-0.197365,0.845154,-0.34641,-0.013724,-1.765842,-0.765384,0.592464,-1.096579,-1.108781
IN 003,0.2188,1.282876,0.845154,-0.34641,-0.754842,0.411223,-0.334855,-1.303422,-0.647978,-1.108781
IN 004,0.2188,-0.863474,-1.183216,-0.34641,3.197787,0.411223,2.678843,-0.592464,1.595023,1.133698
IN 005,0.0906,-0.34539,0.845154,-0.34641,0.233315,2.152876,0.095673,-0.592464,0.249222,-0.660286
IN 006,0.2146,2.097008,0.845154,-0.34641,0.480354,-1.330429,-0.334855,-0.592464,-0.199378,-1.108781
IN 007,0.371,1.578924,0.845154,1.03923,-0.260764,-1.765842,-0.765384,-0.82945,-0.199378,-1.557277
IN 008,0.2202,2.245032,-1.183216,-0.34641,0.974433,1.717463,0.95673,2.251365,0.697823,1.582194
IN 009,0.1358,-0.641438,-1.183216,-0.34641,1.96259,0.846637,0.526201,-0.592464,2.492224,0.685202
IN 010,0.3572,-0.937486,0.845154,-0.34641,0.233315,-0.02419,-0.765384,0.118493,1.146423,0.685202


### Set variables and parameters

In [45]:
# Regularisation grid: 36 log-spaced penalty strengths running from
# 0.5 * 10**1 = 5 down to 0.5 * 10**-2 = 0.005 (strongest penalty first).
exponents = np.linspace(1, -2, num=36)
alphas = 0.5 * np.power(10, exponents)
# print(alphas)

[ 5.          4.10445708  3.36931358  2.7658406   2.27045481  1.86379686
  1.52997484  1.25594322  1.030993    0.84633331  0.69474775  0.57031246
  0.4681646   0.38431231  0.31547867  0.25897373  0.21258932  0.17451274
  0.14325601  0.11759763  0.09653489  0.07924466  0.06505126  0.05340002
  0.04383562  0.03598428  0.02953919  0.02424847  0.01990536  0.01634014
  0.01341348  0.01101101  0.00903884  0.00741991  0.00609094  0.005     ]


In [92]:
# Hold out 25% of the participants as a test set.
# A fixed seed keeps the split -- and every test MSE computed from it --
# identical across re-runs; with random_state=None each execution drew a
# different split, so results on this small sample could not be reproduced.
# NOTE(review): `sklearn.cross_validation` is deprecated in favour of
# `sklearn.model_selection` (same `train_test_split`); update the import cell
# when moving to a newer scikit-learn.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=.25, random_state=SPLIT_SEED)

# Sanity check: the two partitions together account for every row.
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

# Debug helper (produces the shape listing recorded below this cell):
# variables = [y, X, y_train, X_train, y_test, X_test]
# for var in variables:
#     print(var.shape)

(36, 1)
(36, 10)
(27, 1)
(27, 10)
(9, 1)
(9, 10)


# Ridge Regression

In [None]:
# Trace the ridge coefficient path: refit on the full data set once per
# penalty strength in `alphas` and record the fitted weights.
ridge = Ridge(normalize=True)
coefs = []
for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(X, y)
    # With y passed as a one-column DataFrame, coef_ comes back 2-D
    # (1, n_features). Flatten it so `coefs` stacks into a plain
    # (n_alphas, n_features) table; a 3-D stack cannot be handed to ax.plot.
    coefs.append(np.ravel(ridge.coef_))

# Displayed as the cell output: (number of alphas, number of features).
np.shape(coefs)

In [None]:
# Plot every feature's ridge weight against the penalty strength.
# `coefs` may carry a singleton target axis (n_alphas, 1, n_features) when the
# model was fit on a one-column y; ax.plot only accepts up to 2-D input, so
# collapse to one row per alpha before plotting.
coef_paths = np.asarray(coefs).reshape(len(alphas), -1)

fig, ax = plt.subplots()
ax.plot(alphas, coef_paths)
ax.set_xscale('log')  # alphas are log-spaced
ax.axis('tight')
ax.set_xlabel(r'$\alpha$')
ax.set_ylabel('weights')
ax.set_title('Ridge coefficients vs. regularisation strength');

In [None]:
# Baseline with alpha=0, i.e. no shrinkage penalty applied.
ridge2 = Ridge(alpha=0, normalize=True)
ridge2.fit(X_train, y_train)  # Fit a ridge regression on the training data
pred2 = ridge2.predict(X_test)  # Use this model to predict the test data
# coef_ is 2-D (1, n_features) when the target is a one-column DataFrame;
# pd.Series needs 1-D data, so flatten before labelling with the feature names.
print(pd.Series(np.ravel(ridge2.coef_), index=X.columns))  # Print coefficients
print(mean_squared_error(y_test, pred2))  # Calculate the test MSE

In [None]:
# Opposite extreme: a very large penalty (alpha = 10**10).
ridge3 = Ridge(alpha=10**10, normalize=True)
ridge3.fit(X_train, y_train)  # Fit a ridge regression on the training data
pred3 = ridge3.predict(X_test)  # Use this model to predict the test data
# coef_ is 2-D (1, n_features) when the target is a one-column DataFrame;
# pd.Series needs 1-D data, so flatten before labelling with the feature names.
print(pd.Series(np.ravel(ridge3.coef_), index=X.columns))  # Print coefficients
print(mean_squared_error(y_test, pred3))  # Calculate the test MSE