In [20]:
%matplotlib inline

In [21]:
import pandas as pd
from sklearn import linear_model, cross_validation
from sklearn import grid_search
import pylab as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
pd.set_option("precision", 2)

In [31]:
data = pd.read_csv("forbes_dataset.csv", index_col=['code', 'year']).sort_index(level=[1,0])
print data.info()
print "Number of countries : ", len(data.index.levels[0])

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 181 entries, (BGD, 1960.0) to (VEN, 1985.0)
Data columns (total 6 columns):
gini           181 non-null float64
PPPI           181 non-null float64
sch_male       181 non-null float64
sch_female     181 non-null float64
log(GNI_PC)    181 non-null float64
growth         181 non-null float64
dtypes: float64(6)
memory usage: 9.9+ KB
None
Number of countries :  46


In [32]:
# Distinct country codes (index level 0) and periods (index level 1), sorted.
country_codes = sorted(np.unique(data.index.levels[0]))
periods = sorted(np.unique(data.index.levels[1]))

# Per-row index values, used to build one 0/1 indicator column per
# country and per period.
row_codes = data.index.get_level_values('code')
row_years = data.index.get_level_values('year')
for country in country_codes:
    data[country] = (row_codes == country).astype(int)
for period in periods:
    data[period] = (row_years == period).astype(int)

# Standardise every column (indicator columns included): subtract the column
# mean and divide by the column standard deviation.
column_means = data.mean(axis=0)
column_stds = data.std(axis=0)
data = (data - column_means) / column_stds

# Regressors: the first four columns of the frame plus log(GNI_PC).
variables = list(data.columns[:4]) + ['log(GNI_PC)']
dummies = country_codes + periods

# Design matrices (all regressors, indicators only, variables only) and target.
X_var = data[variables]
X_dum = data[dummies]
X = data[variables + dummies]
Y = data['growth']

In [39]:
n_folds = 5

# Finding the best alpha
parameters = {'alpha':np.logspace(-5, 0, 50)}
lm = linear_model.Lasso()
clf_lasso = grid_search.RandomizedSearchCV(lm, parameters, cv=n_folds, n_iter=20)
clf_lasso.fit(X, Y)
best_alpha = clf_lasso.best_params_ 

clf = linear_model.LinearRegression()
clf.fit(X, Y)

R_dict = {'lasso': [], 'OLS': [], 'gini_lasso': []}
for k in range(20):
    kf = cross_validation.KFold(len(X), n_folds=n_folds, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.values[train_index], X.values[test_index]
        Y_train, Y_test = Y.values[train_index], Y.values[test_index]
        clf = linear_model.Lasso(alpha=10**-12)
        clf.fit(X_train, Y_train)
        parameters = {'alpha':np.logspace(-5, 0, 50)}
        lm = linear_model.Lasso()
        clf_l = grid_search.RandomizedSearchCV(lm, parameters, cv=n_folds, n_iter=20)
        clf_l.fit(X, Y)
        best_alpha = clf_l.best_params_
        #print i, "th fold"
        OLS_res = (Y_test.var() - (Y_test - clf.predict(X_test)).var()) / Y_test.var()
        #print "R^2 OLS", OLS_res
        lasso_res = (Y_test.var() - (Y_test - clf_l.predict(X_test)).var()) / Y_test.var()
        #print "R^2 Lasso", lasso_res
        R_dict['lasso'].append(lasso_res)
        R_dict['OLS'].append(OLS_res)
        R_dict['gini_lasso'].append(clf_l.coef_[0])
        #print '\n'

print best_alpha
x = range(len(data))
plt.figure(figsize=(15,7))
plt.plot(train_index, Y_train, 'bo', label="train")
plt.plot(test_index, Y_test, 'ko', label="test")
plt.plot(x, clf.predict(X), 'r--.', label="'OLS'")
plt.plot(x, clf_l.predict(X), 'g--.', label="'Lasso'")
plt.legend()
plt.ylabel("Growth")
plt.xlabel("Country/period")
print np.mean(R_dict['lasso']), np.std(R_dict['lasso'])
print np.mean(R_dict['OLS']), np.std(R_dict['OLS'])
print np.mean(R_dict['gini_lasso']), np.std(R_dict['gini_lasso'])
print pd.DataFrame(index=X.columns, data=clf_l.coef_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'coef_'