# Using PCA to reduce dimensionality of data

In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn_pandas import DataFrameMapper

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# features_imputed_set1.csv
features = pd.read_csv('background_imputed_1.csv', low_memory=False, index_col='challengeID')
# features.drop('idnum', 1, inplace=True) # remove idnum column
print("Features shape: {}".format(features.shape))
features.sort_index(kind='mergesort', inplace=True) # mergesort is the only stable sort
features.head()

Features shape: (4242, 12133)


Unnamed: 0_level_0,idnum,mothid1,m1intmon,m1lenhr,m1lenmin,cm1fint,cm1citsm,m1citywt,innatsm,incitysm,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1854,18540,1,1,40,0,1,202.485367,1,1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0
2,4470,44700,1,0,40,1,1,45.608219,1,1,...,1.0,8.473318,1.0,1.0,1.0,1.0,9.845074,1,1.0,9.723551
3,371,3710,1,0,35,1,1,39.060299,1,1,...,1.0,1.0,9.097495,10.071504,1.0,1.0,1.0,1,1.0,1.0
4,2654,26540,1,0,30,1,1,22.304855,1,1,...,1.0,1.0,9.512706,10.286578,1.0,10.677285,1.0,1,8.522331,10.608137
5,2810,28100,1,0,25,1,1,35.518272,1,1,...,1.0,1.0,11.076016,9.615958,1.0,9.731979,1.0,1,10.115313,9.646466


In [3]:
# some columns are not numeric; encode them
from sklearn.preprocessing import LabelEncoder
str_col_features = features.select_dtypes(include=['object'])
label_encoders={} # save the label encoders to allow for inverse transforms
for col_name in str_col_features.columns:
    le = LabelEncoder()
    dmy = le.fit(features[col_name])
    features[col_name] = le.transform(features[col_name])
    label_encoders[col_name] = le
features.head()

Unnamed: 0_level_0,idnum,mothid1,m1intmon,m1lenhr,m1lenmin,cm1fint,cm1citsm,m1citywt,innatsm,incitysm,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1854,18540,1,1,40,0,1,202.485367,1,1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0
2,4470,44700,1,0,40,1,1,45.608219,1,1,...,1.0,8.473318,1.0,1.0,1.0,1.0,9.845074,1,1.0,9.723551
3,371,3710,1,0,35,1,1,39.060299,1,1,...,1.0,1.0,9.097495,10.071504,1.0,1.0,1.0,1,1.0,1.0
4,2654,26540,1,0,30,1,1,22.304855,1,1,...,1.0,1.0,9.512706,10.286578,1.0,10.677285,1.0,1,8.522331,10.608137
5,2810,28100,1,0,25,1,1,35.518272,1,1,...,1.0,1.0,11.076016,9.615958,1.0,9.731979,1.0,1,10.115313,9.646466


In [4]:
# scale data to unit variance and 0 mean
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
indices_f = features.index
colnames_f = features.columns
features = ss.fit_transform(features)
features = pd.DataFrame(data=features, index=indices_f, columns=colnames_f)
features.head()

Unnamed: 0_level_0,idnum,mothid1,m1intmon,m1lenhr,m1lenmin,cm1fint,cm1citsm,m1citywt,innatsm,incitysm,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.416725,-0.416725,0.0,1.526403,0.416335,-1.880664,0.162398,0.702599,0.601133,0.162398,...,-0.439427,-0.528718,-0.75643,-0.771735,-0.428173,-0.629212,-0.575395,0.0,-0.607729,-1.446309
2,1.439058,1.439058,0.0,-0.568389,0.416335,0.531727,0.162398,-0.184623,0.601133,0.162398,...,-0.439427,1.467391,-0.75643,-0.771735,-0.428173,-0.629212,1.681247,0.0,-0.607729,0.617832
3,-1.468761,-1.468761,0.0,-0.568389,0.034922,0.531727,0.162398,-0.221655,0.601133,0.162398,...,-0.439427,-0.528718,1.05474,1.289296,-0.428173,-0.629212,-0.575395,0.0,-0.607729,-1.446309
4,0.150793,0.150793,0.0,-0.568389,-0.346491,0.531727,0.162398,-0.316416,0.601133,0.162398,...,-0.439427,-0.528718,1.147611,1.33816,-0.428173,1.723229,-0.575395,0.0,1.254602,0.82714
5,0.261459,0.261459,0.0,-0.568389,-0.727904,0.531727,0.162398,-0.241687,0.601133,0.162398,...,-0.439427,-0.528718,1.497277,1.185796,-0.428173,1.493436,-0.575395,0.0,1.648983,0.599592


In [5]:
from sklearn.model_selection import train_test_split

full_y = pd.read_csv('train.csv', index_col='challengeID')
train_y = full_y['gpa'] # we are specifically interested in GPA
print("Target shape: {}".format(train_y.shape))
train_y.dropna(how='any', inplace=True) # drop those with no reported GPA
print("Target shape (no na): {}".format(train_y.shape))

# get the rows for which we can predict
y_indices = train_y.index.values.tolist()
full_x = features
train_x = features.loc[y_indices]
print("Features shape (final): {}".format(train_x.shape))

train_x, test_x, train_y, test_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=0)

train_x.shape
train_x.head()
train_y.shape
train_y.head()
test_x.shape
test_x.head()
test_y.shape
test_y.head()

Target shape: (2121,)
Target shape (no na): (1165,)
Features shape (final): (1165, 12133)


(932, 12133)

Unnamed: 0_level_0,idnum,mothid1,m1intmon,m1lenhr,m1lenmin,cm1fint,cm1citsm,m1citywt,innatsm,incitysm,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1875,1.617826,1.617826,0.0,-0.568389,0.034922,-1.880664,0.162398,0.054707,0.601133,0.162398,...,-0.439427,-0.528718,-0.75643,-0.771735,-0.428173,-0.629212,1.373581,0.0,-0.607729,0.743777
1062,0.365031,0.365031,0.0,-0.568389,1.560574,0.531727,0.162398,-0.050341,-1.663526,0.162398,...,2.775024,2.024423,-0.75643,-0.771735,2.167878,-0.629212,1.7537,0.0,-0.607729,0.927319
3254,-0.380546,-0.380546,0.0,-0.568389,0.034922,0.531727,0.162398,-0.397717,0.601133,0.162398,...,-0.439427,-0.528718,-0.75643,1.064208,-0.428173,-0.629212,-0.575395,0.0,-0.607729,0.580713
3034,0.579978,0.579978,0.0,-0.568389,0.416335,0.531727,0.162398,-0.36915,0.601133,0.162398,...,-0.439427,-0.528718,-0.75643,-0.771735,-0.428173,-0.629212,1.776071,0.0,-0.607729,0.743777
338,-0.425238,-0.425238,0.0,-0.568389,0.034922,0.531727,0.162398,-0.352207,0.601133,0.162398,...,-0.439427,2.024423,-0.75643,-0.771735,-0.428173,-0.629212,1.864688,0.0,-0.607729,0.580713


(932,)

challengeID
1875    3.00
1062    1.75
3254    2.50
3034    2.25
338     1.50
Name: gpa, dtype: float64

(233, 12133)

Unnamed: 0_level_0,idnum,mothid1,m1intmon,m1lenhr,m1lenmin,cm1fint,cm1citsm,m1citywt,innatsm,incitysm,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3268,1.737714,1.737714,0.0,-0.568389,0.034922,0.531727,0.162398,-0.25838,-1.663526,0.162398,...,-0.439427,2.024423,-0.75643,-0.771735,-0.428173,-0.629212,1.864688,0.0,-0.607729,1.050577
2126,1.135436,1.135436,0.0,-0.568389,0.034922,0.531727,0.162398,4.0732,0.601133,0.162398,...,-0.439427,-0.528718,1.109625,1.448965,-0.428173,1.192955,-0.575395,0.0,1.583459,0.605552
382,-0.096787,-0.096787,0.0,-0.568389,0.416335,0.531727,0.162398,-0.278977,0.601133,0.162398,...,2.557966,1.748005,-0.75643,-0.771735,1.833255,-0.629212,1.864688,0.0,-0.607729,0.787495
2998,0.806985,0.806985,0.0,-0.568389,0.797748,0.531727,0.162398,-0.413586,-1.663526,0.162398,...,-0.439427,-0.528718,1.165931,1.398667,-0.428173,1.621463,-0.575395,0.0,1.48019,0.580713
4090,1.584484,1.584484,0.0,-0.568389,-0.346491,0.531727,0.162398,1.470424,0.601133,0.162398,...,1.683279,1.565537,-0.75643,-0.771735,2.890307,-0.629212,1.377554,0.0,1.832985,0.639237


(233,)

challengeID
3268    2.25
2126    3.75
382     3.75
2998    2.25
4090    3.75
Name: gpa, dtype: float64

## Feature Selection
Optional step in which we select features via lasso before doing dimensionality reduction

In [6]:
from sklearn.feature_selection import VarianceThreshold
# remove features with 0 variance

# mapper from pandas to numpy
mapper = DataFrameMapper([(train_x.columns, None)])
indices = train_x.index
colnames = train_x.columns

vt = VarianceThreshold()
train_vt_x = vt.fit_transform(mapper.fit_transform(train_x))
train_x = pd.DataFrame(data=train_vt_x, index=indices, columns=colnames[vt.get_support(True)])
train_x.head()

# Test set needs variance threshold treatment
# mapper from pandas to numpy
mapper_test = DataFrameMapper([(test_x.columns, None)])
indices = test_x.index
colnames = test_x.columns

test_vt_x = vt.transform(mapper.fit_transform(test_x))
test_x = pd.DataFrame(data=test_vt_x, index=indices, columns=colnames[vt.get_support(True)])
test_x.head()

Unnamed: 0_level_0,idnum,mothid1,m1lenhr,m1lenmin,cm1fint,m1citywt,innatsm,m1a3,m1a4,m1a5,...,m4d7,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,f5c6,k5f1
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1875,1.617826,1.617826,-0.568389,0.034922,-1.880664,0.054707,0.601133,-0.312772,0.554463,-0.424618,...,-0.802841,-0.439427,-0.528718,-0.75643,-0.771735,-0.428173,-0.629212,1.373581,-0.607729,0.743777
1062,0.365031,0.365031,-0.568389,1.560574,0.531727,-0.050341,-1.663526,-0.312772,-1.753714,-0.424618,...,-0.802841,2.775024,2.024423,-0.75643,-0.771735,2.167878,-0.629212,1.7537,-0.607729,0.927319
3254,-0.380546,-0.380546,-0.568389,0.034922,0.531727,-0.397717,0.601133,-0.312772,0.554463,-0.424618,...,1.366038,-0.439427,-0.528718,-0.75643,1.064208,-0.428173,-0.629212,-0.575395,-0.607729,0.580713
3034,0.579978,0.579978,-0.568389,0.416335,0.531727,-0.36915,0.601133,-0.312772,0.554463,-0.424618,...,-0.802841,-0.439427,-0.528718,-0.75643,-0.771735,-0.428173,-0.629212,1.776071,-0.607729,0.743777
338,-0.425238,-0.425238,-0.568389,0.034922,0.531727,-0.352207,0.601133,-0.312772,0.554463,-0.424618,...,-0.802841,-0.439427,2.024423,-0.75643,-0.771735,-0.428173,-0.629212,1.864688,-0.607729,0.580713


Unnamed: 0_level_0,idnum,mothid1,m1lenhr,m1lenmin,cm1fint,m1citywt,innatsm,m1a3,m1a4,m1a5,...,m4d7,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,f5c6,k5f1
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3268,1.737714,1.737714,-0.568389,0.034922,0.531727,-0.25838,-1.663526,-0.312772,0.554463,2.355058,...,-0.802841,-0.439427,2.024423,-0.75643,-0.771735,-0.428173,-0.629212,1.864688,-0.607729,1.050577
2126,1.135436,1.135436,-0.568389,0.034922,0.531727,4.0732,0.601133,-0.312772,-1.753714,-0.424618,...,1.497237,-0.439427,-0.528718,1.109625,1.448965,-0.428173,1.192955,-0.575395,1.583459,0.605552
382,-0.096787,-0.096787,-0.568389,0.416335,0.531727,-0.278977,0.601133,-0.312772,0.554463,-0.424618,...,-0.802841,2.557966,1.748005,-0.75643,-0.771735,1.833255,-0.629212,1.864688,-0.607729,0.787495
2998,0.806985,0.806985,-0.568389,0.797748,0.531727,-0.413586,-1.663526,-0.312772,0.554463,-0.424618,...,1.301939,-0.439427,-0.528718,1.165931,1.398667,-0.428173,1.621463,-0.575395,1.48019,0.580713
4090,1.584484,1.584484,-0.568389,-0.346491,0.531727,1.470424,0.601133,-0.312772,-1.753714,-0.424618,...,-0.802841,1.683279,1.565537,-0.75643,-0.771735,2.890307,-0.629212,1.377554,1.832985,0.639237


In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# mapper from pandas to numpy
mapper = DataFrameMapper([(train_x.columns, None)])
indices = train_x.index
colnames = train_x.columns

In [23]:
# Use Lasso Regression
fs_model_lasso = LassoCV().fit(mapper.fit_transform(train_x), train_y.as_matrix())
fs_lasso = SelectFromModel(fs_model_lasso, prefit=True, threshold=0.0005)
train_fs_lasso_x = fs_lasso.transform(train_x)
train_fs_lasso_x = pd.DataFrame(data=train_fs_lasso_x, index=indices, columns=colnames[fs_lasso.get_support(True)])
train_fs_lasso_x.head()

Unnamed: 0_level_0,m1d2c,m1i1,m1i3,f1b9b2,f1b20,cf1edu,m2b31e,m2c37a3,m2h3,cm2povco,...,t5b4r,t5b4y,cf5povco,hv3m7,hv3m21,hv3m23,hv4s1_ot2,ffcc_pof_e2,ffcc_centsurvey_f2_c,ffcc_famsurvey_b45_a
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1875,1.059408,0.728058,-0.176104,-0.338028,-0.516012,-0.069817,-0.017107,-0.968957,-0.316138,-0.47897,...,0.193229,0.123772,-0.503264,-0.68612,-0.669645,-0.66796,-0.015356,-0.308232,-0.106243,-0.020604
1062,-0.6598,0.728058,0.814657,-0.338028,-0.516012,0.954966,-0.017107,0.142239,0.317814,-0.531296,...,0.193229,0.123772,0.864842,1.10167,-0.669645,2.634472,-0.015356,-0.308232,-0.106243,-0.020604
3254,-0.6598,-0.925273,-0.671485,-0.338028,-0.516012,-1.094599,-0.017107,0.142239,-0.316138,-0.126749,...,-1.319088,0.123772,-0.083656,-0.68612,-0.669645,-0.66796,-0.015356,-0.308232,-0.106243,-0.020604
3034,-0.6598,0.728058,0.814657,2.958331,1.937938,0.954966,-0.017107,0.142239,-0.316138,-0.19632,...,0.193229,0.123772,1.189408,1.10167,1.05404,0.983256,-0.015356,-0.308232,-0.106243,-0.020604
338,-0.6598,0.728058,-0.176104,-0.338028,-0.516012,-0.069817,-0.017107,0.142239,-0.316138,0.715532,...,1.705545,0.123772,2.741067,-0.68612,-0.669645,-0.66796,-0.015356,-0.308232,-0.106243,-0.020604


In [24]:
train_fs_x = fs_lasso.transform(train_x)
test_fs_x = fs_lasso.transform(test_x)

## Dimensionality Reduction

In [12]:
from sklearn.decomposition import PCA
from sklearn.decomposition import FactorAnalysis
from sklearn.model_selection import cross_val_score

## DEFAULT COMMENTED OUT B/C THIS TAKES FOREVER TO RUN

# # adapted from http://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py
# n_components = [200, 300, 400]

# def compute_scores(X, y):
#     pca = PCA(svd_solver='full')
# #     fa = FactorAnalysis()

#     pca_scores, fa_scores = [], []
#     for n in n_components:
#         pca.n_components = n
# #         fa.n_components = n
#         pca_scores.append(np.mean(cross_val_score(pca, X, y)))
# #         fa_scores.append(np.mean(cross_val_score(fa, X)))
#         print("Finished for %d" % n)

#     return pca_scores #, fa_scores

# pca_scores = compute_scores(train_x, train_y) #, pca_scores
# n_components_pca = n_components[np.argmax(pca_scores)]
# # n_components_fa = n_components[np.argmax(fa_scores)]

# # pca = PCA(svd_solver='full', n_components='mle')
# # pca.fit(train_x)
# # n_components_pca_mle = pca.n_components_

# print("best n_components by PCA CV = %d" % n_components_pca)
# # print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
# # print("best n_components by PCA MLE = %d" % n_components_pca_mle)

In [25]:
# best number of factor components is 100 according to cv (versus 100,500,1000,5000)
# best number of PCA components is 400 according to cv (versus 10,25,50,100,200, 300, 400)
pca = PCA(svd_solver='full', n_components=100)
pca.fit(train_x, train_y)


PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [20]:
pca_comps = np.array(pca.components_)
pca_comps.shape
variance_ratios = np.array(pca.explained_variance_ratio_) # seems like first 6 components have the most
variance_ratios
pca_comps_high_var = pca_comps[variance_ratios > 0.01] # get components with high variance
pca_comps_high_var.shape

for i in range(1,len(pca_comps_high_var)):
    comp = pca_comps_high_var[i]
    print("Investigating component {} with variance {}".format(i, variance_ratios[i]) )
    relevant_features = np.array(features.columns.values[((comp > 0.01) == True)]) # indices of top 10 features that lend more than 0.01 to the component's variance
    relevant_features.shape
    relevant_features

(100, 10125)

array([ 0.03850787,  0.02415232,  0.01488248,  0.01218281,  0.01143019,
        0.01082844,  0.00967048,  0.00930546,  0.00836747,  0.0083195 ,
        0.00772168,  0.00756251,  0.00670582,  0.00656345,  0.00627047,
        0.00598745,  0.00567443,  0.0055455 ,  0.00532796,  0.00506661,
        0.00493537,  0.00484104,  0.00475194,  0.00467934,  0.00445898,
        0.00441663,  0.00427458,  0.00408001,  0.00399618,  0.00390849,
        0.00387116,  0.00375466,  0.00371781,  0.00364178,  0.00357362,
        0.00353697,  0.00345694,  0.003397  ,  0.00335494,  0.00329722,
        0.00326957,  0.00321829,  0.00317035,  0.00311314,  0.00306839,
        0.00303147,  0.00300135,  0.00298866,  0.00294313,  0.0028714 ,
        0.00284942,  0.00281497,  0.00279665,  0.00276929,  0.00271822,
        0.002712  ,  0.00266507,  0.00261979,  0.00261043,  0.00257648,
        0.002567  ,  0.00253634,  0.00249379,  0.00248657,  0.00247373,
        0.00244616,  0.00242898,  0.00241299,  0.00239118,  0.00

(6, 10125)

Investigating component 1 with variance 0.0241523189247




(1814,)

array(['m1lenmin', 'innatsm', 'm1a7', ..., 'hv3k3e', 'hv3k3h', 'hv3l2'], dtype=object)

Investigating component 2 with variance 0.0148824780392


(1312,)

array(['innatsm', 'm1a7', 'm1a10', ..., 'hv3k3g', 'hv3l1', 'hv3l3'], dtype=object)

Investigating component 3 with variance 0.0121828089613


(1446,)

array(['innatsm', 'm1a7', 'm1a11a', ..., 'hv3l1', 'hv3l3', 'hv3l4'], dtype=object)

Investigating component 4 with variance 0.0114301942953


(892,)

array(['m1i2d', 'm1i4', 'f1b5d', 'f1k1a', 'f1k1d', 'm2a7a1', 'm2a7b2',
       'm2b17e', 'm2b19', 'm2b19a', 'm2b20b', 'm2c1a', 'm2c1b', 'm2c8c',
       'm2c10a2', 'm2c23a', 'm2c23b', 'm2c23c', 'm2c23d', 'm2c23e',
       'm2c23f', 'm2c24', 'm2c24a', 'm2c24b', 'm2c24c', 'm2c27b',
       'm2c36a1', 'm2d6b', 'm2d6d', 'm2e2', 'm2e2a1', 'm2e2a2', 'm2e2b',
       'm2e3', 'm2e4a', 'm2e6d', 'm2e8c', 'm2g5a1', 'm2h15', 'm2j16b2b',
       'm2k3a3', 'm2k3a14', 'm2k4', 'm2k5', 'm2k6', 'm2k6a', 'm2k6b',
       'm2k7a', 'm2k7b', 'm2k8a', 'm2k8b', 'm2k11a', 'm2k11b', 'm2k11c',
       'm2k15a', 'm2k16', 'm2k17b', 'm2k17e1', 'm2k17f1c', 'm2k18a1',
       'm2k18b', 'm2k18e', 'm2k18e1', 'm2k18f1a', 'm2k18f1b', 'm2k18f1c',
       'm2k18f1d', 'm2k19a', 'm2k19c', 'm2k19d', 'm2k19e', 'm2k19e1',
       'm2k19f', 'm2k19f1a', 'm2k19f1b', 'm2k19f1c', 'm2k19f1d', 'm2k20a',
       'm2k20a1', 'm2k20b', 'm2k20c', 'm2k20d', 'm2k20e', 'm2k20f1a',
       'm2k20f1b', 'm2k20f1c', 'm2l1', 'm2l1a', 'm2l2', 'm2l2a', 'm2l3',
 

Investigating component 5 with variance 0.0108284394053


(1087,)

array(['m1b15e', 'm1b15f', 'm1b16', ..., 'hv3k1d', 'hv3k2a', 'hv3k2c'], dtype=object)

In [26]:
# transform data sets, looking for features with high variance ratios
train_reduced_x = pca.transform(train_x) #[:,variance_ratios > 0.002]
test_reduced_x = pca.transform(test_x) #[:,variance_ratios > 0.002]
train_reduced_x.shape

(932, 100)

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error

# Test on
# Lasso - Linear Regression
# Lasso
# Random Forest/ Decision Tree
# SVR
# Kernel Ridge Regression?
# 

#clf - the classifier, params - params to CV, transform - to do any feature selection
classifiers = [
    {
        'clf': DecisionTreeRegressor(),
        'params': {
            "max_depth": [1000, None],
            "max_features": [None],
            "criterion": ["mse", "mae"]
        }
    },
    {
        'clf': RandomForestRegressor(),
        'params': {"max_depth": [1000, None],
                  "max_features": [None],
                  "criterion": ["mse", "mae"]}
    },
    {
        'clf': Lasso(),
        'params': {
            "alpha": [0.5, 1, 2, 4, 8],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        }
    },
    {
        'clf': LinearRegression(),
        'params': {
            "fit_intercept": [True, False],
            "normalize": [True, False]
        }
        
    },
    {
        'clf': ElasticNet(),
        'params': {
            "alpha": [0.5, 1, 2, 4, 8],
            "l1_ratio": [0.1, 0.2, 0.4, 0.6, 0.8],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "positive": [True, False]
        }
    },
#     {
#         'clf': KernelRidge(),
#         'params': {
            
#         }
        
#     },
    {
        'clf': LinearSVR(),
        'params': {
            "C": [1, 2, 4]
        }
        
    }
]

# run grid search without feature selection
# keeping track of best classifier
best_mse = 100
best_clf = 0
for clf in classifiers: 
    model = clf['clf']
    print("~~~ Fitting and Testing {} ~~~".format(type(model).__name__))
    grid_search = GridSearchCV(model, param_grid=clf['params'])

    dmy=grid_search.fit(train_reduced_x, train_y)
    grid_search.get_params()

    clf = grid_search.best_estimator_
    dmy=clf.fit(train_reduced_x, train_y)
    pred_y = clf.predict(test_reduced_x)

    mse = mean_squared_error(test_y, pred_y)
    "----MSE: {}----".format(mse)
    if (mse < best_mse):
        best_mse = mse
        best_clf = clf

~~~ Fitting and Testing DecisionTreeRegressor ~~~


{'cv': None,
 'error_score': 'raise',
 'estimator': DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
 'estimator__criterion': 'mse',
 'estimator__max_depth': None,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_split': 1e-07,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__presort': False,
 'estimator__random_state': None,
 'estimator__splitter': 'best',
 'fit_params': {},
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'criterion': ['mse', 'mae'],
  'max_depth': [1000, None],
  'max_features': [None]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': True,
 'scoring': None,
 'verbose': 0}

'----MSE: 0.813572961373----'

~~~ Fitting and Testing RandomForestRegressor ~~~


{'cv': None,
 'error_score': 'raise',
 'estimator': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
 'estimator__bootstrap': True,
 'estimator__criterion': 'mse',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_split': 1e-07,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 10,
 'estimator__n_jobs': 1,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'fit_params': {},
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'criterion': ['ms

'----MSE: 0.469380364807----'

~~~ Fitting and Testing Lasso ~~~


{'cv': None,
 'error_score': 'raise',
 'estimator': Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False),
 'estimator__alpha': 1.0,
 'estimator__copy_X': True,
 'estimator__fit_intercept': True,
 'estimator__max_iter': 1000,
 'estimator__normalize': False,
 'estimator__positive': False,
 'estimator__precompute': False,
 'estimator__random_state': None,
 'estimator__selection': 'cyclic',
 'estimator__tol': 0.0001,
 'estimator__warm_start': False,
 'fit_params': {},
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'alpha': [0.5, 1, 2, 4, 8],
  'fit_intercept': [True, False],
  'normalize': [True, False]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': True,
 'scoring': None,
 'verbose': 0}

'----MSE: 0.405931606195----'

~~~ Fitting and Testing LinearRegression ~~~


{'cv': None,
 'error_score': 'raise',
 'estimator': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 'estimator__copy_X': True,
 'estimator__fit_intercept': True,
 'estimator__n_jobs': 1,
 'estimator__normalize': False,
 'fit_params': {},
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'fit_intercept': [True, False], 'normalize': [True, False]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': True,
 'scoring': None,
 'verbose': 0}

'----MSE: 0.416820781899----'

~~~ Fitting and Testing ElasticNet ~~~


{'cv': None,
 'error_score': 'raise',
 'estimator': ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
       max_iter=1000, normalize=False, positive=False, precompute=False,
       random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
 'estimator__alpha': 1.0,
 'estimator__copy_X': True,
 'estimator__fit_intercept': True,
 'estimator__l1_ratio': 0.5,
 'estimator__max_iter': 1000,
 'estimator__normalize': False,
 'estimator__positive': False,
 'estimator__precompute': False,
 'estimator__random_state': None,
 'estimator__selection': 'cyclic',
 'estimator__tol': 0.0001,
 'estimator__warm_start': False,
 'fit_params': {},
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'alpha': [0.5, 1, 2, 4, 8],
  'fit_intercept': [True, False],
  'l1_ratio': [0.1, 0.2, 0.4, 0.6, 0.8],
  'normalize': [True, False],
  'positive': [True, False]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': True,
 'scoring': None,
 'verbose': 0}

'----MSE: 0.402995121796----'

~~~ Fitting and Testing LinearSVR ~~~


{'cv': None,
 'error_score': 'raise',
 'estimator': LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
      intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
      random_state=None, tol=0.0001, verbose=0),
 'estimator__C': 1.0,
 'estimator__dual': True,
 'estimator__epsilon': 0.0,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1.0,
 'estimator__loss': 'epsilon_insensitive',
 'estimator__max_iter': 1000,
 'estimator__random_state': None,
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'fit_params': {},
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'C': [1, 2, 4]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': True,
 'scoring': None,
 'verbose': 0}

'----MSE: 0.692577063598----'