In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Column names for the dataset, generated instead of typed out by hand:
# 'name' and 'hazType' first, then every prefix (mean, median, ...) combined
# with every suffix (Xdist, Ydist, ...) in prefix-major order, then 'action'.
feature_names = [
    'name',
    'hazType',
    *(
        prefix + suffix
        for prefix in ('mean', 'median', 'std', 'min', 'max',
                       'range', 'skew', 'kurt', 'cv', 'mad')
        for suffix in ('Xdist', 'Ydist', 'Angle', 'SLdist', 'Xspeed', 'Yspeed')
    ),
    'action',
]

# Load the dataset. The file's own header row (row 0) is discarded in favour
# of feature_names, and column 0 is used as the row index.
data = pd.read_csv(
    "april10dataNoNaN.csv",
    header=0,
    index_col=0,
    names=feature_names,
)

# NOTE(review): 'RT' does not appear in feature_names, so with the names
# given above the two lines below look like they would raise KeyError --
# confirm where the response-time column sits in the CSV and add it to the
# name list.

# Predictors: every column except the outputs (action and response time).
X = data.drop(columns=['action', 'RT'])

# Target: the response time column only.
y = data.loc[:, "RT"]

# Sweep the number of features kept by recursive feature elimination (RFE)
# and record, for each candidate size, the R^2 of a linear regression on a
# held-out 30% test split. The best size ends up in `nof`, its score in
# `high_score`, and every score (in sweep order) in `score_list`.

# Candidate sizes run from 1 to c-1, where c is the number of columns in X.
r, c = np.shape(X)
featuresToKeep = np.arange(1, c)

# Start below any attainable score: R^2 on held-out data can be zero or
# negative, and a baseline of 0 would then leave nof at 0.
high_score = float("-inf")

nof = 0
score_list = []

# random_state is fixed, so the split is identical for every candidate size;
# compute it once instead of on every iteration.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

for n_features in featuresToKeep:
    model = LinearRegression()
    # n_features_to_select is passed by keyword: current scikit-learn
    # releases reject it as a positional argument.
    rfe = RFE(model, n_features_to_select=n_features)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    # Strict comparison: on ties the smaller feature count is kept.
    if score > high_score:
        high_score = score
        nof = n_features

print("Optimum number of features: %d" % nof)
print("r2 with %d features: %f" % (nof, high_score))

# Refit RFE on the complete dataset using the chosen feature count (nof) and
# collect the names of the columns it keeps.
cols = list(X.columns)

# Estimator that RFE wraps and ranks features with.
model = LinearRegression()

# n_features_to_select is passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(model, n_features_to_select=nof)

# Fit the selector and reduce X to the retained columns.
X_rfe = rfe.fit_transform(X, y)

# Fit the regression on the reduced feature matrix.
model.fit(X_rfe, y)

# rfe.support_ is a boolean mask over the columns of X; label it with the
# column names and keep the names where the mask is True.
temp = pd.Series(rfe.support_, index=cols)
selected_features_rfe = temp[temp].index

# print(selected_features_rfe)

# new dataset containing selected features only
new_data = data[selected_features_rfe]

# Correlation between all the remaining (RFE-selected) features.
# DataFrame.corr() returns the complete pairwise matrix in a single call, so
# no per-pair loop is needed (the earlier nested loop computed each pair and
# threw the result away).

# plot correlation matrix
print('correlation matrix for all selected features')
plt.figure()
cor_new = new_data.corr()
sns.heatmap(cor_new, annot=True, cmap=plt.cm.Reds)
plt.show()

# Same shape as cor_new, keeping only the entries below 0.5; all other
# entries become NaN.
# NOTE(review): the comparison is signed, so strongly negative correlations
# also count as "below 0.5" -- confirm whether cor_new.abs() was intended.
relevant_features_new = cor_new[cor_new < 0.5]

# Remove the hand-picked, highly correlated (> .5) columns from the selected
# features. 'rangeXdist' is listed twice, exactly as in the original list.
new_data2 = new_data.drop(
    columns=[
        "medianAngle", 'minYdist', 'minAngle', 'maxYdist', 'maxAngle',
        'medianSLdist', 'stdXdist', 'stdSLdist', 'maxSLdist', 'rangeXdist',
        'madXdist', 'madSLdist', 'rangeSLdist', 'minXspeed', 'maxXspeed',
        'medianYspeed', 'rangeYdist', 'rangeXdist', 'rangeAngle',
        'rangeXspeed', 'rangeYspeed', 'cvXspeed', 'maxXdist', 'kurtSLdist',
        'madYspeed', 'madAngle', 'stdYdist', 'minYspeed', 'cvSLdist',
    ]
)

# Heatmap of the correlations among the columns that remain.
print('correlation matrix for independant features')
plt.figure()
cor_new = new_data2.corr()
sns.heatmap(cor_new, annot=True, cmap=plt.cm.Reds)
plt.show()

# Names of the columns left after the drop, i.e. the final feature list.
print(cor_new.columns.tolist())



Optimum number of features: 39
r2 with 39 features: 0.989964
correlation matrix for all selected features
