# Objective
To eliminate non-predictive features

Some assumptions:
* UCI data contains 5 true predictors and 15 additional features which are linear combinations of true predictors.
* DSI Madelon data contains some true predictors as well as linear combinations of the true predictors. The number of features in these two categories is unknown.
* This information does allow us to select likely predictors by finding features which are related to one another, either via correlation or through modeling (the latter is easier to functionalize).

In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

In [2]:
# Pair every dataset with the short tag that identifies it
# (the tag is later used to build the results file name).
X_target = list(zip(
    (Xuci_1, Xuci_2, Xuci_3, Xdb_1, Xdb_2, Xdb_3),
    ('uci_1', 'uci_2', 'uci_3', 'db_1', 'db_2', 'db_3'),
))

In [4]:
#to clear up some memory in the workspace
%reset_selective -f Xdb_1, Xdb_2, Xdb_3, ydb_1, ydb_2, ydb_3

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from tqdm import tqdm

In [6]:
def calculate_r_2_for_feature(data, feature):
    """Score how well one column can be predicted from all the others.

    The column named ``feature`` is used as the target and the remaining
    columns of ``data`` as inputs. A scaler + decision-tree-regressor
    pipeline is fitted on a random 75% of the rows and scored on the
    held-out 25%.

    Parameters
    ----------
    data : table-like object supporting ``drop(..., axis=1)`` and column lookup
    feature : label of the column to treat as the target

    Returns
    -------
    The pipeline's ``score`` on the held-out rows. No random seed is set,
    so repeated calls can give different values.
    """
    predictors = data.drop(feature, axis=1)
    target = data[feature]

    # Split first, then fit, so random draws happen in the same order as before.
    split = train_test_split(predictors, target, test_size=0.25)
    X_train, X_test, y_train, y_test = split

    steps = [
        ('scaler', StandardScaler()),
        ('model', DecisionTreeRegressor()),
    ]
    fitted = Pipeline(steps).fit(X_train, y_train)
    return fitted.score(X_test, y_test)

In [None]:
def mean_r2_for_feature(data, feature):
    """Average the score of ``calculate_r_2_for_feature`` over up to 5 runs.

    Each run uses a fresh random split. As soon as a run yields a negative
    score the remaining runs are skipped, and the mean of the scores
    collected so far (including the negative one) is returned.
    """
    collected = []
    for _ in range(5):
        collected.append(calculate_r_2_for_feature(data, feature))

        # A negative score ends the repeats early; no further fits are run.
        if collected[-1] < 0:
            break

    return np.array(collected).mean()

In [None]:
# Score every column of the selected dataset(s) and pickle one results
# table per dataset. The slice [3:4] selects only the 'db_1' entry.
for data_src in X_target[3:4]:
    data, src = data_src
    results_R2 = []

    # One [feature, mean score] row per column; tqdm shows progress.
    for feature in tqdm(data.columns):
        row = [feature, mean_r2_for_feature(data, feature)]
        results_R2.append(row)

    results_df = pd.DataFrame(results_R2, columns=['Feature', 'R2'])
    out_path = 'feature_results_' + src + '.pickle'
    results_df.to_pickle(out_path)




  3%|▎         | 28/1000 [08:52<5:08:22, 19.04s/it]