In [18]:
import pandas as pd
import numpy as np
import preprocess as pp
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
import eda
import statsmodels.api as sm

In [2]:
def wls_results_wald(feature_df, target_df, x=None, z=None, y='y2'):
    """Wald-test the quartile-specific x*z interaction terms of a WLS fit.

    The design matrix has five columns: an intercept, ``x``, and three
    interaction columns ``I * x * z`` where ``I`` flags rows whose ``z`` value
    lies in [Q1, Q2), [Q2, Q3) and [Q3, inf) respectively.  Rows with ``z``
    below the first quartile have all three interaction columns equal to zero.

    Parameters
    ----------
    feature_df : DataFrame
        Must contain the columns named by ``x`` and ``z`` plus a ``'weights'``
        column, which is passed to ``sm.WLS`` as the observation weights.
    target_df : DataFrame
        Must contain the column named by ``y``.
    x, z : column labels in ``feature_df``.
    y : column label in ``target_df`` (default ``'y2'``).

    Returns
    -------
    tuple
        ``(fvalue, pvalue)`` of the joint Wald test that the three interaction
        coefficients (design columns 2, 3 and 4) are zero.
    """
    z_values = feature_df[z]
    first_q = z_values.quantile(.25)
    second_q = z_values.quantile(.5)
    third_q = z_values.quantile(.75)

    # 0/1 membership flags for the three upper quartile bands of z.
    band_flags = (
        np.where((z_values >= first_q) & (z_values < second_q), 1, 0),
        np.where((z_values >= second_q) & (z_values < third_q), 1, 0),
        np.where(z_values >= third_q, 1, 0),
    )

    x_col = np.array(feature_df[x]).reshape(-1, 1)
    z_col = np.array(z_values).reshape(-1, 1)

    interaction_cols = [flag.reshape(-1, 1) * x_col * z_col for flag in band_flags]
    intercept = np.ones(x_col.shape)
    design = np.hstack([intercept, x_col] + interaction_cols)

    response = target_df[y]
    obs_weights = np.array(feature_df['weights']).reshape(-1, 1)

    fit = sm.WLS(response, design, weights=obs_weights).fit()

    # Rows 2..4 of the identity matrix select the three interaction coefficients.
    restriction = np.eye(len(fit.params))[2:5]
    wald = fit.wald_test(restriction)

    return wald.fvalue, wald.pvalue

In [3]:
def ranked_from_wald0(features, targets, top_k=10):
    """Rank (x, z) column pairs by the p-value of their interaction Wald test.

    Every column in the label slice ``'x1':'x34'`` is paired with every column
    in the label slice ``'z1':'z12'`` of ``features``; ``wls_results_wald`` is
    run for each pair (with its default target column) and the p-value kept.

    Row labels are taken from the z columns actually present, so the function
    works with fewer than twelve z columns and never attaches a p-value to the
    wrong z name when the column order is not z1..z12.

    Parameters
    ----------
    features : DataFrame
        Feature frame containing the x/z columns and whatever else
        ``wls_results_wald`` reads from it.
    targets : DataFrame
        Target frame forwarded to ``wls_results_wald``.
    top_k : int, optional
        Number of leading entries of the sorted list to return (default 10,
        the value that used to be hard-coded).

    Returns
    -------
    list
        The first ``top_k`` items of ``eda.sort_scores1`` applied to a list of
        ``((x_name, z_name), p_value)`` tuples.
        NOTE(review): the sort order is defined by ``eda.sort_scores1``;
        presumably ascending p-value - confirm in ``eda``.
    """
    z_list = features.loc[:, 'z1':'z12'].columns
    x_list = features.loc[:, 'x1':'x34'].columns

    # One row of p-values per z column, keyed by the real column label.
    p_rows = {}
    for z in z_list:
        p_rows[z] = [
            wls_results_wald(features, targets, x=feat, z=z)[1]
            for feat in x_list
        ]

    p_df = pd.DataFrame.from_dict(p_rows, orient='index', columns=x_list)

    # Flatten x-major / z-minor, the same order the scores were listed in before.
    p_list = [
        ((col, idx), p_df.loc[idx, col])
        for col in p_df.columns
        for idx in p_df.index
    ]

    # ranked pairs with pvals
    ranked_p = eda.sort_scores1(p_list)

    return ranked_p[:top_k]

In [4]:
def normalize_and_fill(df):
    """Standardise the 'x1'..'z12' columns and replace missing values with 0.

    The input frame is left untouched: the work is done on a copy, so calling
    this on a slice of another DataFrame neither alters that frame nor
    triggers pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : DataFrame
        Frame containing the label slice ``'x1':'z12'``.

    Returns
    -------
    tuple
        ``(normalized_df, mean, std)`` where ``mean`` and ``std`` are the
        per-column statistics of the original ``'x1':'z12'`` columns (so they
        can be re-applied to another data set) and ``normalized_df`` has every
        NaN in any column replaced by 0.
    """
    df = df.copy()

    block = df.loc[:, 'x1':'z12']
    mean = block.mean()
    std = block.std()
    df.loc[:, 'x1':'z12'] = (block - mean) / std

    # fillna runs over the whole frame, not just the normalised columns.
    df = df.fillna(0)

    return df, mean, std

In [5]:
def train_and_test(features_df, targets_df, year=None, 
                   train_month_start=None, train_day_start=None,
                   train_month_end=None, train_day_end=None,
                   test_month_start=None, test_day_start=None,
                   test_month_end=None, test_day_end=None):
    
    train_features = features_df.loc[(features_df['datetime'].dt.date >= datetime.date(year, train_month_start, train_day_start))
                                    & (features_df['datetime'].dt.date <= datetime.date(year, train_month_end, train_day_end))]
    train_targets = targets_df.loc[(targets_df['datetime'].dt.date >= datetime.date(year, train_month_start, train_day_start))
                                    & (targets_df['datetime'].dt.date <= datetime.date(year, train_month_end, train_day_end))]
    
    test_features = features_df.loc[(features_df['datetime'].dt.date >= datetime.date(year, test_month_start, test_day_start))
                                    & (features_df['datetime'].dt.date <= datetime.date(year, test_month_end, test_day_end))]
    test_targets = targets_df.loc[(targets_df['datetime'].dt.date >= datetime.date(year, test_month_start, test_day_start))
                                    & (targets_df['datetime'].dt.date <= datetime.date(year, test_month_end, test_day_end))]
    
    return train_features, train_targets, test_features, test_targets

In [6]:
def selection(train_features, train_targets, test_features, test_targets):
    """Build the "all" and "top" interaction design matrices for train and test.

    Steps:
      1. Standardise the train features with ``normalize_and_fill`` and apply
         the *train* mean/std to the same ``'x1':'z12'`` columns of the test
         features; missing values become 0 in both.
      2. Rank (x, z) pairs on the train set with ``ranked_from_wald0`` and
         print each returned pair.
      3. "top" matrices: one ``x * z`` column per returned pair.
      4. "all" matrices: for every x column, the x column itself followed by
         ``x * z`` for every z column.

    Both input feature frames are copied before normalisation, so the caller's
    DataFrames are not modified.

    Parameters
    ----------
    train_features, test_features : DataFrame
        Frames containing the label slices ``'x1':'x34'`` and ``'z1':'z12'``.
    train_targets : DataFrame
        Forwarded to ``ranked_from_wald0``.
    test_targets : DataFrame
        Unused; kept so the signature stays unchanged.

    Returns
    -------
    tuple of ndarray
        ``(train_features_all, test_features_all,
        train_features_top, test_features_top)``.
    """
    z_list = train_features.loc[:, 'z1':'z12'].columns
    x_list = train_features.loc[:, 'x1':'x34'].columns

    # Copies keep the caller's frames intact regardless of how the helper
    # treats its argument.
    train_features, mean, std = normalize_and_fill(train_features.copy())

    # Normalise exactly the columns the train statistics were computed on.
    test_features = test_features.copy()
    test_features.loc[:, 'x1':'z12'] = (test_features.loc[:, 'x1':'z12'] - mean) / std
    test_features = test_features.fillna(0)

    def _column(frame, name):
        # Single column as an (n, 1) array so products stay column vectors.
        return np.array(frame[name]).reshape(-1, 1)

    ranked_p = ranked_from_wald0(train_features, train_targets)

    pairs_list_train_top = []
    pairs_list_test_top = []
    for pair in ranked_p:
        print(pair)
        x = pair[0][0]
        z = pair[0][1]
        pairs_list_train_top.append(_column(train_features, x) * _column(train_features, z))
        pairs_list_test_top.append(_column(test_features, x) * _column(test_features, z))

    pairs_list_train_all = []
    pairs_list_test_all = []
    for x in x_list:
        x_col_train = _column(train_features, x)
        x_col_test = _column(test_features, x)
        pairs_list_train_all.append(x_col_train)
        pairs_list_test_all.append(x_col_test)
        for z in z_list:
            pairs_list_train_all.append(x_col_train * _column(train_features, z))
            pairs_list_test_all.append(x_col_test * _column(test_features, z))

    train_features_all = np.concatenate(pairs_list_train_all, axis=1)
    test_features_all = np.concatenate(pairs_list_test_all, axis=1)
    train_features_top = np.concatenate(pairs_list_train_top, axis=1)
    test_features_top = np.concatenate(pairs_list_test_top, axis=1)

    return train_features_all, test_features_all, train_features_top, test_features_top

In [19]:
def cov_and_r2(train_features_all, train_features_top, train_targets, test_features_all, test_features_top, test_targets,
               n_components=10, random_state=0):
    """Compare linear regressions on the "all", "top" and PCA feature sets.

    Three ``LinearRegression`` models are fitted on the train data against the
    ``'y2'`` target column: one on all features, one on the top features and
    one on a PCA projection of all features.  The test-set R2 of each model is
    printed.

    Parameters
    ----------
    train_features_all, test_features_all : array-like
        Full design matrices for train and test.
    train_features_top, test_features_top : array-like
        Design matrices restricted to the selected features.
    train_targets, test_targets : DataFrame
        Frames containing a ``'y2'`` column.
    n_components : int, optional
        Number of principal components (default 10, the previous fixed value).
    random_state : int or None, optional
        Seed handed to ``PCA`` so the projection is reproducible when the
        solver chosen by scikit-learn is a randomised one (default 0).

    Returns
    -------
    ndarray
        3x3 covariance matrix of the test predictions, in the order
        (all, top, pca).
    """
    y_train = np.array(train_targets['y2']).reshape(-1, 1)
    y_test = np.array(test_targets['y2']).reshape(-1, 1)

    # The projection is learned on the train set only and then applied to both.
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(train_features_all)
    train_features_pca = pca.transform(train_features_all)
    test_features_pca = pca.transform(test_features_all)

    feature_sets = (
        ('all', train_features_all, test_features_all),
        ('top', train_features_top, test_features_top),
        ('pca', train_features_pca, test_features_pca),
    )

    predictions = []
    for _, train_x, test_x in feature_sets:
        model = LinearRegression().fit(train_x, y_train)
        predictions.append(np.array(model.predict(test_x)).reshape(-1,))

    # Each row of the stacked array is one model's prediction vector.
    cov = np.cov(np.stack(predictions, axis=0))

    for (label, _, _), pred in zip(feature_sets, predictions):
        print('R2 from {} features'.format(label))
        print(r2_score(y_test, pred))

    return cov

In [8]:
# Location of the per-month feature/target dumps.
DATA_DIR = '/u/project/cratsch/tescala/month_split_right'

# z columns re-appended, in this order, after the 'datetime'..'z1' block.
z_list = ['z2', 'z3', 'z4', 'z5', 'z6', 'z7', 'z8', 'z9', 'z10', 'z11', 'z12']


def load_month(month):
    """Read the 2015 feature and target files for a month abbreviation such as 'jan'."""
    features = pp.read_npy1('{}/features_{}_2015.npy'.format(DATA_DIR, month), features=True)
    targets = pp.read_npy1('{}/targets_{}_2015.npy'.format(DATA_DIR, month), targets=True)
    return features, targets


def reorder_z_columns(features, trailing_z):
    """Return a new frame: columns 'datetime'..'z1' followed by ``trailing_z`` in order.

    The slice is copied before columns are added, so the source frame is not
    modified and pandas does not warn about writing to a slice.
    NOTE(review): presumably needed so that the label slice 'z1':'z12' used by
    the analysis functions covers every z column - confirm against the raw
    column order.
    """
    reordered = features.loc[:, 'datetime':'z1'].copy()
    for name in trailing_z:
        reordered[name] = features[name]
    return reordered


jan_features, jan_targets = load_month('jan')
feb_features, feb_targets = load_month('feb')
mar_features, mar_targets = load_month('mar')

new_jan_features = reorder_z_columns(jan_features, z_list)
new_feb_features = reorder_z_columns(feb_features, z_list)
new_mar_features = reorder_z_columns(mar_features, z_list)

comb_features = pd.concat([new_jan_features, new_feb_features, new_mar_features], ignore_index=True)
comb_targets = pd.concat([jan_targets, feb_targets, mar_targets], ignore_index=True)

In [9]:
# Train on 2015-01-05 .. 2015-01-30 and test on 2015-02-02 .. 2015-02-06
# (both windows include their end dates).
(train_features,
 train_targets,
 test_features,
 test_targets) = train_and_test(
    comb_features,
    comb_targets,
    year=2015,
    train_month_start=1, train_day_start=5,
    train_month_end=1, train_day_end=30,
    test_month_start=2, test_day_start=2,
    test_month_end=2, test_day_end=6,
)

In [10]:
# Build the design matrices: "all" holds every x column plus every x*z product,
# "top" holds only the x*z products of the top-ranked pairs (printed below).
(train_features_all,
 test_features_all,
 train_features_top,
 test_features_top) = selection(
    train_features, train_targets, test_features, test_targets
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, v)


(('x32', 'z7'), array(2.39009491e-175))
(('x7', 'z8'), array(2.21796921e-151))
(('x33', 'z7'), array(1.79383485e-144))
(('x2', 'z8'), array(5.42365083e-132))
(('x8', 'z8'), array(2.26152532e-130))
(('x8', 'z4'), array(1.42159615e-129))
(('x3', 'z8'), array(3.16501336e-121))
(('x6', 'z8'), array(5.21929203e-111))
(('x34', 'z7'), array(3.63948426e-110))
(('x5', 'z7'), array(1.27055987e-85))


In [20]:
# Fit the three regressions, print their test-set R2 scores and display the
# covariance matrix of their predictions.
cov = cov_and_r2(
    train_features_all, train_features_top, train_targets,
    test_features_all, test_features_top, test_targets,
)
cov

R2 from all features
0.002191605368637828
R2 from top features
0.0006632985064024544
R2 from pca features
0.00038912384064382355


array([[2.69763101e-08, 5.58738119e-09, 2.48342888e-09],
       [5.58738119e-09, 5.90822340e-09, 2.02258017e-09],
       [2.48342888e-09, 2.02258017e-09, 2.86853786e-09]])

In [23]:
# Scratch check: a list of (x, z) name pairs rendered as a two-column frame.
pd.DataFrame(
    np.array([('x1', 'z1'), ('x3', 'z5')])
)

Unnamed: 0,0,1
0,x1,z1
1,x3,z5
