## Feature selection

Create your own functions to filter features based on the following criteria:

* variance lower than *x*
* number of missing values is more than *x* %
* one feature from each pair of features whose correlation with each other is higher than *x*

Use two data sources as input:
- output dataset from the feature engineering exercise last week.
- output dataset from the PCA exercise

Apply your functions to the combination of these two data sources and come up with the final dataset that can be used for training.

> #### Note
> Don't forget to keep the target variable (`duration_seconds`) intact.

In [83]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

In [84]:
# NOTE(review): loading the PCA dataset is disabled here, so the rest of the
# notebook filters only 'feature_eng_data.csv'. The exercise text asks for both
# sources to be combined. Simply uncommenting these lines is not enough, because
# the next cell rebinds X.
# TODO: load into its own variable and join with the engineered features.
#X = pd.read_csv('pca_data.csv', index_col=0)
# X.head()

In [85]:
# Load the engineered features, then split off the target so that none of the
# filters below can remove it. Y stays a one-column DataFrame on the same index.
X = pd.read_csv('feature_eng_data.csv', index_col=0)
Y = X[['duration_seconds']].copy()
X = X.drop(columns=['duration_seconds'])
X.head()

Unnamed: 0,lat,lng,apparentTemperature,cloudCover,cloudCoverError,dewPoint,dewPointError,humidity,humidityError,precipIntensity,...,windBearing,windBearingError,windSpeed,windSpeedError,ast_absolute_magnitude_h,ast_estimated_diameter,ast_miss_distance,ast_relative_velocity,days_to_report,period_of_day
0,29.883056,-97.941111,25.86,0.0,,20.56,,0.73,,0.0,...,154.0,,4.89,,21.5,0.297879,42621696.0,13778.372043,19923,3.0
1,29.38421,-98.581082,26.12,0.0,,21.72,,0.77,,0.0,...,135.0,,6.6,,21.5,0.297879,42621696.0,13778.372043,20521,3.0
2,53.2,-2.916667,15.56,0.75,,12.86,,0.84,,,...,,,,,21.5,0.297879,42621696.0,13778.372043,19096,2.0
3,28.978333,-96.645833,22.68,0.12,,17.04,,0.71,,,...,136.0,,2.75,,21.5,0.297879,42621696.0,13778.372043,17265,3.0
4,21.418056,-157.803611,25.74,0.63,,21.47,,0.77,,,...,80.0,,3.6,,21.5,0.297879,42621696.0,13778.372043,15809,3.0


In [86]:
def var_filter(df,threshold=0.10):
    """Keep only the columns of ``df`` selected by a variance threshold.

    Selection is delegated to scikit-learn's ``VarianceThreshold``. Only its
    boolean support mask is used, and the columns are picked from ``df``
    itself, so the returned frame keeps the row index and the per-column
    dtypes of the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; it must be acceptable to ``VarianceThreshold.fit``.
    threshold : float, default 0.10
        Variance cut-off handed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` restricted to the selected columns. ``df`` itself is
        not modified.
    """
    vt = VarianceThreshold(threshold=threshold)
    # Fit only. Rebuilding a frame from the array returned by fit_transform
    # would replace the row index with 0..n-1 (breaking alignment with a
    # separately stored target) and cast every column to one common dtype.
    vt.fit(df)
    # get_support() returns one boolean per input column; True means "keep".
    keep_mask = vt.get_support()
    return df.loc[:, keep_mask].copy()

In [87]:
def miss_val_filter(df,threshold=0.30):
    """Return ``df`` without the columns that have too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    threshold : float, default 0.30
        Maximum tolerated fraction of missing rows per column. A column is
        removed only when its fraction is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the sparse columns removed; the remaining columns
        keep their original order.
    """
    # isnull() yields a boolean frame, so its column-wise mean is the share of
    # missing rows in each column.
    missing_share = df.isnull().mean()
    sparse_columns = missing_share.index[missing_share > threshold].tolist()
    return df.drop(columns=sparse_columns)

In [88]:
def corr_feat_filter(df,threshold=0.8):
    """Drop one feature from every pair of highly correlated features.

    A column is removed when its absolute correlation (as computed by
    ``DataFrame.corr``) with any column that appears *before* it in ``df`` is
    strictly greater than ``threshold``. The earlier column of each pair is
    the one that is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; it is not modified.
    threshold : float, default 0.8
        Absolute-correlation cut-off above which two features are treated as
        redundant.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant columns.
    """
    # step 1 - absolute correlation matrix
    corr = df.corr().abs()

    # step 2 - look only at the strict upper triangle (row position < column
    # position): this skips the diagonal and visits every pair exactly once.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    # NaN correlations compare as False and therefore never trigger a drop.
    too_high = (corr.values > threshold) & upper

    # step 3 - a column goes if it is too correlated with ANY earlier column.
    # Building the list once means no column is dropped twice, so there is no
    # KeyError to swallow.
    redundant = corr.columns[too_high.any(axis=0)].tolist()
    return df.drop(columns=redundant)

In [89]:
# Step 1: variance filter. The cut-off is the function's default, written out
# here so the value in use is visible where it is applied.
X = X.pipe(var_filter, threshold=0.10)
X.head()

Unnamed: 0,lat,lng,apparentTemperature,cloudCover,dewPoint,dewPointError,precipIntensity,pressure,pressureError,temperature,...,windBearing,windBearingError,windSpeed,windSpeedError,ast_absolute_magnitude_h,ast_estimated_diameter,ast_miss_distance,ast_relative_velocity,days_to_report,period_of_day
0,29.883056,-97.941111,25.86,0.0,20.56,,0.0,1009.05,,25.86,...,154.0,,4.89,,21.5,0.297879,42621696.0,13778.372043,19923.0,3.0
1,29.38421,-98.581082,26.12,0.0,21.72,,0.0,1008.81,,26.12,...,135.0,,6.6,,21.5,0.297879,42621696.0,13778.372043,20521.0,3.0
2,53.2,-2.916667,15.56,0.75,12.86,,,1019.0,,15.56,...,,,,,21.5,0.297879,42621696.0,13778.372043,19096.0,2.0
3,28.978333,-96.645833,22.68,0.12,17.04,,,1020.64,,22.68,...,136.0,,2.75,,21.5,0.297879,42621696.0,13778.372043,17265.0,3.0
4,21.418056,-157.803611,25.74,0.63,21.47,,,1015.33,,25.74,...,80.0,,3.6,,21.5,0.297879,42621696.0,13778.372043,15809.0,3.0


In [90]:
# Step 2: missing-value filter. The cut-off is the function's default, written
# out here so the value in use is visible where it is applied.
X = X.pipe(miss_val_filter, threshold=0.30)
X.head()

Unnamed: 0,lat,lng,apparentTemperature,cloudCover,dewPoint,pressure,temperature,time,visibility,windBearing,windSpeed,ast_absolute_magnitude_h,ast_estimated_diameter,ast_miss_distance,ast_relative_velocity,days_to_report,period_of_day
0,29.883056,-97.941111,25.86,0.0,20.56,1009.05,25.86,-638227800.0,16.09,154.0,4.89,21.5,0.297879,42621696.0,13778.372043,19923.0,3.0
1,29.38421,-98.581082,26.12,0.0,21.72,1008.81,26.12,-638226000.0,16.09,135.0,6.6,21.5,0.297879,42621696.0,13778.372043,20521.0,3.0
2,53.2,-2.916667,15.56,0.75,12.86,1019.0,15.56,-448959600.0,2.9,,,21.5,0.297879,42621696.0,13778.372043,19096.0,2.0
3,28.978333,-96.645833,22.68,0.12,17.04,1020.64,22.68,-417304800.0,16.09,136.0,2.75,21.5,0.297879,42621696.0,13778.372043,17265.0,3.0
4,21.418056,-157.803611,25.74,0.63,21.47,1015.33,25.74,-291060000.0,16.09,80.0,3.6,21.5,0.297879,42621696.0,13778.372043,15809.0,3.0


In [91]:
# Step 3: correlation filter. The cut-off is the function's default, written
# out here so the value in use is visible where it is applied.
X = X.pipe(corr_feat_filter, threshold=0.8)
X.head()

Unnamed: 0,lat,lng,apparentTemperature,cloudCover,pressure,time,visibility,windBearing,windSpeed,ast_absolute_magnitude_h,ast_miss_distance,ast_relative_velocity,period_of_day
0,29.883056,-97.941111,25.86,0.0,1009.05,-638227800.0,16.09,154.0,4.89,21.5,42621696.0,13778.372043,3.0
1,29.38421,-98.581082,26.12,0.0,1008.81,-638226000.0,16.09,135.0,6.6,21.5,42621696.0,13778.372043,3.0
2,53.2,-2.916667,15.56,0.75,1019.0,-448959600.0,2.9,,,21.5,42621696.0,13778.372043,2.0
3,28.978333,-96.645833,22.68,0.12,1020.64,-417304800.0,16.09,136.0,2.75,21.5,42621696.0,13778.372043,3.0
4,21.418056,-157.803611,25.74,0.63,1015.33,-291060000.0,16.09,80.0,3.6,21.5,42621696.0,13778.372043,3.0
