# Feature Selection & Dimensionality Reduction
Try out different feature combinations and dimensionality-reduction techniques (PCA/LDA).

In [257]:
# Feature-selection strategy.
# Possible: "manual", "none"
feature_selection_method = "none"

# Scaling applied to the numerical columns.
# Possible: "min-max", "z-score", "combined", "none"
feature_normalisation_method = "min-max"

# Dimensionality-reduction technique.
# Possible: "pca", "lda", "none"
dim_reduction_method = "lda"

# Value passed as n_components to PCA when dim_reduction_method is "pca"
pca_num_components = 10

In [258]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Lift the column display limit so wide DataFrames are shown in full
pd.options.display.max_columns = None

### Load Data

In [259]:
# Load the preprocessed training data from final_df.csv
train_preprocessed_df = pd.read_csv("final_df.csv")
# 'Unnamed: 0' is presumably the index column written out with the CSV
# (TODO confirm); it is not a feature, so it is removed here.
train_df = train_preprocessed_df.drop(columns='Unnamed: 0')


#test_df = pd.read_csv("project/data/test/test.csv", parse_dates = ['date_recorded' ],  na_values = [0, '0'])


# Columns treated as categorical regardless of the dtype they are stored with
categorical_cols = ['funder','installer','basin', 'region_code', 'extraction_type', 'extraction_type_class', 'payment_type', 'water_quality',
 'quantity', 'source', 'waterpoint_type', 'scheme_management', 'season', 'region_district', 'authority_scheme', 'population_bins', "amount_tsh_missing", "permit"]

# Numerical cols: numeric-dtype columns that are not listed as categorical.
# Filtering in DataFrame column order (rather than taking a set difference)
# makes the resulting list identical on every run.
# NOTE(review): the recorded output shows 'id' ends up in this list - confirm
# that an identifier column is really meant to be scaled and used as a feature.
numerical_cols = [
    col
    for col in train_df.select_dtypes(include='number').columns
    if col not in categorical_cols
]

print(train_df.columns)

# All columns except the target
featureList = list(train_df.columns)
featureList.remove("status_group")
print(featureList)

print(f"Numerical Cols: {numerical_cols}")

Index(['id', 'status_group', 'funder', 'installer', 'latitude', 'basin',
       'region_code', 'public_meeting', 'extraction_type',
       'extraction_type_class', 'payment_type', 'water_quality', 'quantity',
       'source', 'waterpoint_type', 'longitude', 'permit', 'scheme_management',
       'gps_height', 'population', 'age', 'season', 'amount_tsh_missing',
       'region_district', 'authority_scheme'],
      dtype='object')
['id', 'funder', 'installer', 'latitude', 'basin', 'region_code', 'public_meeting', 'extraction_type', 'extraction_type_class', 'payment_type', 'water_quality', 'quantity', 'source', 'waterpoint_type', 'longitude', 'permit', 'scheme_management', 'gps_height', 'population', 'age', 'season', 'amount_tsh_missing', 'region_district', 'authority_scheme']
Numerical Cols: ['longitude', 'age', 'gps_height', 'population', 'latitude', 'id', 'public_meeting']


## Feature Selection

### No Feature Selection
Simply use all of the processed/engineered features.

In [260]:
# "none" is a deliberate no-op: train_df is left untouched
if feature_selection_method == "none":
    pass

### Manual Feature Selection
BrendaLoznik drops these features:

`'amount_tsh', 'date_recorded',  'extraction_type_group', 'lga', 'management', 'management_group', 'num_private', 'payment', 'source_class', 'source_type', 'quality_group', 'quantity_group', 'ward', 'waterpoint_type_group', 'wpt_name', 'scheme_name', 'subvillage', 'recorded_by', 'id', 'status_group'`

In [261]:
if feature_selection_method == "manual":
    # Columns excluded by the manual selection. 'status_group' is deliberately
    # not listed here: it is the target, and later cells still read it from
    # train_df (featureList.remove("status_group") and the LDA fit).
    features_to_drop = ['amount_tsh', 'date_recorded',  'extraction_type_group', 'lga', 'management', 'management_group',
                        'num_private', 'payment', 'source_class', 'source_type', 'quality_group', 'quantity_group', 'ward',
                        'waterpoint_type_group', 'wpt_name', 'scheme_name', 'subvillage', 'recorded_by', 'id']

    # DataFrame.drop removes *row* labels unless columns= (or axis=1) is given.
    # errors="ignore" skips names that are not present in the frame, since the
    # list also names columns that are absent from the recorded column listing.
    train_df = train_df.drop(columns=features_to_drop, errors="ignore")

    # Keep the list used by the scaling cells in sync with the surviving columns
    numerical_cols = [col for col in numerical_cols if col in train_df.columns]

In [262]:
# Rebuild the feature list after feature selection: every column but the target
featureList = train_df.columns.tolist()
featureList.remove("status_group")  # ValueError here means the target column is missing

## Feature Normalisation

### Min-Max Scaling


In [263]:
if feature_normalisation_method == "min-max":

    # Rescale every numerical column with MinMaxScaler (default range 0..1).
    # fit_transform learns the per-column min/max and applies them in one call.
    scaler = MinMaxScaler()
    train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])

### Z-score Normalisation

In [264]:
if feature_normalisation_method == "z-score":

    # Standardise every numerical column with StandardScaler
    # (subtract the column mean, divide by the column standard deviation).
    # fit_transform learns those statistics and applies them in one call.
    scaler = StandardScaler()
    train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])



### Combined Scaling
Manually combine Min-Max scaling, Z-score normalisation and Robust scaling on a feature-by-feature basis.

In [265]:
if feature_normalisation_method == "combined":

    # Z-score standardisation (zero mean, unit variance).
    # "population" is intentionally NOT in this group: it is handled by the
    # robust scaler below, and listing it in both groups would scale the
    # column twice (z-scored first, then robust-transformed with statistics
    # that were fitted on the raw values).
    zscoreColumns = ["gps_height", "age"]

    # Maintain original distribution - flatten to 0..1
    minmaxColumns = ["latitude", "longitude"]

    # Normalisation for when there's outliers
    robustColumns = ["population"]


    mmScaler = MinMaxScaler()
    zScaler = StandardScaler()
    rScaler = RobustScaler()

    # The column groups are disjoint, so each scaler is fitted and applied
    # to its own group independently of the others.
    train_df[zscoreColumns] = zScaler.fit_transform(train_df[zscoreColumns])
    train_df[minmaxColumns] = mmScaler.fit_transform(train_df[minmaxColumns])
    train_df[robustColumns] = rScaler.fit_transform(train_df[robustColumns])

## Dimensionality Reduction

### PCA

In [266]:
if dim_reduction_method == "pca":
    # Reduce the feature columns to pca_num_components principal components.
    # NOTE(review): train_df is rebound to the PCA output, which is computed
    # from featureList only - the "status_group" column is not carried over.
    # Confirm later cells do not need the target from train_df.
    pca = PCA(n_components=pca_num_components)
    train_df = pca.fit_transform(train_df[featureList])

### LDA

In [267]:
if dim_reduction_method == "lda":

    print(featureList)

    # Fit LDA on the feature columns, using status_group as the class labels
    lda = LinearDiscriminantAnalysis().fit(train_df[featureList], train_df["status_group"])

    # Project the features into the LDA space.
    # NOTE(review): train_df is rebound to the transformed features, so the
    # "status_group" column is no longer part of it after this cell.
    # Confirm later cells do not need the target from train_df.
    train_df = lda.transform(train_df[featureList])

['id', 'funder', 'installer', 'latitude', 'basin', 'region_code', 'public_meeting', 'extraction_type', 'extraction_type_class', 'payment_type', 'water_quality', 'quantity', 'source', 'waterpoint_type', 'longitude', 'permit', 'scheme_management', 'gps_height', 'population', 'age', 'season', 'amount_tsh_missing', 'region_district', 'authority_scheme']
