# Feature Selection & Dimensionality Reduction
Try out different feature combinations, and PCA/LDA.

In [642]:
feature_selection_method = "none"
# Possible: "manual", "none"

feature_normalisation_method = "min-max"
# Possible: "min-max", "z-score", "combined", "none"

dim_reduction_method = "lda"
# Possible: "pca", "lda", "none"

pca_num_components = 10

In [643]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Load Data

In [644]:
train_preprocessed_df = pd.read_csv("final_df.csv")
train_df = train_preprocessed_df.drop('Unnamed: 0', axis=1)


#test_df = pd.read_csv("project/data/test/test.csv", parse_dates = ['date_recorded' ],  na_values = [0, '0'])


# Numerical cols
numerical_cols = train_df.select_dtypes(include='number').columns.tolist()
categorical_cols = ['funder','installer','basin', 'region_code', 'extraction_type', 'extraction_type_class', 'payment_type', 'water_quality',
 'quantity', 'source', 'waterpoint_type', 'scheme_management', 'season', 'region_district', 'authority_scheme', 'population_bins', "amount_tsh_missing", "permit"]

numerical_cols = list(set(numerical_cols) - set(categorical_cols))



print(f"Numerical Cols: {numerical_cols}")

Numerical Cols: ['longitude', 'age', 'gps_height', 'population', 'latitude', 'id', 'public_meeting']


  train_preprocessed_df = pd.read_csv("final_df.csv")


In [645]:
temp_df = pd.read_csv("cleaned_df.csv")

  temp_df = pd.read_csv("cleaned_df.csv")


## (Continued) Feature Engineering

### Funder + Installer Cardinality

In [646]:
print(temp_df[["funder", "installer"]].head())

    funder     installer
0    roman         roman
1  grumeti       grumeti
2    other  world vision
3   unicef        unicef
4    other       artisan


In [647]:
print(temp_df["funder"].unique().size)

151


In [648]:
#overall counts
temp_df['status_group'].value_counts(True)

functional                 0.543090
non functional             0.384232
functional needs repair    0.072678
Name: status_group, dtype: float64

In [649]:
# Filter the DataFrame based on the specific value in 'column1'
filtered_data = temp_df[temp_df['funder'] == "germany"]

# Compute the value counts of 'column2' for the specific value in 'column1'
filtered_data['status_group'].value_counts(True)

functional                 0.772277
non functional             0.207921
functional needs repair    0.019802
Name: status_group, dtype: float64

In [650]:
# Compute the value counts of the column
value_counts = temp_df['funder'].value_counts()

# Identify values with fewer than X instances
values_to_replace = value_counts[value_counts < 10].index.tolist()

# Replace values with "other"
temp_df['funder'] = temp_df['funder'].replace(values_to_replace, "other")
temp_df["funder"].value_counts()

other                             16170
government of tanzania            11299
danida                             3907
hesawa                             2782
rwssp                              1703
world bank                         1701
kkkt                               1623
world vision                       1562
unicef                             1324
tasaf                              1135
dhv                                1065
district council                   1037
private individual                 1034
dwsp                               1012
norad                               949
germany republi                     765
water                               739
tcrs                                735
ministry of water                   728
dwe                                 605
netherlands                         592
hifab                               577
adb                                 551
lga                                 546
amref                               525


## Feature Selection

### No Feature Selection
Simply use all of the processed/engineered features.

In [651]:
if(feature_selection_method == "none"):
    pass

### Manual Feature Selection
BrendaLoznik drops these features:

`'amount_tsh', 'date_recorded',  'extraction_type_group', 'lga', 'management', 'management_group', 'num_private', 'payment', 'source_class', 'source_type', 'quality_group', 'quantity_group', 'ward', 'waterpoint_type_group', 'wpt_name', 'scheme_name', 'subvillage', 'recorded_by', 'id', 'status_group'`

In [652]:
if(feature_selection_method == "manual"):
    features_to_drop = ['amount_tsh', 'date_recorded',  'extraction_type_group', 'lga', 'management', 'management_group',
                        'num_private', 'payment', 'source_class', 'source_type', 'quality_group', 'quantity_group', 'ward',
                        'waterpoint_type_group', 'wpt_name', 'scheme_name', 'subvillage', 'recorded_by', 'id', 'status_group']
    train_df = train_df.drop(features_to_drop)

In [653]:
featureList = list(train_df.columns)
featureList.remove("status_group")

## Feature Normalisation

### Min-Max Scaling


In [654]:
if(feature_normalisation_method == "min-max"):

    # Create an instance of MinMaxScaler
    scaler = MinMaxScaler()

    # Fit the scaler to your data
    scaler.fit(train_df[numerical_cols])

    # Perform min-max scaling on your data
    train_df[numerical_cols] = scaler.transform(train_df[numerical_cols])

### Z-score Normalisation

In [655]:
if(feature_normalisation_method == "z-score"):

    # Create an instance of MinMaxScaler
    scaler = StandardScaler()

    # Fit the scaler to your data
    scaler.fit(train_df[numerical_cols])

    # Perform min-max scaling on your data
    train_df[numerical_cols] = scaler.transform(train_df[numerical_cols])


### Combined Scaling
Manually combine Min-Max scaling, Z-score normalisation and Robust scaling on a feature-by-feature basis.

In [656]:
if(feature_normalisation_method == "combined"):

    # Convert to normal distribution
    zscoreColumns = ["population", "gps_height", "age"]

    # Maintain original distribution - flatten to 0..1
    minmaxColumns = ["latitude", "longitude"]

    # Normalisation for when there's outliers
    robustColumns = ["population"]


    mmScaler = MinMaxScaler()
    zScaler = StandardScaler()
    rScaler = RobustScaler()

    # Fit the scalers to data
    mmScaler.fit(train_df[minmaxColumns])
    zScaler.fit(train_df[zscoreColumns])
    rScaler.fit(train_df[robustColumns])

    # Apply scaling to each column group
    train_df[zscoreColumns] = zScaler.transform(train_df[zscoreColumns])
    train_df[minmaxColumns] = mmScaler.transform(train_df[minmaxColumns])
    train_df[robustColumns] = rScaler.transform(train_df[robustColumns])

## Dimensionality Reduction

### PCA

In [657]:
if(dim_reduction_method=="pca"):
    # Apply PCA to all columns
    pca = PCA(n_components=pca_num_components)
    reduced_df = pca.fit_transform(train_df[featureList])

### LDA

In [658]:
if(dim_reduction_method=="lda"):

    # Create an instance of LinearDiscriminantAnalysis
    lda = LinearDiscriminantAnalysis()

    # Fit the LDA model to the scaled features and target variable
    lda = lda.fit(train_df[featureList], train_df["status_group"])

    # Transform the features to the LDA space
    reduced_df = lda.transform(train_df[featureList])

ValueError: Input contains NaN

In [None]:
print(reduced_df)

[[-1.6348384  -0.11828999]
 [-0.98795118 -0.18637031]
 [-1.22236819 -0.74249202]
 ...
 [-0.14693932 -0.0160649 ]
 [-0.15982245 -0.35400913]
 [ 0.0182034  -0.43452091]]


# Modelling

## 