## Feature Selection

### Import all necessary modules into our project.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sb

### Read the Wine dataset and view the data.

In [None]:
# Load the Wine dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv(filepath_or_buffer='wine.csv')

# A bare trailing expression makes the notebook render the frame.
df

### Extract and display all features except "Cultivar".

In [None]:
# Keep every column except the target. The target is removed by label rather
# than by position, so the result does not depend on "Cultivar" being the
# first column of the CSV (and mirrors the label-based target extraction).
df_features = df.drop(columns=['Cultivar'])

df_features

### Extract and display the label/target "Cultivar".

In [None]:
# Indexing with a one-element list keeps the result a single-column
# DataFrame (rather than a Series) holding the "Cultivar" label.
df_target = df[['Cultivar']]

df_target

### Use Seaborn's pairplot() function to show correlation between features, except "Cultivar".

In [None]:
# Grid of pairwise plots over the feature columns only (target excluded).
sb.pairplot(data=df_features)

### Display the Pearson Correlation Matrix for the entire dataset.

In [None]:
# Pairwise Pearson correlation between all columns of the dataset,
# including the "Cultivar" target.
corr_mat = df.corr(method='pearson')

corr_mat

Select the best features in the Wine dataset.

1. Look for candidate-features that have a Pearson correlation coefficient less than -0.5 or more than 0.5 with respect to "Cultivar" (our target).

2. For each candidate-feature, extract the peers that have a coefficient of more than 0.6 with it. Then, among that candidate-feature and its highly-correlated peers, select the one that has the highest coefficient value with respect to "Cultivar". Discard the rest of the unselected features and do not consider them in future iterations.

3. Iterate until all candidate-features (that have not been discarded in earlier iterations) have been processed.

4. The selected ones in each iteration are the "best features".

### Perform Step 1 below.

In [None]:
# Correlation of every feature with the target; the target's own entry
# (its self-correlation) is removed.
series = corr_mat['Cultivar'].drop(labels='Cultivar')

# Step 1: keep features whose coefficient lies outside [-0.5, 0.5],
# i.e. whose absolute correlation with the target exceeds 0.5.
strong_mask = series.abs() > 0.5
candidates_series = series[strong_mask]

candidates = candidates_series.index
candidates


### Perform Steps 2 and 3 below.

In [None]:
# Steps 2 and 3: walk through the candidates and, for each group of mutually
# highly-correlated candidates, keep only the one most correlated with the
# target.
PEER_CORR_THRESHOLD = 0.6  # a peer counts as "highly correlated" above this value

skip = []      # features discarded in favour of a better peer
selected = []  # the "best features", in the order they were chosen

for x in candidates:
    # A feature already chosen or discarded needs no iteration of its own.
    if x in skip or x in selected:
        continue

    # Coefficients of all candidates w.r.t. feature x.
    peers = corr_mat.loc[candidates, x]

    # Keep the highly-correlated peers, but leave out features discarded in
    # earlier iterations so they can never be selected later on. x itself
    # always remains: its self-correlation is ~1.0 and it is not in `skip`.
    peers = peers[(peers > PEER_CORR_THRESHOLD) & ~peers.index.isin(skip)]

    # Among x and its peers, pick the feature with the strongest (absolute)
    # correlation w.r.t. the target. When x has no other peers this is x.
    series_wrt_target = corr_mat.loc[peers.index, 'Cultivar']
    most_corr = series_wrt_target.abs().idxmax()
    if most_corr not in selected:  # an already-selected peer may win again
        selected.append(most_corr)

    # Place the non-selected peers into the "skip" basket. Features that were
    # selected earlier stay selected, keeping `skip` and `selected` disjoint.
    skip.extend(
        i for i in series_wrt_target.index
        if i != most_corr and i not in selected
    )

selected