### Feature engineering - PCA
PCA with sklearn on the auto-mpg and Iris datasets

***
#### Environment
`conda activate sklearn-env`

***
#### Goals
- Run PCA
- Observe explained variance
- Observe the scatter plot of the PCA features

***
#### References
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html


#### Basic Python imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris

# Compact array printing: three decimals, and no scientific notation
# for small values.
np.set_printoptions(suppress=True, precision=3)

#### Load the dataset from the CSV file hosted on the UCI website

http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data  
If the URL does not work the dataset can be loaded from the data folder `./data/auto-mpg.data`.

In [None]:
# Choose which example dataset the rest of the notebook works on:
# True -> auto-mpg (label column 'MPG'), False -> Iris (label column 'target').
USE_AUTO_MPG = True

label = ''
dataset = None

if USE_AUTO_MPG:
    label = 'MPG'

    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
    local_path = './data/auto-mpg.data'
    column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                    'Acceleration', 'Model Year', 'Origin']

    # The file is space separated with no header row; '?' marks missing
    # values and anything after a tab character is skipped as a comment.
    read_options = dict(names=column_names, na_values='?', comment='\t',
                        sep=' ', skipinitialspace=True)

    try:
        raw_dataset = pd.read_csv(url, **read_options)
    except OSError:
        # Network failures (urllib's URLError/HTTPError, timeouts, refused
        # connections) are OSError subclasses: fall back to the local copy.
        raw_dataset = pd.read_csv(local_path, **read_options)

    # Keep the untouched download around; work on a copy.
    dataset = raw_dataset.copy()
else:
    label = 'target'
    data = load_iris(as_frame=True)
    dataset = data.frame

# Preview a few rows. This must be the last expression of the cell to be
# rendered; inside the if/else branches it would be silently discarded.
dataset.sample(5, random_state=0)

### Dataset split
- row-wise into train and test datasets
- column-wise into features and labels

In [None]:
# Discard rows with missing values and renumber the remaining rows 0..n-1.
dataset = dataset.dropna().reset_index(drop=True)

# Row-wise split: a reproducible 80 % sample for training, the rest for testing.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

# Column-wise split: pop the label column out of a copy of each subset.
# The training labels get a fresh 0..n-1 index so that they line up
# positionally with frames built later from transformed arrays.
train_features = train_dataset.copy()
train_labels = train_features.pop(label).reset_index(drop=True)

test_features = test_dataset.copy()
test_labels = test_features.pop(label)

#### Standardize data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features and standardize them in one step;
# the fitted scaler stays available under its own name.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(train_features)

#### PCA

In [None]:
from sklearn.decomposition import PCA

# PCA with default arguments, fitted on the standardized training features.
pca_transformer = PCA()
pca_result = pca_transformer.fit_transform(scaled_features)

# Map each component's column key ('0', '1', ...) to a readable name
# ('pca 1', 'pca 2', ...). Only the keys are used as column names below.
labels = {str(i): f"pca {i+1}" for i in range(pca_result.shape[1])}

# Pass the keys explicitly instead of relying on dict iteration order
# being picked up implicitly by the DataFrame constructor.
pca_df = pd.DataFrame(data=pca_result, columns=list(labels))

# Append the training labels; both objects carry a 0..n-1 index, so the
# concatenation aligns row by row.
pca_df = pd.concat([pca_df, train_labels], axis=1)

pca_df.sample(10)

#### Explain and visualize output

In [None]:
# Fraction of the total variance captured by each principal component.
variance_ratio = pca_transformer.explained_variance_ratio_
print('Explained variance ratio:', variance_ratio)

In [None]:
# Pairwise correlations before and after the PCA transformation.
# NOTE(review): corr_orig is computed on the full dataset, while pca_df
# holds only the training rows -- confirm this comparison is intended.
corr_orig = dataset.corr()
corr_pca = pca_df.corr()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,4))

# One heatmap per correlation matrix. The upper triangle (including the
# diagonal) is masked because it only mirrors the lower triangle.
for ax, corr, title in ((ax1, corr_pca, 'PCA Features'),
                        (ax2, corr_orig, 'Original Features')):
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, annot=True, fmt='.2f', mask=mask, cmap="YlGnBu",
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values, ax=ax)
    ax.set_title(title)

plt.show()

#### Plot "new" data

In [None]:
# Scatter of the first two principal components, coloured by the label.
fig, ax = plt.subplots()
points = ax.scatter(pca_df['0'], pca_df['1'], c=pca_df[label])
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.set_title(f'{label}')
# Without a colorbar the colour encoding of the label cannot be read.
fig.colorbar(points, ax=ax, label=label)
plt.show()