In [None]:
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the Iris sample dataset that ships with Plotly Express and check its
# dimensions (rows, columns).
iris_df = px.data.iris()
iris_df.shape

(150, 6)

In [None]:
# Preview the first rows: four numeric measurements plus the species label
# and its numeric id.
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1


In [None]:
# Restrict the frame to the four numeric measurements; the species columns
# are labels and must not take part in scaling or PCA.
features = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]
new_iris_df = iris_df.loc[:, features]
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Sanity check: same number of rows as the original frame, feature columns only.
new_iris_df.shape

(150, 4)

In [None]:
# Rescale every feature to the [0, 1] range so that no single measurement
# dominates the principal components purely because of its units.
scaler = MinMaxScaler()
scaled_arr = scaler.fit(new_iris_df).transform(new_iris_df)

In [None]:
# Show only the first few scaled rows; printing the whole array floods the
# notebook output without adding information.
scaled_arr[:5]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

In [None]:
# Fit PCA on the scaled features. n_components=None keeps every component,
# so nothing is discarded at this stage and the explained-variance figures
# below cover the full decomposition.
pca = PCA(n_components=None)
# `components` holds the samples projected onto the principal axes
# (one row per sample, one column per principal component).
components = pca.fit_transform(scaled_arr)

In [None]:
# Show only the first few projected rows; printing the whole array floods the
# notebook output without adding information.
components[:5]

array([[-6.30361249e-01,  1.11556258e-01,  1.69929115e-02,
        -6.89094590e-03],
       [-6.23546310e-01, -1.00313199e-01,  4.71201236e-02,
        -3.23161046e-02],
       [-6.69792802e-01, -4.72200571e-02, -2.16477393e-02,
        -7.18645441e-03],
       [-6.54633051e-01, -9.87912747e-02, -2.52141464e-02,
         2.02850460e-02],
       [-6.48263265e-01,  1.37557689e-01, -1.68125515e-02,
         1.23015032e-02],
       [-5.34057000e-01,  2.93223042e-01, -2.67364715e-02,
         1.01744489e-03],
       [-6.56395887e-01,  1.49219390e-02, -9.36980789e-02,
         1.53319337e-02],
       [-6.25643957e-01,  6.10753191e-02,  1.22695676e-02,
         7.32530644e-03],
       [-6.76525845e-01, -1.96482315e-01, -3.81316446e-02,
         8.95180650e-03],
       [-6.46137172e-01, -6.32422300e-02,  5.97536915e-02,
         9.09989735e-03],
       [-5.96654863e-01,  2.20941913e-01,  4.97532816e-02,
        -5.63443311e-03],
       [-6.38828681e-01,  3.65958117e-02, -2.62592393e-02,
      

In [None]:
# Tabular preview of the first five projected samples.
pd.DataFrame(components[:5])

Unnamed: 0,0,1,2,3
0,-0.630361,0.111556,0.016993,-0.006891
1,-0.623546,-0.100313,0.04712,-0.032316
2,-0.669793,-0.04722,-0.021648,-0.007186
3,-0.654633,-0.098791,-0.025214,0.020285
4,-0.648263,0.137558,-0.016813,0.012302


In [None]:
# Fraction of the total variance captured by each principal component
# (ordered from most to least informative).
pca.explained_variance_ratio_

array([0.84141901, 0.11732474, 0.03490564, 0.00635061])

In [None]:
# Absolute variance along each principal component, in the units of the
# scaled data (the ratios above are these values normalised by their sum).
pca.explained_variance_

array([0.23231168, 0.03239279, 0.00963728, 0.00175337])

In [None]:
# Axis labels of the form "PC k (xx.x%)", keyed by the stringified column
# position of `components`, so each axis shows how much variance it explains.
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

# Pairwise scatter plots of the principal components, coloured by species.
# The number of plotted dimensions is taken from `labels` (one entry per
# fitted component) rather than a hard-coded 4, so the figure stays correct
# if PCA is re-run with a different n_components.
fig = px.scatter_matrix(components,
                        labels=labels,
                        dimensions=range(len(labels)),
                        color=iris_df["species"])
# The diagonal would plot each component against itself; hide it.
fig.update_traces(diagonal_visible=False)
fig.show()

## Cumulative Explained Variance

In [None]:
# Per-component variance ratios and their running total; the running total
# tells how many components are needed to reach a given variance threshold.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

In [None]:
# Running total of explained variance; the last entry is 1.0 because all
# components were kept.
cumulative_variance

array([0.84141901, 0.95874375, 0.99364939, 1.        ])

In [None]:
# Tabulate the cumulative explained variance, one row per component.
# The row labels are generated from the actual number of components instead
# of a hard-coded 4, so this cell keeps working if PCA is re-fitted with a
# different n_components.
# NOTE: the column is labelled 'Explained Variance' but holds the *cumulative*
# share (row k = variance explained by the first k components together).
var_df = pd.DataFrame(cumulative_variance,
                      index=[f'PCA-{i}'
                             for i in range(1, len(cumulative_variance) + 1)],
                      columns=['Explained Variance'])

In [None]:
# Cumulative share of variance explained by the first k components.
var_df

Unnamed: 0,Explained Variance
PCA-1,0.841419
PCA-2,0.958744
PCA-3,0.993649
PCA-4,1.0


## Original DataFrame

In [None]:
# Original measurements again, for side-by-side comparison with the
# PCA-transformed frame built below.
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1


## PCA-Transformed DataFrame

In [None]:
# Build a labelled frame of the principal-component scores and attach the
# species label to each sample.
# - Column names are generated from the actual number of components rather
#   than a hard-coded 4, so the cell survives a change of n_components.
# - The scores are indexed with iris_df.index so the column-wise concat aligns
#   rows explicitly instead of relying on both frames having a default index.
pca_df = pd.DataFrame(components,
                      columns=[f'PC-{i}'
                               for i in range(1, components.shape[1] + 1)],
                      index=iris_df.index)
pca_df = pd.concat([pca_df, iris_df['species']], axis=1)

In [None]:
# Preview the principal-component scores alongside their species label.
pca_df.head()

Unnamed: 0,PC-1,PC-2,PC-3,PC-4,species
0,-0.630361,0.111556,0.016993,-0.006891,setosa
1,-0.623546,-0.100313,0.04712,-0.032316,setosa
2,-0.669793,-0.04722,-0.021648,-0.007186,setosa
3,-0.654633,-0.098791,-0.025214,0.020285,setosa
4,-0.648263,0.137558,-0.016813,0.012302,setosa
