In [121]:
import numpy as np
import pandas as pd


In [122]:
mu_vec1 = np.array([0,0,0])
cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vec1,cov_mat1,30)
df = pd.DataFrame(class1_sample,columns = ['feature1','feature2','feature3'])
df['target'] = 1
mu_vec2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class2_sample = np.random.multivariate_normal(mu_vec2,cov_mat2,30)
df1 = pd.DataFrame(class2_sample,columns = ['feature1','feature2','feature3'])
df1['target'] = 0
df = pd.concat([df,df1],ignore_index=True)
df = df.sample(frac=1)  # sample-select a random subset of data, frac1 = whole dataset
df.head()

Unnamed: 0,feature1,feature2,feature3,target
52,1.22147,1.601953,2.23224,0
58,1.686502,-0.850042,-0.201931,0
54,0.263845,0.236626,-0.371323,0
3,-0.448639,1.849688,-1.53965,1
6,-1.843051,1.920818,-0.571038,1


In [123]:
import plotly.express as px
fig = px.scatter_3d(df, x = df['feature1'], y = df['feature2'],
                    z = df['feature3'], color = df['target'].astype('str'))
fig.update_traces(marker=dict(size=12,line=dict(width=2,color = 'DarkSlateGrey')),
                  selector = dict(mode='markers'))
fig.write_html("scatter_3d_plot.html")  # saved

# PCA

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [99]:
# Mean centering
df.iloc[:,0:3] = scaler.fit_transform(df.iloc[:,0:3])
print(df.iloc[:,2].sum()) # this sum is not zero because of 
# rounding in the process of making std dev 1 

2.0816681711721685e-15


In [108]:
# Finding the covar matrix
cov_matrix = np.cov([df.iloc[:,0],df.iloc[:,1],df.iloc[:,2]])
print(cov_matrix)

[[1.01694915 0.52643911 0.33939696]
 [0.52643911 1.01694915 0.28346683]
 [0.33939696 0.28346683 1.01694915]]


In [109]:
eigen_values, eigen_vectors = np.linalg.eig(cov_matrix)

In [110]:
eigen_values

array([1.79355401, 0.48600889, 0.77128456])

In [111]:
eigen_vectors

array([[-0.62537545, -0.73506266, -0.26189393],
       [-0.60417871,  0.66852332, -0.43364117],
       [-0.49383563,  0.1129578 ,  0.86218728]])

In [112]:
minima = min(eigen_values)
idx_minima = np.argmin(eigen_values)

In [113]:
pc = np.delete(eigen_vectors, idx_minima , axis=0)

In [114]:
pc

array([[-0.62537545, -0.73506266, -0.26189393],
       [-0.49383563,  0.1129578 ,  0.86218728]])

In [115]:
transformed_df = np.dot(df.iloc[:,0:3],pc.T)
# 40,3 - 3,2  = 40,2

In [117]:
new_df = pd.DataFrame(transformed_df,columns=['PC1','PC2'])
new_df['target']=df['target'].values
new_df.head()

Unnamed: 0,PC1,PC2,target
0,-0.465125,0.886519,0
1,0.893776,-1.787454,1
2,0.624183,-0.17696,1
3,0.408683,-0.193201,1
4,0.292514,0.541399,1


In [120]:
new_df['target'] = new_df['target'].astype('str')
fig = px.scatter(new_df, x = new_df['PC1'], y = new_df['PC2'], color = new_df['target'],color_discrete_sequence=px.colors.qualitative.G10)
fig.update_traces(marker=dict(size=12,line=dict(width=2,color = 'DarkSlateGrey')),
                  selector = dict(mode='markers'))
fig.write_html("scatter_2d_plot.html")  # saved