In [None]:
from pathlib import Path

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# 1 Import Data

In [None]:
# Load the fingerprint CSV into a DataFrame and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="../data/results/all_fingerprints.csv")
dataset.head(n=5)

# 2 Explore Data

In [None]:
# Print a structural summary of the loaded table: column names, dtypes,
# non-null counts and memory usage.
dataset.info()

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) per column.
dataset.describe()

In [None]:
# Print an .info() summary for the rows of each class label (0, 1, 2), with a
# blank line between consecutive summaries.
# NOTE(review): the original cell annotated the three calls with 2322, 78 and
# 5396 — presumably the per-class row counts from an earlier run; confirm
# against the current CSV.
for position, label in enumerate((0, 1, 2)):
    if position:
        print()  # separator between summaries, none before the first
    dataset[dataset["class"] == label].info()

# 3 Data Cleansing

In [None]:
# Remove the "smiles" column before modelling.
# NOTE(review): presumably this column holds the raw SMILES strings, which are
# not numeric features — confirm against the CSV schema.
#
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" is passed, so re-running this cell is a no-op rather than a
# KeyError once the column is already gone.
dataset = dataset.drop(columns=["smiles"], errors="ignore")
dataset.head(10)

In [None]:
# Preview the last 10 rows of the table to check the end of the file as well.
dataset.tail(10)

# 4 Data Splitting

In [None]:
# Separate the label column from the features.
y = dataset["class"]                 # target: the "class" column
X = dataset.drop(columns=["class"])  # features: every remaining column

In [None]:
# Display the feature matrix (all columns of `dataset` except "class").
X

In [None]:
# Display the target Series (the "class" column of `dataset`).
y

# 5 Standardize Features

In [None]:
# Fit a StandardScaler on the feature matrix, then apply it to those same
# features; `scaler` is kept so the fitted statistics remain available.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

# 6 PCA

In [None]:
# A fractional n_components asks PCA to keep the smallest number of leading
# components whose cumulative explained variance exceeds that fraction, so the
# number of output columns is decided during fitting.
pca = PCA(n_components=0.9)  # retain ~90% of the variance in X_std
X_pca = pca.fit_transform(X_std)

In [None]:
# Display the array returned by pca.fit_transform(X_std).
X_pca

In [None]:
# (n_samples, n_retained_components): the second entry is how many components
# PCA kept to satisfy n_components=0.9.
X_pca.shape

In [None]:
# Wrap the PCA output array in a DataFrame; no column names are supplied, so
# the columns get pandas' default integer labels.
X_pca_df = pd.DataFrame(data=X_pca)

In [None]:
# Display the PCA output as a DataFrame.
X_pca_df

# 7 Concatenate with Target

In [None]:
# Attach the target labels to the PCA components.
#
# The labels are assigned positionally (y.to_numpy()) rather than as a Series:
# assigning a Series aligns on index labels, so if `dataset` ever carried a
# non-default index (e.g. after filtering rows) the labels would be silently
# misassigned or turned into NaN. X_pca_df was built from an array derived
# row-for-row from X, so position is the correct key; a length mismatch raises
# ValueError instead of passing unnoticed.
X_pca_df["class"] = y.to_numpy()
pca_df = X_pca_df  # same object as X_pca_df, exposed under the name used below
pca_df.head(10)

In [None]:
# Print an .info() summary of the PCA table for each class label (0, 1, 2),
# with a blank line between consecutive summaries.
# NOTE(review): the original cell annotated the three calls with 2322, 78 and
# 5396 — presumably the expected per-class row counts; confirm they still match.
for position, label in enumerate((0, 1, 2)):
    if position:
        print()  # separator between summaries, none before the first
    pca_df[pca_df["class"] == label].info()

# 8 Save to CSV

In [None]:
# Write the PCA components plus the "class" column to CSV without the index.
# to_csv does not create missing directories, so the parent folder is created
# first; exist_ok=True makes this a no-op when it is already there.
output_path = Path("../data/pca/pca_fingerprint_results.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
pca_df.to_csv(output_path, index=False)