In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np

df = pd.read_csv('./dataset/c/dataset_2.csv')
X = df.drop(columns=['class'])
Y = df['class']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)


# 1. Random Forest Model without PCA

In [7]:

random_forest = RandomForestClassifier(max_depth=50, random_state=0)

random_forest.fit(x_train, y_train)

pred_test = random_forest.predict(x_test)

print ('Current accuracy of Random Forest is: %3f' % metrics.accuracy_score(y_test, pred_test))

Current accuracy of Random Forest is: 0.933333


# 2. Random Forest With PCA

In [2]:
import numpy as np

X_std = StandardScaler().fit_transform(x_train)
# Calculating Eigenvectors and eigenvalues of Cov matirx
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
# Create a list of (eigenvalue, eigenvector) tuples
eig_pairs = [ (np.abs(eig_vals[i]),eig_vecs[:,i]) for i in range(len(eig_vals))]

# Sort the eigenvalue, eigenvector pair from high to low
eig_pairs.sort(key = lambda x: x[0], reverse= True)

# Calculation of Explained Variance from the eigenvalues
tot = sum(eig_vals)
var_exp = [(i/tot)*100 for i in sorted(eig_vals, reverse=True)] # Individual explained variance
cum_var_exp = np.cumsum(var_exp) # Cumulative explained variance
n_components = [ n for n,i in enumerate(cum_var_exp) if i>90 ][0]

n_components

206

In [3]:
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder

new_df = pd.DataFrame(scale(x_train), columns=x_train.columns)

pca = PCA(n_components = n_components)


post_x_train = pca.fit_transform(x_train)
post_x_test = pca.transform(x_test)

random_forest = RandomForestClassifier(max_depth=15, random_state=0)

random_forest.fit(post_x_train, y_train)
y_pre = random_forest.predict(post_x_test)

print ('Current accuracy of Random Forest is: %3f' % metrics.accuracy_score(y_test, y_pre))


Current accuracy of Random Forest is: 0.913333


In [95]:
from sklearn import datasets

iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# normalize data
from sklearn import preprocessing
data_scaled = pd.DataFrame(preprocessing.scale(df),columns = df.columns) 
data_scaled

# # PCA
# pca = PCA(n_components=2)
# pca.fit_transform(data_scaled)

# post_pca_array = pca.fit_transform(data_scaled)

# print (data_scaled.shape)

# print (post_pca_array.shape)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832
