**Initial Steps**

In [None]:
# Relative path of the raw data file that the next cell reads with pandas
filename = "../Dataset/ad.data"

In [None]:
import pandas as pd

# Load the comma-separated file; header=None means no row is treated as column
# names, so the columns receive integer labels.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="skip"` is its replacement and likewise drops malformed rows.
adData = pd.read_csv(filename, sep=",", header=None, on_bad_lines="skip")
adData.head()

In [None]:
# Separating the dependent and independent variables
# Feature matrix: columns labelled 0..1557 (.loc label slices include both ends).
# An explicit copy is taken because X is modified in place later on; without it
# pandas may raise SettingWithCopyWarning about writing to a slice of adData.
X = adData.loc[:, 0:1557].copy()
print(X.shape)
# Target: the column labelled 1558
Y = adData[1558]
print(Y.shape)


In [None]:
import numpy as np

# Missing values are marked with "?" (presumably padded with spaces in the first
# three columns, since a substring replace used to be applied there - TODO confirm).
# The anchored pattern converts only cells consisting of the marker into NaN, and
# running it on the whole frame covers every column, including the last one
# (label 1557) that per-column ranges ending at 1557 would skip.
# Re-running this cell is safe: numeric columns are left untouched by the regex.
X = X.replace(r"^\s*\?\s*$", np.nan, regex=True).astype(float)

# Impute each NaN with the mean of its own column
X = X.fillna(X.mean())

In [None]:
from sklearn import preprocessing

# Scale the features with a MinMaxScaler (default settings), fitted on all of X,
# and wrap the resulting array in a new DataFrame.
minmaxScaler = preprocessing.MinMaxScaler()
X_tran = pd.DataFrame(
    minmaxScaler.fit_transform(X)
)
X_tran.head()

In [None]:
import numpy as np

# Creating a high dimension data set: the scaled features are repeated twice
# side by side (reps=(1, 2) keeps the rows and doubles the columns).
# `pd.np` was deprecated in pandas 1.0 and removed in 2.0, so NumPy is used directly.
X_hd = pd.DataFrame(np.tile(X_tran, (1, 2)))

print(X_hd.shape)

**Adding noise to the dataset**

In [None]:
# Parameters of the normal distribution the noise is drawn from
mu = 0       # mean
sigma = 0.1  # standard deviation


In [None]:
# Generating samples from the normal distribution.
# The shape comes from X_hd instead of a hard-coded [3279, 3116], so the cell keeps
# working if the number of rows or columns changes, and a seeded generator makes
# the noise (and everything fitted on it) repeatable across kernel restarts.
noise_rng = np.random.RandomState(123)
noise = noise_rng.normal(mu, sigma, X_hd.shape)
noise.shape

In [None]:
# Element-wise sum of the widened data set and the noise matrix
X_new = X_hd.add(noise)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    Y,
    test_size=0.3,
    random_state=123,
)

print('Training set shape', X_train.shape)

print('Test set shape', X_test.shape)

**Backward Elimination Method**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Estimator that RFE refits repeatedly to rank and discard features
backModel = LogisticRegression()
# Reducing dimensionality to 300 features for the backward elimination model.
# The count is passed by keyword: positional use of this argument was deprecated
# in scikit-learn 0.23 and raises an error from 1.0 onwards.
rfe = RFE(backModel, n_features_to_select=300)


In [None]:
import time

# Run the recursive elimination on the training split and report the wall-clock time
t0 = time.time()
rfe.fit(X_train, y_train)  # fit() hands back the same estimator, so no re-binding is needed
t1 = time.time()
print("Backward Elimination time:", round(t1 - t0, 3), "s")

In [None]:
# Reduce both splits to the columns kept by the fitted RFE selector
X_train_tran, X_test_tran = rfe.transform(X_train), rfe.transform(X_test)

print("Training set shape", X_train_tran.shape)
print("Test set shape", X_test_tran.shape)

In [None]:
import time

# Logistic regression trained on the RFE-reduced training features
RfeModel = LogisticRegression()

# Only the fit itself is timed
t0 = time.time()
RfeModel.fit(X_train_tran, y_train)
print("Total training time:", round(time.time() - t0, 3), "s")





In [None]:
# Class predictions for the test rows; `pred` is reused by the cells that follow
pred = RfeModel.predict(X_test_tran)

# Report the model's score on the reduced test set, rounded to two decimals
print(
    'Accuracy of Logistic regression model after backward elimination: {:.2f}'.format(
        RfeModel.score(X_test_tran, y_test)
    )
)



In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against the current predictions
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions
print(classification_report(y_true=y_test, y_pred=pred))

**Forward Selection Method**

In [None]:
from sklearn.feature_selection import SelectKBest

# Selector that keeps the 300 best-scoring features (default scoring function).
# NOTE(review): SelectKBest scores each feature on its own; it is a filter method
# rather than a stepwise forward search - confirm the section title is intended.
feats = SelectKBest(k=300)

In [None]:
 # Fitting the features for training set
import time

# Score the features on the training split only and report the wall-clock time
t0 = time.time()
fit = feats.fit(X_train, y_train)
t1 = time.time()
print("Forward selection fitting time:", round(t1 - t0, 3), "s")

In [None]:
# Apply the fitted selector to both splits so they keep the same columns
features_train, features_test = fit.transform(X_train), fit.transform(X_test)

In [None]:
# Shapes of the splits as they were fed to the selector
print('Train shape before transformation', X_train.shape)
print('Test shape before transformation', X_test.shape)

# Shapes of the reduced splits returned by the selector
print('Train shape after transformation', features_train.shape)
print('Test shape after transformation', features_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
import time

# Time both the construction and the fit of a logistic regression
# on the features kept by the selector
t0 = time.time()
forwardModel = LogisticRegression()
forwardModel.fit(features_train, y_train)
t1 = time.time()

print("Total training time:", round(t1 - t0, 3), "s")

In [None]:
# Class predictions for the test rows; `pred` is reused by the cells that follow
pred = forwardModel.predict(features_test)
print(
    'Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(
        forwardModel.score(features_test, y_test)
    )
)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against the current predictions
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions
print(classification_report(y_true=y_test, y_pred=pred))

**Principal Component Analysis**

In [None]:
from sklearn.decomposition import PCA
import time

t0 = time.time()
# random_state is fixed, matching the FastICA and FactorAnalysis cells, so that a
# randomized SVD solver (if scikit-learn picks one for this shape) gives
# repeatable components from run to run.
pca = PCA(n_components=300, random_state=123)
# Fitting the PCA on the training set only
pca.fit(X_train)
t1 = time.time()
print("PCA fitting time:", round(t1 - t0, 3), "s")

In [None]:
# Project both splits onto the components learned from the training set
X_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)

In [None]:
# Shapes before the projection
print("original shape of Training set:   ", X_train.shape)
print("original shape of Test set:   ", X_test.shape)

# Shapes after the projection
print("Transformed shape of training set:", X_pca.shape)
print("Transformed shape of test set:", X_test_pca.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
import time

# Logistic regression trained on the PCA-projected training set
pcaModel = LogisticRegression()

# Only the fit itself is timed
t0 = time.time()
pcaModel.fit(X_pca, y_train)
t1 = time.time()
print("Total training time:", round(t1 - t0, 3), "s")

In [None]:
# Class predictions for the projected test rows; `pred` is reused by the cells that follow
pred = pcaModel.predict(X_test_pca)
print(
    'Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(
        pcaModel.score(X_test_pca, y_test)
    )
)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against the current predictions
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions
print(classification_report(y_true=y_test, y_pred=pred))

**Independent Component Analysis**

In [None]:
from sklearn.decomposition import FastICA

# FastICA configured for 300 components; random_state is pinned to 123
ICA = FastICA(
    n_components=300,
    random_state=123,
)

In [None]:
import time

# Fit ICA on the training split, transform it in the same call, and time the step
t0 = time.time()
X_ica = ICA.fit_transform(X_train)
t1 = time.time()
print("ICA fitting time:", round(t1 - t0, 3), "s")

In [None]:
# Transforming the test set with the ICA already fitted on the training set
X_test_ica = ICA.transform(X_test)

In [None]:
# Shapes before the ICA transformation
print("original shape of Training set:   ", X_train.shape)
print("original shape of Test set:   ", X_test.shape)

# Shapes after the ICA transformation
print("Transformed shape of training set:", X_ica.shape)
print("Transformed shape of test set:", X_test_ica.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
import time

# Logistic regression trained on the ICA-transformed training set
icaModel = LogisticRegression()

# Only the fit itself is timed
t0 = time.time()
icaModel.fit(X_ica, y_train)
t1 = time.time()
print("Total training time:", round(t1 - t0, 3), "s")

In [None]:
# Class predictions for the transformed test rows; `pred` is reused by the cells that follow
pred = icaModel.predict(X_test_ica)
print(
    'Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(
        icaModel.score(X_test_ica, y_test)
    )
)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against the current predictions
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions
print(classification_report(y_true=y_test, y_pred=pred))

**Factor Analysis**

In [None]:
from sklearn.decomposition import FactorAnalysis

# Factor analysis configured for 30 factors; random_state is pinned to 123
fa = FactorAnalysis(
    n_components=30,
    random_state=123,
)

In [None]:
import time

# Fit the factor model on the training split, transform it in the same call,
# and time the step
t0 = time.time()
X_fac = fa.fit_transform(X_train)
t1 = time.time()
print("Factor analysis fitting time:", round(t1 - t0, 3), "s")

In [None]:
# Transforming the test set with the factor model already fitted on the training set
X_test_fac = fa.transform(X_test)

In [None]:
# Shapes before the factor-analysis transformation
print("original shape of Training set:   ", X_train.shape)
print("original shape of Test set:   ", X_test.shape)

# Shapes after the factor-analysis transformation
print("Transformed shape of training set:", X_fac.shape)
print("Transformed shape of test set:", X_test_fac.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
import time

# Logistic regression trained on the factor scores of the training set
facModel = LogisticRegression()

# Only the fit itself is timed
t0 = time.time()
facModel.fit(X_fac, y_train)
t1 = time.time()
print("Total training time:", round(t1 - t0, 3), "s")

In [None]:
# Class predictions for the transformed test rows; `pred` is reused by the cells that follow
pred = facModel.predict(X_test_fac)
print(
    'Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(
        facModel.score(X_test_fac, y_test)
    )
)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against the current predictions
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions
print(classification_report(y_true=y_test, y_pred=pred))