In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge, Lasso
import seaborn as sns

<h2> Feature Selection on Autism Biolog Dataset</h2>

<h3>Loading the Data</h3>


1. Loaded the dataset 
2. Drop the Columns which aren't required


In [4]:
 # read the csv file from the link provided
 # drop Negative Controls from the dataset, since Negative Controls are not needed
 # drop the column that is not required from the dataset(CMS#)

df = pd.read_csv("C:/AutisticDisorderProject/TestTrain5050data.csv")
df.drop(list(df.filter(regex='Negative Control')), axis=1, inplace=True)


asd_data = df.drop(columns='CMS#',axis=1)
# drop the 'CMS' column

asd_data.head()

Unnamed: 0,a-Cyclodextrin,Dextrin,Glycogen,Maltitol,Maltotriose,Maltose,D-Trehalose,D-Cellobiose,Gentiobiose,D-Glucose-6-Phosphate,...,Adenosine.4,Adenosine.5,Adenosine.6,Gly-His-Lys acetate salt,Gly-His-Lys acetate salt.1,Gly-His-Lys acetate salt.2,Gly-His-Lys acetate salt.3,Gly-His-Lys acetate salt.4,Gly-His-Lys acetate salt.5,Diagnosis
0,0.262591,0.997133,0.986968,0.277475,1.074956,1.089453,0.405887,0.31138,0.353178,0.455127,...,1.102738,1.095119,1.080465,0.997059,1.088651,0.986823,0.97434,0.951473,0.869257,1
1,0.239292,0.911528,1.013144,0.300022,1.022068,1.135745,0.392186,0.324281,0.310849,0.463682,...,1.181,1.1223,1.2356,0.9638,0.9439,0.9193,0.9068,0.9336,0.8862,1
2,0.312369,1.408409,1.419281,0.379099,0.362415,0.169417,0.256599,0.111319,0.16791,0.61743,...,1.3228,0.7538,0.6993,1.188,1.2108,1.1219,1.0656,1.0468,0.711,1
3,0.188019,0.592031,0.413213,0.170653,0.253267,0.182602,0.158959,0.146343,0.134804,0.329473,...,0.6153,0.669,0.5721,0.641,0.6339,0.6221,0.6145,0.5767,0.5462,1
4,0.253399,1.352917,0.949392,0.251156,0.444184,0.218518,0.137328,0.17454,0.155979,0.680785,...,1.401038,1.367023,1.305691,1.234141,1.515057,1.158642,1.21869,1.18306,1.089372,1


In [5]:
# strip the whitespace in the column names
asd_data.columns = asd_data.columns.str.strip()
asd_data.shape

(100, 735)

<h3>Scaling the Data</h3>

In [6]:
# hint: Use MinMaxScaler for scaling
def scale_data(data):
    # store all the columns
    cols = data.columns
    # create a scaler
    scaler = MinMaxScaler(feature_range=(0, 1))
    
    # fit and transform the data
    scaled_data = scaler.fit_transform(data)
    
    # store the transformed data in a dataframe and return it.
    transformed_data = pd.DataFrame(scaled_data)
    transformed_data.columns = cols
    return transformed_data


In [7]:
transformed_data = scale_data(asd_data)
transformed_data.head()

Unnamed: 0,a-Cyclodextrin,Dextrin,Glycogen,Maltitol,Maltotriose,Maltose,D-Trehalose,D-Cellobiose,Gentiobiose,D-Glucose-6-Phosphate,...,Adenosine.4,Adenosine.5,Adenosine.6,Gly-His-Lys acetate salt,Gly-His-Lys acetate salt.1,Gly-His-Lys acetate salt.2,Gly-His-Lys acetate salt.3,Gly-His-Lys acetate salt.4,Gly-His-Lys acetate salt.5,Diagnosis
0,0.423145,0.611943,0.673013,0.655346,0.79826,0.741404,0.599716,0.665656,0.754709,0.489498,...,0.553747,0.529862,0.554927,0.580709,0.632509,0.61602,0.553273,0.584858,0.574413,1.0
1,0.354959,0.554366,0.692812,0.725226,0.754946,0.775075,0.57532,0.703234,0.652358,0.503952,...,0.597941,0.544887,0.644687,0.558909,0.537492,0.568644,0.508922,0.573468,0.58587,1.0
2,0.568819,0.888562,1.0,0.970318,0.214713,0.072198,0.333904,0.082914,0.306733,0.763727,...,0.678014,0.341184,0.334388,0.705864,0.71269,0.710792,0.6132,0.64561,0.467408,1.0
3,0.204909,0.339476,0.239045,0.324258,0.125324,0.081788,0.160053,0.184931,0.226685,0.277191,...,0.278496,0.294307,0.260791,0.347324,0.334002,0.360123,0.316979,0.346016,0.355979,1.0
4,0.396242,0.851239,0.644593,0.573772,0.281679,0.107912,0.121537,0.267066,0.277886,0.870772,...,0.722194,0.680168,0.685241,0.736108,0.912411,0.736572,0.713729,0.732448,0.723244,1.0


5. Perform train_test_split

In [8]:
# select all rows of all columns except the column 'Diagnosis'
X = transformed_data.loc[:, :'Gly-His-Lys acetate salt.5']

# the column we are going to classify
y = transformed_data['Diagnosis']

# do the train test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=50)

# convert y_train and y_test into dataframes
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

y_train = y_train.astype('int')

Writing a function which returns the list of k-Best features where k being the number of features required

In [9]:
#use chi2
def get_k_best_features(X_train, y_train, k):
    
    # use SelectKBest class to get k best features
    X_best_features = SelectKBest(score_func=chi2, k=k)
    z = X_best_features.fit_transform(X_train, y_train) 
    
    # will return boolean indices
    best_features_indices = X_best_features.get_support()
    
    best_features = []
    
    data_columns = X_train.columns
    
    for index, bool_value in enumerate(best_features_indices):
        # append the best features to the best_features list
        if(bool_value):
          best_features.append(data_columns[index])
    
    return best_features



Print the results

In [10]:
features_one = get_k_best_features(X_train, y_train, 12)        
features_two = get_k_best_features(X_train, y_train, 20)

In [11]:
features_one

['NaCl.2',
 'Potassium Chloride.1',
 'Potassium Chloride.2',
 'Calcium Choride',
 'Iodine',
 'Iodine.1',
 'Iodine.2',
 'Sodium Molybdate.1',
 'Sodium Molybdate.2',
 'Potassium Chromate.1',
 'Potassium Chromate.2',
 'Sodium Nitrite.3']

In [12]:
features_two

['D-Glucose-6-Phosphate',
 'NaCl.2',
 'Potassium Chloride.1',
 'Potassium Chloride.2',
 'Calcium Choride',
 'Manganese Chloride.3',
 'Cobalt Chloride.3',
 'Iodine',
 'Iodine.1',
 'Iodine.2',
 'Iodine.3',
 'Sodium Molybdate',
 'Sodium Molybdate.1',
 'Sodium Molybdate.2',
 'Potassium Chromate.1',
 'Potassium Chromate.2',
 'Potassium Chromate.3',
 'Sodium Nitrite.1',
 'Sodium Nitrite.3',
 'Glucagon.3']

Feature selection is the process of identifying and selecting a set of relevant features or variables that are most useful for building an accurate prediction model. 

The code involves cleaning the data by dropping columns, taking care of whitespaces in names, and preprocessing it by scaling all features to a range, to prevent bias and normalize them. 

The data is then split into training data and testing data, and SelectKBest class is used to return *k* best features.



### perform Ridge regularization 

In [13]:
from sklearn.metrics import mean_squared_error
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)
y_pred_ridge = clf.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print("Ridge Regression MSE: ", mse_ridge)



Ridge Regression MSE:  0.24085228595702884


### perform Lasso regularization

In [14]:
clf = Lasso(alpha=0.1)
clf.fit(X_train, y_train)
y_pred_lasso = clf.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print("Lasso Regression MSE: ", mse_lasso)


Lasso Regression MSE:  0.25115646258503393


As is evident here, Ridge regression works better in this case. Both Lasso and Ridge models add a penalty term to the regression equation to prevent overfitting and thus make for better prediction.

Ridge model applies L2 regularization, i.e., it adds a factor of the sum of squares of coefficients in the optimization objective.

LASSO stands for Least Absolute Shrinkage and Selection Operator, and performs L1 regularization, adding a factor of the sum of the absolute value of coefficients in the optimization objective.

Ridge regression is better suited when all features in the model have predictive power, while Lasso is a good option when some features are less important or irrelevant, as it can help identify and remove them from the model.

In [15]:
from numpy import mean
from numpy import std
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
model = LogisticRegression()
# evaluate model
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.667 (0.081)


In [33]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=5, random_state=None)
result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
print(result.mean())


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report, cohen_kappa_score
from sklearn import metrics 

# Baseline Random forest based Model
rfc = RandomForestClassifier(n_estimators=200)
    
kfold = KFold(n_splits=5, random_state=None)
result2 = cross_val_score(rfc, X_train, y_train, cv=kfold, scoring='accuracy')
print(result2.mean())

0.4999999999999999


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.6142857142857142


In [32]:
# evaluate pca with logistic regression algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# define dataset
#X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10, random_state=7)
# define the pipeline
steps = [('pca', PCA(n_components=10)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
# evaluate model
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.699 (0.098)


In [36]:
#!pip install lazypredict
from lazypredict.Supervised import LazyClassifier, LazyRegressor
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████| 29/29 [00:01<00:00, 18.14it/s]


In [37]:
print(models)

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LogisticRegression                 0.73               0.72     0.72      0.73   
LinearDiscriminantAnalysis         0.70               0.69     0.69      0.69   
LinearSVC                          0.70               0.69     0.69      0.69   
RidgeClassifierCV                  0.70               0.69     0.69      0.69   
PassiveAggressiveClassifier        0.70               0.69     0.69      0.69   
CalibratedClassifierCV             0.67               0.66     0.66      0.66   
RidgeClassifier                    0.67               0.65     0.65      0.65   
SGDClassifier                      0.63               0.63     0.63      0.63   
NearestCentroid                    0.63               0.63     0.63      0.63   
AdaBoostClassifier                 0.63               0.63     0.63      0.63   
LGBMClassifier              

In [38]:
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

xgb = XGBClassifier()
logreg= LogisticRegressionCV(solver='lbfgs', cv=10)
knn = KNeighborsClassifier(5)
svcl = SVC()
adb = AdaBoostClassifier()
dt = DecisionTreeClassifier(max_depth=5)
rf = RandomForestClassifier()
lda = LinearDiscriminantAnalysis()
gnb = GaussianNB()

# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegressionCV(solver='lbfgs', max_iter=5000, cv=10)))
models.append(('XGB', XGBClassifier()))
models.append(('SVM', SVC(gamma='auto')))
results = []
names = []
scoring = 'f1'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=None)
    cv_results = model_selection.cross_val_score(model, x_train_scaled, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.628571 (0.204041)
XGB: 0.542857 (0.166599)
SVM: 0.314286 (0.166599)
