In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Load the dataset from the working directory and preview its first rows.
diabetes_df = pd.read_csv('diabetes.csv')
print(diabetes_df.head())

# Missing-value filter. ``thresh`` is the minimum number of non-NA entries a
# column needs in order to be kept, so this keeps every column that is at
# least 30% populated.
# NOTE(review): the name ``missing_thresh`` reads like a cap on *missing*
# values (i.e. "drop columns with more than 30% missing"), which would need
# 0.7 * len(...) instead -- confirm the intended cut-off.
missing_thresh = 0.3 * len(diabetes_df)
diabetes_reduced = diabetes_df.dropna(axis=1, thresh=missing_thresh)

# Baseline: logistic regression on all surviving columns, evaluated on a
# fixed 70/30 split (random_state pinned so the split is repeatable).
baseline_features = diabetes_reduced.drop(columns='Outcome')
baseline_target = diabetes_reduced['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    baseline_features,
    baseline_target,
    test_size=0.3,
    random_state=42,
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Correlation filter: drop one column out of every pair whose absolute
# pairwise correlation exceeds 0.8, then refit and score the shared ``model``.
corr_matrix = diabetes_df.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# is inspected once and a column is never compared with itself.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_tri = corr_matrix.where(upper_mask)

# 'Outcome' is part of the correlation matrix, so it must be excluded from the
# drop candidates: a feature that correlates strongly with the target would
# otherwise remove the target itself and the split below would raise KeyError.
to_drop = [
    column
    for column in upper_tri.columns
    if column != 'Outcome' and (upper_tri[column] > 0.8).any()
]
diabetes_reduced_corr = diabetes_df.drop(columns=to_drop)

X_train_corr, X_test_corr, y_train_corr, y_test_corr = train_test_split(
    diabetes_reduced_corr.drop('Outcome', axis=1),
    diabetes_reduced_corr['Outcome'],
    test_size=0.3,
    random_state=42,
)
model.fit(X_train_corr, y_train_corr)
y_pred_corr = model.predict(X_test_corr)
# Low-variance filter: remove every feature whose variance does not exceed
# 0.01, then refit and score the shared ``model`` on what remains.
# fit_transform returns a plain array, so column names are not carried along.
var_thresh = VarianceThreshold(threshold=0.01)
diabetes_low_var = var_thresh.fit_transform(diabetes_df.drop(columns='Outcome'))

X_train_var, X_test_var, y_train_var, y_test_var = train_test_split(
    diabetes_low_var,
    diabetes_df['Outcome'],
    test_size=0.3,
    random_state=42,
)

model.fit(X_train_var, y_train_var)
y_pred_var = model.predict(X_test_var)
# Full feature matrix / target, kept available for any later cells.
X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

# Forward feature selection: greedily pick 3 features using a logistic
# regression as the scoring estimator. max_iter matches the evaluation model
# so the inner fits get the same iteration budget.
sfs = SequentialFeatureSelector(
    LogisticRegression(max_iter=1000),
    n_features_to_select=3,
)

# Fit the selector on the training split only. Fitting on the full data would
# let the held-out rows influence which features are chosen (leakage), and the
# selector must be fitted on the same columns it is later asked to transform.
sfs.fit(X_train, y_train)
X_train_fs = sfs.transform(X_train)
X_test_fs = sfs.transform(X_test)

model.fit(X_train_fs, y_train)
y_pred_fs = model.predict(X_test_fs)
# Recursive feature elimination: repeatedly drop the feature the decision tree
# ranks lowest until 5 remain. The tree is seeded so that the selected set is
# the same on every run.
tree_clf = DecisionTreeClassifier(random_state=42)
rfe = RFE(tree_clf, n_features_to_select=5)

# Fit on the training split only: this keeps the held-out rows out of the
# selection step and guarantees the selector saw exactly the columns that it
# transforms below.
rfe.fit(X_train, y_train)

# Read the names from the frame that was actually fitted, so the boolean
# support mask lines up with the right columns.
print("Selected features:", X_train.columns[rfe.get_support()])

X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)
model.fit(X_train_rfe, y_train)
y_pred_rfe = model.predict(X_test_rfe)
# Random-forest importance: rank features by the forest's impurity-based
# importances and keep the five highest. The forest is seeded so the ranking
# (and therefore the reported accuracy) is reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
importances = rf.feature_importances_

# argsort is ascending, so the last five positions are the five largest
# importances (least to most important).
indices = np.argsort(importances)[-5:]

# The importances are positional with respect to X_train, so the names must be
# looked up in X_train's own columns rather than in the raw frame's.
top_features = X_train.columns[indices]
print("Top 5 features:", top_features)

# Refit the same forest on the reduced feature set and score it.
X_train_rf = X_train[top_features]
X_test_rf = X_test[top_features]
rf.fit(X_train_rf, y_train)
y_pred_rf = rf.predict(X_test_rf)
# Accuracy summary: one line per experiment, each scored against the test
# labels that belong to the split it was trained on.
for report_label, true_labels, predicted_labels in (
    ("Accuracy:", y_test, y_pred),
    ("correlation filter:", y_test_corr, y_pred_corr),
    ("low variance filter:", y_test_var, y_pred_var),
    ("forward feature selection:", y_test, y_pred_fs),
    ("backward elimination:", y_test, y_pred_rfe),
    ("random features:", y_test, y_pred_rf),
):
    print(report_label, accuracy_score(true_labels, predicted_labels))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Accuracy: 0.7359307359307359
Selected features: Index(['Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype='object')
Top 5 features: Index(['BloodPressure', 'DiabetesPedigreeFunction', 'Age', 'BMI', 'Glucose'], dtype='object')
Accuracy: 0.7359307359307359
correlation filter: 0.