In [1]:
import numpy as np
import pandas as pd

from sklearn import svm, datasets
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import metrics

In [2]:
columns = ["Age", "Gender", "Polyuria", "Polydipsia", "sudden weight loss", "weakness", "Polyphagia", 
           "Genital thrush", "visual blurring", "Itching", "Irritability", "delayed healing", 
           "partial paresis", "muscle stiffness", "Alopecia", "Obesity", "class"]

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00529/diabetes_data_upload.csv")

In [3]:
print(df.head())

   Age Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
0   40   Male       No        Yes                 No      Yes         No   
1   58   Male       No         No                 No      Yes         No   
2   41   Male      Yes         No                 No      Yes        Yes   
3   45   Male       No         No                Yes      Yes        Yes   
4   60   Male      Yes        Yes                Yes      Yes        Yes   

  Genital thrush visual blurring Itching Irritability delayed healing  \
0             No              No     Yes           No             Yes   
1             No             Yes      No           No              No   
2             No              No     Yes           No             Yes   
3            Yes              No     Yes           No             Yes   
4             No             Yes     Yes          Yes             Yes   

  partial paresis muscle stiffness Alopecia Obesity     class  
0              No              Yes      

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

In [5]:
from collections import Counter
print(Counter(df["class"]))

Counter({'Positive': 320, 'Negative': 200})


In [6]:
y = df["class"]
X = df.drop(columns=["class"])

In [10]:
# scale the age variable
from sklearn.preprocessing import MinMaxScaler
X[["Age"]] = MinMaxScaler().fit_transform(X[["Age"]])

In [11]:
print(df.head())

        Age Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
0  0.324324   Male       No        Yes                 No      Yes         No   
1  0.567568   Male       No         No                 No      Yes         No   
2  0.337838   Male      Yes         No                 No      Yes        Yes   
3  0.391892   Male       No         No                Yes      Yes        Yes   
4  0.594595   Male      Yes        Yes                Yes      Yes        Yes   

  Genital thrush visual blurring Itching Irritability delayed healing  \
0             No              No     Yes           No             Yes   
1             No             Yes      No           No              No   
2             No              No     Yes           No             Yes   
3            Yes              No     Yes           No             Yes   
4             No             Yes     Yes          Yes             Yes   

  partial paresis muscle stiffness Alopecia Obesity     class  
0         

In [12]:
# Create dummy variables for categorical predictors
cat_vars = ["Gender", "Polyuria", "Polydipsia", "sudden weight loss", "weakness", "Polyphagia", 
           "Genital thrush", "visual blurring", "Itching", "Irritability", "delayed healing", 
           "partial paresis", "muscle stiffness", "Alopecia", "Obesity"]

X = pd.get_dummies(X[cat_vars])

In [13]:
print(df.head())

        Age Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
0  0.324324   Male       No        Yes                 No      Yes         No   
1  0.567568   Male       No         No                 No      Yes         No   
2  0.337838   Male      Yes         No                 No      Yes        Yes   
3  0.391892   Male       No         No                Yes      Yes        Yes   
4  0.594595   Male      Yes        Yes                Yes      Yes        Yes   

  Genital thrush visual blurring Itching Irritability delayed healing  \
0             No              No     Yes           No             Yes   
1             No             Yes      No           No              No   
2             No              No     Yes           No             Yes   
3            Yes              No     Yes           No             Yes   
4             No             Yes     Yes          Yes             Yes   

  partial paresis muscle stiffness Alopecia Obesity     class  
0         

In [22]:
df[cat_vars].replace({"No": 0, "Yes": 1})
# df[cat_vars].replace({"Male": 0, "Female": 1})
print(df.head())

        Age Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
0  0.324324   Male       No        Yes                 No      Yes         No   
1  0.567568   Male       No         No                 No      Yes         No   
2  0.337838   Male      Yes         No                 No      Yes        Yes   
3  0.391892   Male       No         No                Yes      Yes        Yes   
4  0.594595   Male      Yes        Yes                Yes      Yes        Yes   

  Genital thrush visual blurring Itching Irritability delayed healing  \
0             No              No     Yes           No             Yes   
1             No             Yes      No           No              No   
2             No              No     Yes           No             Yes   
3            Yes              No     Yes           No             Yes   
4             No             Yes     Yes          Yes             Yes   

  partial paresis muscle stiffness Alopecia Obesity     class  
0         

In [23]:
#Split data into a train and test set
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

In [27]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(min_impurity_decrease=0.05, ccp_alpha=0.01,criterion='gini')
dt.fit(x_train, y_train)

In [28]:
dt_preds = dt.predict(x_test)

In [29]:
from sklearn.metrics import accuracy_score
dt_score = accuracy_score(y_test, dt_preds)
print(dt_score)

0.8717948717948718


In [30]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt_preds = dt.predict(x_test)
dt_score = accuracy_score(y_test, dt_preds)
print(dt_score)

0.9743589743589743


In [31]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_preds = lr.predict(x_test)
lr_score = accuracy_score(y_test, lr_preds)
print(lr_score)

0.9358974358974359


In [34]:
# Create a sequential forward selection model

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

sfs = SFS(lr,
          forward=True,
          floating=False,
          k_features=9,
          scoring='accuracy',
          cv=0)

# Fit the sequential forward selection model to X and y
sfs.fit(x_train, y_train)

# Inspect the results of sequential forward selection
print(sfs.subsets_[9])

# See which features sequential forward selection chose
print(sfs.subsets_[9]['feature_names'])

# Print the model accuracy after doing sequential forward selection
print(sfs.subsets_[9]['avg_score'])

{'feature_idx': (0, 1, 2, 3, 4, 8, 18, 20, 26), 'cv_scores': array([0.93131868]), 'avg_score': 0.9313186813186813, 'feature_names': ('Gender_Female', 'Gender_Male', 'Polyuria_No', 'Polyuria_Yes', 'Polydipsia_No', 'weakness_No', 'Irritability_No', 'delayed healing_No', 'Alopecia_No')}
('Gender_Female', 'Gender_Male', 'Polyuria_No', 'Polyuria_Yes', 'Polydipsia_No', 'weakness_No', 'Irritability_No', 'delayed healing_No', 'Alopecia_No')
0.9313186813186813
