In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score


In [2]:
# Load the diabetes dataset from a CSV file in the working directory.
diabetes_dataset = pd.read_csv('diabetes.csv')

# Quick look at the data. A notebook only renders the last bare expression of
# a cell, so each inspection is printed explicitly instead of being discarded.
print(diabetes_dataset.head())
print(diabetes_dataset.shape)
print(diabetes_dataset.describe())

# Class balance of the target column.
# Recorded run: 500 non-diabetic (0) and 268 diabetic (1).
print(diabetes_dataset['Outcome'].value_counts())

Outcome
0    500
1    268
Name: count, dtype: int64


In [3]:
# Separate the feature columns (X) from the target column (Y).
# `columns=` already identifies the axis, so no `axis=` argument is needed.
X = diabetes_dataset.drop(columns='Outcome')
Y = diabetes_dataset['Outcome']

'print(X)\nprint(Y)'

In [4]:
# Standardize the features.
# The scaler is fitted on the raw feature columns taken straight from
# `diabetes_dataset` rather than on `X`, because `X` is overwritten at the end
# of this cell: re-running the cell would otherwise refit the scaler on
# already-standardized values and leave it unable to scale new raw inputs.
raw_features = diabetes_dataset.drop(columns='Outcome')
scaler = StandardScaler()
standardized_data = scaler.fit_transform(raw_features)
print(standardized_data)

# NOTE(review): the scaler is fitted on every row, including rows that later
# end up in the test split, so test-set statistics leak into preprocessing.
# Fitting on the training rows only would avoid that.
X = standardized_data
Y = diabetes_dataset['Outcome']

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [5]:
# Hold out 20% of the rows for testing.
# The split is stratified on Y and uses a fixed seed so it is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)


In [6]:
import joblib

# Train a support-vector classifier with an RBF kernel on the training split.
classifier = svm.SVC(kernel='rbf')
classifier.fit(X_train, Y_train)

# Persist the fitted classifier. Only the classifier is written to this file;
# the scaler is not, so inputs must be standardized separately before use.
joblib.dump(classifier, 'diabetes_classifier.sav')

# Spot-check on one hand-written record of raw feature values.
# The classifier was trained on standardized features, so the raw record has
# to go through the same fitted scaler before it is handed to predict().
ab = [[2,90,68,42,0,38.2,0.503,27]]
ab_frame = pd.DataFrame(ab, columns=diabetes_dataset.columns.drop('Outcome'))
random_prediction = classifier.predict(scaler.transform(ab_frame))
print(random_prediction)

[0]


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Model 2: a random forest with 20 trees.
# random_state fixes the forest's internal randomness so the fitted model and
# its scores are the same on every run (same seed as the train/test split).
random_forest_model = RandomForestClassifier(n_estimators=20, random_state=2)
random_forest_model.fit(X_train, Y_train)


In [12]:
from sklearn.tree import DecisionTreeClassifier

# Model 3: a single decision tree.
# random_state makes the fitted tree reproducible from run to run
# (same seed as the train/test split).
des_tree_model = DecisionTreeClassifier(random_state=2)
des_tree_model.fit(X_train, Y_train)


In [46]:
from sklearn.linear_model import LogisticRegression
import joblib

# Model 4: logistic regression with the estimator's default settings.
# fit() returns the estimator itself, so construction and training are chained.
log_model = LogisticRegression().fit(X_train, Y_train)

# Show the full set of hyper-parameters the estimator is using.
print(log_model.get_params())

# Save the fitted model; the value returned by dump() is this cell's output.
joblib.dump(log_model, 'diabetes_logmodel.sav')

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


['diabetes_logmodel.sav']

In [44]:
# Accuracy of each model on the training split (the data it was fitted on).
X_train_prediction = classifier.predict(X_train)             # SVM
X_train_prediction2 = random_forest_model.predict(X_train)   # random forest
X_train_prediction3 = des_tree_model.predict(X_train)        # decision tree
X_train_prediction4 = log_model.predict(X_train)             # logistic regression

# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_accuracy2 = accuracy_score(Y_train, X_train_prediction2)
training_data_accuracy3 = accuracy_score(Y_train, X_train_prediction3)
training_data_accuracy4 = accuracy_score(Y_train, X_train_prediction4)

# Printed in model order: SVM, random forest, decision tree, logistic regression.
print(training_data_accuracy)
print(training_data_accuracy2)
print(training_data_accuracy3)
print(training_data_accuracy4)

0.7833876221498371
1.0
1.0
0.7866449511400652


In [43]:
# Accuracy of each model on the held-out test split.
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.

# SVM
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

# Random forest
X_test_prediction2 = random_forest_model.predict(X_test)
test_data_accuracy2 = accuracy_score(Y_test, X_test_prediction2)

# Decision tree
X_test_prediction3 = des_tree_model.predict(X_test)
test_data_accuracy3 = accuracy_score(Y_test, X_test_prediction3)

# Logistic regression
X_test_prediction4 = log_model.predict(X_test)
test_data_accuracy4 = accuracy_score(Y_test, X_test_prediction4)

# Printed in model order: SVM, random forest, decision tree, logistic regression.
print(test_data_accuracy)
print(test_data_accuracy2)
print(test_data_accuracy3)
print(test_data_accuracy4)

0.7792207792207793
0.7272727272727273
0.7337662337662337
0.7857142857142857


In [27]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for tuning the logistic regression model.
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type
    'C': [0.01, 0.1, 1],      # Inverse of regularization strength (lower values for stronger regularization)
    'solver': ['liblinear'],  # Suitable for small datasets
    'max_iter': [100, 200],   # Maximum number of iterations
    'tol': [1e-4, 1e-3],      # Tolerance for stopping criteria
}

# 10-fold cross-validated grid search over the training split.
# Candidates are ranked by accuracy, a classification metric, to match how the
# models are evaluated elsewhere in this notebook.
grid_search = GridSearchCV(
    log_model,
    param_grid,
    cv=10,
    scoring='accuracy',
    return_train_score=True,
)
grid_search.fit(X_train, Y_train)

# Last expression: display the best estimator found by the search.
grid_search.best_estimator_

In [28]:
# Evaluate the tuned logistic regression model on the held-out test split.
log_model_tuned = grid_search.best_estimator_
X_test_prediction5 = log_model_tuned.predict(X_test)

# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
test_data_accuracy5 = accuracy_score(Y_test, X_test_prediction5)
print(test_data_accuracy5)
print(log_model_tuned)

0.7792207792207793
LogisticRegression(C=1, solver='liblinear')


In [12]:
import joblib

# One record of raw feature values, in the same column order as the dataset's
# feature columns.
input_data = (0, 137, 40, 35, 168, 43.1, 2.288, 33)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Attach the feature column names to the single row. The scaler was fitted on
# a DataFrame, so passing named columns lets scikit-learn check the feature
# names/order and avoids its "X does not have valid feature names" warning.
input_frame = pd.DataFrame(
    input_data_reshaped,
    columns=diabetes_dataset.columns.drop('Outcome'),
)

# standardize the input data with the scaler fitted earlier in the notebook
std_data = scaler.transform(input_frame)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

# Label 0 is reported as non-diabetic, any other label as diabetic.
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

# Persist the fitted SVM classifier. The scaler is not saved with it, so any
# consumer of this file must standardize its inputs the same way first.
joblib.dump(classifier, 'diaba_yt_model.sav')

[[-1.14185152  0.5040552  -1.50468724  0.90726993  0.76583594  1.4097456
   5.4849091  -0.0204964 ]]
[1]
The person is diabetic




['diabetes_final_model.sav']