In [8]:
import sklearn
from sklearn import datasets, svm, metrics
print(dir(datasets))

['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__getattr__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_arff_parser', '_base', '_california_housing', '_covtype', '_kddcup99', '_lfw', '_olivetti_faces', '_openml', '_rcv1', '_samples_generator', '_species_distributions', '_svmlight_format_fast', '_svmlight_format_io', '_twenty_newsgroups', 'clear_data_home', 'dump_svmlight_file', 'fetch_20newsgroups', 'fetch_20newsgroups_vectorized', 'fetch_california_housing', 'fetch_covtype', 'fetch_kddcup99', 'fetch_lfw_pairs', 'fetch_lfw_people', 'fetch_olivetti_faces', 'fetch_openml', 'fetch_rcv1', 'fetch_species_distributions', 'get_data_home', 'load_breast_cancer', 'load_diabetes', 'load_digits', 'load_files', 'load_iris', 'load_linnerud', 'load_sample_image', 'load_sample_images', 'load_svmlight_file', 'load_svmlight_files', 'load_wine', 'make_biclusters', 'make_blobs', 'make_checkerboard', 'make_circles', 'make_classification', 'make_friedman1', 'make_f

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [11]:
# Load the dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ["NumPregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
data = pd.read_csv(url, names=columns)
print(data.columns)

Index(['NumPregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [12]:
# Handling missing values: Replace zeros with NaN and then fill with mean of the column
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[columns_to_replace] = data[columns_to_replace].replace(0, pd.NA)
data.fillna(data.mean(), inplace=True)

  data.fillna(data.mean(), inplace=True)


In [13]:
# Feature Scaling
scaler = StandardScaler()
features = ['NumPregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
data[features] = scaler.fit_transform(data[features])

In [14]:
# Splitting the dataset
X = data.drop('Outcome', axis=1)
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Assuming 'X_train', 'X_test', 'y_train', 'y_test' are prepared from Part 1
# Initialize the Logistic Regression model
model = LogisticRegression(solver='liblinear', random_state=42)

In [15]:
# Train the model
model.fit(X_train, y_train)
# Predict on the testing set
y_pred = model.predict(X_test)
# Model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')
# Detailed classification report
print(classification_report(y_test, y_pred))

Model Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154



In [16]:
from sklearn.model_selection import GridSearchCV
# Set up the parameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'solver': ['liblinear', 'lbfgs']}
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
# Perform the grid search
grid_search.fit(X_train, y_train)
# Best parameters and best score
print('Best parameters:', grid_search.best_params_)
print('Best cross-validation score: {:.2f}'.format(grid_search.best_score_))
# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print('Test set score: {:.2f}'.format(best_model.score(X_test, y_test)))

import pickle
# Assuming 'best_model' is the model obtained from GridSearchCV in Lecture 34
with open('diabetes_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
print("Model has been serialized and saved as 'diabetes_model.pkl'")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'C': 0.1, 'solver': 'lbfgs'}
Best cross-validation score: 0.77
Test set score: 0.77
Model has been serialized and saved as 'diabetes_model.pkl'


In [17]:
pip freeze >> requirements.txt

Note: you may need to restart the kernel to use updated packages.
