In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Preparing the Data:

### a) Load the dataset

In [2]:
file_path = r"C:\Users\Charles\Documents\CNA Lectures\Third Term Sept to Dec 2023\Emerging Trends & Innovation CP4477 Arun Rameshbabu\Assignment2Diabetes\diabetes_dataset.csv"

In [3]:
df = pd.read_csv(file_path)

In [4]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


#### First five rows and dataset info

In [5]:
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


### b. Handle missing values

In [7]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### There are no missing values

### b) Handle missing values and outliers effectively.

##### Using Interquartile range

In [8]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

##### Removing outliers based on the Interquartile range above

In [9]:
df_remove_outliers = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]


##### Show statistics after handing outliers effectively.

In [10]:
df_remove_outliers.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,639.0,639.0,639.0,639.0,639.0,639.0,639.0,639.0,639.0
mean,3.804382,119.112676,72.120501,20.56338,65.931142,32.00579,0.429177,32.71518,0.312989
std,3.260995,29.162175,11.348686,15.339991,79.569482,6.43397,0.250957,11.080651,0.464073
min,0.0,44.0,38.0,0.0,0.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,0.0,0.0,27.3,0.242,24.0,0.0
50%,3.0,114.0,72.0,23.0,37.0,32.0,0.358,29.0,0.0
75%,6.0,137.0,80.0,32.0,120.0,35.95,0.586,40.0,1.0
max,13.0,198.0,106.0,60.0,318.0,50.0,1.191,66.0,1.0


### C) Perform any necessary data transformations, standardization, or encoding.

##### Now working with the dataframe of no ouutliers ie "df_remove_outliers"

##### filter numeric data

In [11]:
numeric_data_filter = df_remove_outliers.select_dtypes(include=['float64', 'int64']).columns


#### Now let us  standardize numerical data features

In [12]:
standard_scaler = StandardScaler()
df_standardized = pd.DataFrame(standard_scaler.fit_transform(df_remove_outliers[numeric_data_filter]), columns=numeric_data_filter)


#### Let's view the statistics distribution of the standardization

In [13]:
df_standardized.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,639.0,639.0,639.0,639.0,639.0,639.0,639.0,639.0,639.0
mean,-1.13281e-16,-1.030301e-16,-4.021735e-16,1.2509560000000001e-17,1.389951e-17,-1.598443e-16,-1.825179e-16,1.791299e-16,-1.55327e-16
std,1.000783,1.000783,1.000783,1.000783,1.000783,1.000783,1.000783,1.000783,1.000783
min,-1.167546,-2.577706,-3.008915,-1.341558,-0.8292475,-2.147446,-1.400446,-1.058093,-0.6749673
25%,-0.8606511,-0.690224,-0.716106,-1.341558,-0.8292475,-0.7319706,-0.7464363,-0.7871386,-0.6749673
50%,-0.2468609,-0.1754561,-0.01062636,0.1589654,-0.3638808,-0.0009006622,-0.2838437,-0.335548,-0.6749673
75%,0.6738245,0.6138547,0.6948533,0.7461268,0.6800499,0.6135092,0.6253899,0.6579512,1.481553
max,2.82209,2.707244,2.987662,2.572851,3.17039,2.798942,3.038049,3.006222,1.481553


# 2. Exploratory Data Analysis

#### 2a)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [15]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [16]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

#### 2b) 

#### Descriptive statistics of data

In [17]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

#### Using Pairpolot to visualize the relationships between numerical data

In [None]:
sns.set(style="whitegrid")
sns.pairplot(df, hue='Outcome', corner=True, diag_kind='kde', markers=["o", "s"], palette="husl")
plt.show()

#### Using Correlation heatmap

In [None]:
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
custom_palette = sns.color_palette("viridis")
sns.heatmap(correlation_matrix, annot=True, cmap=custom_palette, linewidths=0.5)
plt.title('Correlation Heatmap for data')
plt.show()

#### 2c) Identify important features that could impact diabetes prediction.

##### Using  Boxplots to visualize features

In [None]:
plt.figure(figsize=(18, 12))
for i, column in enumerate(df.columns[:-1], 1):
    plt.subplot(3, 3, i)
    sns.boxplot(x='Outcome', y=column, data=df)
    plt.title(f'{column} vs. Outcome')
plt.tight_layout()    
plt.show()

#### 2c) Based on that data from the boxplot, the important features that could impact diabetes are: Glucose levels, BMI, Age, Pregnancies, Insulin Levels

# 3. Linear Regression Model


####  Define features (X) and target variable (y)

In [None]:
X = df.drop("Glucose", axis=1) 
y = df["Glucose"]  

#### 3a) Split the dataset into training and testing sets.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### Now Build a linear regression model

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

#### Now make prediction based on the prepared test set

In [None]:
y_pred = model.predict(X_test)

#### Check model's performance using mse and r2 score

In [None]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [None]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

#### Analysis of result of r2 and mse

##### The MSE of 704.68 reflects the average squared difference between predicted and actual glucose values. A lower MSE is better, so minimizing it is the goal. 
##### The R² value of 0.30 implies that around 30% of the glucose variance is explained by the model's features. R² ranges from 0 to 1, with 1 indicating a perfect fit. A higher R² is better, but interpretation depends on the specific problem and expectations.

# 4. Logistic Regression Model

#### 4a) Use outcome as the target variable.

In [None]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

#### 4b) Split the dataset into training and testing sets.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)


#### 4c) Build a logistic regression model to classify patients into diabetic or non-diabetic based on suitable features.

In [None]:
logreg_model = LogisticRegression(random_state=48, max_iter=1000)  # Increase max_iter
logreg_model.fit(X_train, y_train)


#### 4d) Evaluate the model's performance using appropriate metrics (e.g., accuracy, precision, recall, F1-score).

In [None]:
y_pred = logreg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)

#### Display results of model performance using metrics above

In [None]:
print(f'Accuracy: {accuracy:.5f}')
print(f'Precision: {precision:.5f}')
print(f'Recall: {recall:.5f}')
print(f'F1-Score: {f1:.5f}')
print(f'Confusion Matrix:\n{confusion_matrix}')

#### 4e). Utilize and demonstrate cross-validation and grid search techniques to optimize the model's hyperparameters.

In [None]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(LogisticRegression(random_state=48, solver='lbfgs', penalty='l2', max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [None]:
best_logreg_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [None]:
print(f'Best Hyperparameters are: {best_params}')

#### Now Evaluating the model with best hyperparameters

In [None]:
y_pred_cv = best_logreg_model.predict(X_test)

In [None]:
accuracy_cv = accuracy_score(y_test, y_pred_cv)
precision_cv = precision_score(y_test, y_pred_cv)
recall_cv = recall_score(y_test, y_pred_cv)
f1_cv = f1_score(y_test, y_pred_cv)

In [None]:
print(f'Cross-Validated Accuracy result : {accuracy_cv:.4f}')
print(f'Cross-Validated Precision result: {precision_cv:.4f}')
print(f'Cross-Validated Recall result: {recall_cv:.4f}')
print(f'Cross-Validated F1-Score: {f1_cv:.4f}')

# 5 Comparison and Interpretation

#### 5a) Discuss the pros and cons of each model in the given context.

##### For the Linear Regression Model, the pros are it is straightforward and easy to understand. It helps us see how different things, like age or BMI, might change someone's blood sugar level. it is simple to use, just like basic math. If one thing goes up, what happens to the blood sugar? etc.

##### The cons for the Linear Regression Model are it is picky. It works best if everything goes up or down in a straight line. If not, it might get confused. Secondly, when outliers are involved, it might not give best answers. weird numbers can mess it up

##### For Logistic Regression Models, Instead of saying "yes" or "no" directly, it tells us the chance of having diabetes. Like, "There's a 70% chance this person has diabetes." more like it gives probabilities which is better. Secondly, It handles strange values better. If there are some really high or low numbers, it doesn't mess up the performance

##### As per the Cons for the Logistic Regression Models, logistic regression assumes a linear relationship between the features and the log-odds of the target variable. Secondly, Logistic regression might struggle with complex relationships in the data, especially if there are non-linear patterns. If things are too complicated, it might struggle

## In summary

##### The model obtained an accuracy of 78%. In overall performance, I think it is good. As per precision, if it says someone has diabetes, it's right about 70% of the time, that is not bad. A recall of 57% implies that the model captures 57% of the actual people with diabetic cases. Lastly, an F1-Score of 0.63 suggests a middle-ground performance – not bad, not excellent, just a fair balance between being accurate and catching the right cases.

##### In summary, both models are good tools. One is like a basic calculator, and the other is like advanced. They both help us understand diabetes in different ways. Based on requirements, tasks and objective, both models provide valuable insights.

## 5b) Provide insights into the meaningful predictors for diabetes.

##### For Glucose levels, People with higher glucose levels are more likely to have diabetes. It seems like a significant predictor. For Body Mass Index (BMI), Body Mass Index (BMI) is also a factor. Higher BMI may contribute to the likelihood of diabetes. As per age, age seems to be connected with diabetes. Older individuals might have a higher chance of having diabetes. Furthermore, for females, the number of pregnancies might influence diabetes risk. More pregnancies could mean a higher likelihood. Insulin and blood sugar are also predicators for diabetes; higher levels or unusual blood pressure might indicate a higher risk. Again as per genetics, the family history of diabetes, represented by the Diabetes Pedigree Function, could contribute. Genetics might play a role. Lastly, skin thickness and other factors also contributing factors to diabetes, although their impacts might be little.

# 6. Conclusion and Documentation

## 6a) Summarize the results obtained from both models.

##### The model obtained an accuracy of 78%. In overall performance, I think it is good. As per precision, if it says someone has diabetes, it's right about 70% of the time, that is not bad. A recall of 57% implies that the model captures 57% of the actual people with diabetic cases. Lastly, an F1-Score of 0.63 suggests a middle-ground performance – not bad, not excellent, just a fair balance between being accurate and catching the right cases.

### 6b) Discuss limitations and potential areas for improvement.

##### 1) The dataset might have an imbalance between diabetic and non-diabetic cases, affecting model performance. 2) Feature engineering and careful selection could enhance model performance. Identifying more relevant features might improve predictions. 3) Further tuning of hyperparameters, especially in the logistic regression model, might optimize performance. 4) A larger dataset could provide a more robust understanding of predictors and enhance model training. 5) Inclusion of domain expertise and additional medical features might enhance the model's predictive capabilities. In summary, the models look promising, but we need to keep refining them and being mindful of the dataset's limitations to make predictions more accurate. It's key to understand that models are tools, and they get better when we keep adjusting and getting insights from both data and domain experts