<a href="https://colab.research.google.com/github/dolly500/All-Coursera-assignment-on-on-Python-on-Git/blob/main/Copy_of_Diabetes_prediction_ML_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-quoted download URL>" entries, parsed by the download loop below.
# The URL is a signed link carrying X-Goog-Date / X-Goog-Expires parameters; the loop reports an HTTPError as "likely expired".
DATA_SOURCE_MAPPING = 'pima-indians-diabetes-database:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F228%2F482%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240218%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240218T083722Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6fc7b27b8c68f8c4eb09fa82f48ef442b35215e8cbbc094d9ec21fbe46cae36a96cdc4993e192a1a403c39af449f51cc6391eb4368d778344f468813d3622f7f12acd98122a911f43d5e79d07ccb7873ea9142cdefba7b9bc8b28320008643e511af0f0828d4cdbc540663ff8afbc8d8b4ab17fdc19b8ca8e85295eefc4de7cf8cf5d762edf9e30af22c25d4bec4f8687534bf1534cba27ab771bec1e23bce0786b88e244e90d248c10c9b342c0c13dc4acf55a4042b7cfb582b0f32cbad468cd17fad3d8d9a80ed8cbe187c9a2b0432a660e8a1db12c9c1ae99703070abcc99dd64b6dd171c7ceb34e9a88cc1e181fc63542bc63497564c64abc0b75fd90edb'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-quoted URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A source that fails to
# download is reported and skipped so the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a colon inside the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing or zero; in that case the
            # bar stays empty and only the running byte count is shown.
            expected_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Flush buffered bytes and rewind so the archive readers see the whole file.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the open file object (no reopen by name) and bind the
                # archive to its own name so the `tarfile` module is not shadowed.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the diabetes CSV from the /kaggle/input directory populated by the data-import cell above.
df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

# Exploring the dataset

**Displaying the first few rows of the dataset :**

In [None]:
df.head()

**Displaying the last few rows of the dataset :**

In [None]:
df.tail()

**Getting a random sample from the dataset :**

In [None]:
# Fixed seed so the same five rows are shown on every Restart & Run All.
df.sample(5, random_state=42)

**Exploring the dataset as a structure :**

In [None]:
df.info(verbose = True )

In [None]:
df.shape

In [None]:
df.size

In [None]:
df.columns

In [None]:
df.isnull().sum()

**Getting a statistical summary of the dataset :**

In [None]:
df.describe().transpose()

In [None]:
# Separate the frame into the feature columns (all but the last) and the target (the last column) :
feature_columns, target_column = df.columns[:-1], df.columns[-1]
X = df.loc[:, feature_columns]
y = df.loc[:, target_column]

In [None]:
# Pairwise correlations between the features, drawn as an annotated heatmap :
corr = X.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

In [None]:
# Pairwise scatter plots of every feature against every other one, with a KDE of each feature on the diagonal :
sns.pairplot(data = X , diag_kind = 'kde')

In [None]:
# One figure per feature: overlaid probability histograms (with KDE curves)
# of the rows whose Outcome is 1 and of the rows whose Outcome is 0.
diabetic_rows = df.loc[df['Outcome'] == 1]
non_diabetic_rows = df.loc[df['Outcome'] == 0]
for feature in df.columns[:-1]:
    sns.histplot(data=diabetic_rows[feature], bins=20, color='Red', kde=True,
                 label='Diabetic', stat='probability', alpha=0.5)
    sns.histplot(data=non_diabetic_rows[feature], bins=20, color='green', kde=True,
                 label='Non-Diabetic', stat='probability', alpha=0.3)
    plt.legend(['Diabetic', 'Non-Diabetic'])
    plt.show()

# Let's split the data into training, test and validation datasets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep 70% of the rows for training and hold out the remaining 30% :
X_train , X_temp , y_train , y_temp = train_test_split(X , y , test_size=0.3 , random_state=42)

# Split the held-out 30% in half into the test and validation sets.
# The held-out part (X_temp / y_temp) must be split here: re-splitting X_train
# would make the test and validation rows a subset of the training rows.
X_test , X_valid , y_test , y_valid = train_test_split(X_temp , y_temp , test_size=0.5 , random_state=42)

**Fixing the problem of scaling and balancing :**

**When training a machine learning model, it is generally recommended to scale the data before balancing it. Scaling the data involves normalizing or standardizing the features to ensure that they have a similar influence on the model, regardless of their original range. This is particularly important when using algorithms sensitive to the scale of the features, such as k-Nearest Neighbors (k-NN). By scaling the data before balancing, potential issues related to features with significantly different ranges can be mitigated, leading to improved model performance**


**Balancing a training dataset refers to adjusting the distribution of classes to ensure that each class is represented in a more equal manner. This is important because machine learning algorithms, particularly classification algorithms, may be biased towards the majority class when the dataset is imbalanced, leading to poor performance on the minority classes. Balancing the dataset can be achieved through various techniques such as undersampling the majority class, oversampling the minority class, or adjusting the weights of each class during training**


In [None]:
# Data varies in terms of scale which can negatively affect the learning process of our model.
# A fixed seed keeps the displayed rows identical across re-runs.
X_train.sample(5, random_state=42)

In [None]:
# Class distribution of the training labels (1 is shown as 'Diabetic', 0 as 'Non-Diabetic'); the pie chart shows how unbalanced they are.
# NOTE(review): the earlier wording said diabetic cases are the majority - confirm which class is larger from the chart.
y_train.replace({1 : 'Diabetic' , 0  : 'Non-Diabetic'}).value_counts().plot.pie(autopct='%1.1f%%')

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

**SMOTE (Synthetic Minority Over-sampling Technique) is an over-sampling algorithm used to address the problem of imbalanced datasets, where the number of instances in the minority class is much smaller than the number of instances in the majority class. SMOTE works by generating synthetic samples for the minority class by creating new instances similar to the existing minority class instances. It selects a minority class instance at random and finds its k nearest minority class neighbors. It then generates synthetic instances along the line segments joining these k nearest neighbors in the feature space. This process helps in creating a more robust and balanced dataset for training machine learning models, particularly in scenarios where the minority class is underrepresented.**


In [None]:
def sclaing (X, scaler=None) :
    """Standardise the features of ``X``.

    Parameters
    ----------
    X : array-like
        Feature matrix accepted by ``StandardScaler``.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. When given, ``X`` is transformed with its
        statistics, which allows test/validation data to be scaled exactly
        like the training data. When omitted, a new ``StandardScaler`` is
        fitted on ``X`` itself (the original behaviour).

    Returns
    -------
    The scaled feature matrix.
    """
    if scaler is None:
        return StandardScaler().fit_transform(X)
    return scaler.transform(X)

In [None]:
def balancing(X , y , random_state=42) :
    """Oversample ``X`` / ``y`` with SMOTE and return the resampled pair.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels aligned with the rows of ``X``.
    random_state : int or None, default 42
        Seed passed to ``SMOTE`` so the synthetic samples are the same on
        every run; pass ``None`` for a non-deterministic resampling.

    Returns
    -------
    (X_balanced, y_balanced) as returned by ``SMOTE.fit_resample``.
    """
    smote = SMOTE(random_state=random_state)
    X_balanced , y_balanced = smote.fit_resample(X , y)
    return X_balanced , y_balanced


In [None]:
# Fit the scaler on the training set only, then apply those same statistics to
# the test and validation sets. Fitting a separate scaler on each split would
# standardise every split with its own mean/variance, so identical raw values
# would map to different scaled values depending on the split they are in.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_valid = scaler.transform(X_valid)

In [None]:
# Balancing the training dataset only (SMOTE resampling via balancing());
# the test and validation sets are not passed through it.
X_train , y_train = balancing(X_train , y_train)

In [None]:
y_train.replace({1 : 'Diabetic' , 0 : 'Non-diabetic'}).value_counts().plot.pie(autopct='%1.1f%%')

# Training and predicting

In [None]:
from sklearn.neighbors import KNeighborsClassifier

**K-Nearest Neighbors (KNN) is a simple algorithm for classification and regression. It predicts the class or value of a new data point based on the majority class or average value of its nearest neighbors. No explicit training is required, making it easy to implement. However, it can be computationally expensive during prediction, and the choice of the number of neighbors (K) is critical for its performance.**




**Searching for the best value of the k-parameter:**

In [None]:
# Fit one k-NN classifier for each k in 1..14 and record its score on the
# training set and on the test set.
train_score = []
test_score = []

for n_neighbors in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    train_score.append(knn.score(X_train, y_train))
    test_score.append(knn.score(X_test, y_test))

In [None]:
# Score as a function of k for the training and the test set.
k_values = range(1 , 15)
sns.lineplot(x=k_values , y=train_score , label = "Training score")
# This curve shows test_score, so it must not be labelled "Training score".
sns.lineplot(x=k_values , y=test_score , label = "Test score")
plt.xlabel('K-parameter')
plt.ylabel('Score')
plt.title('k-NN score for each value of k')
plt.grid(True)
plt.show()

In [None]:
# Choose k on the validation set. Training accuracy cannot be used for this:
# with k=1 each training point is its own nearest neighbour, so the training
# score peaks at k=1 regardless of how well the model generalises. Using the
# validation set also keeps the test set untouched for the final evaluation.
valid_score = [
    KNeighborsClassifier(n_neighbors=k).fit(X_train , y_train).score(X_valid , y_valid)
    for k in range(1 , 15)
]
# index() returns the first maximum, i.e. the smallest k among ties; +1 maps the 0-based index back to k.
best_k = valid_score.index(max(valid_score)) + 1
best_k

In [None]:
# Train the final model with the k selected above instead of a hard-coded value.
knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(X_train , y_train)

In [None]:
y_predict = knn_model.predict(X_test)

**Evaluating the performance of the model**

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

**Classification report :**

In [None]:
print(classification_report(y_test , y_predict))

**Confusion matrix :**

In [None]:
confusion_matrix(y_test , y_predict)

In [None]:
# Rows of the confusion matrix are the true labels and columns the predicted labels.
# fmt='d' prints the counts as plain integers (the default '.2g' format would
# show counts of 100 or more in scientific notation).
ax = sns.heatmap(confusion_matrix(y_test , y_predict) , annot=True , fmt='d' , xticklabels=['Non-Diabetic' , 'Diabetic'] , yticklabels=['Non-Diabetic' , 'Diabetic'])
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()