# Setting Up the Notebook:

In [None]:
# Access Kaggle's API to get the path to the dataset.

!pip install kaggle
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ankushpanday1/pcos-prediction-datasettop-75-countries")
filepath = path + '/pcos_prediction_dataset.csv'

print("Path to dataset files:", filepath)

In [None]:
# Import the libraries we'll be using: pandas, matplotlib, sklearn

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Technovation ML Component: PCOS Risk Predictor

### We followed [the machine learning process](https://www.codecademy.com/article/the-ml-process) to train and evaluate a model that predicts PCOS risk.

**The Machine Learning Process:**


1.   Formulate a Question
2.   Find and Understand the Data
3.   Clean the Data and Feature Engineer
4.   Choose a Model
5.   Tune and Evaluate
6.   Use the Model and Present Results

 **What do we want to find out? What will we predict?**

 First, we need to learn more about our topic. What tool can we build that addresses a problem around our topic?

### Our Questions:

1. What is PCOS?
2. What are risk factors and symptoms associated with PCOS?
3. Can we predict whether someone is at high or low risk of having PCOS?




## Collect and visualize the data.

We've found a dataset on Kaggle that we'll explore.


In [None]:
# Read the downloaded CSV file into a pandas DataFrame.
pcos_df = pd.read_csv(filepath_or_buffer=filepath)

# Show the first five rows as a quick sanity check.
pcos_df.head(n=5)


In [None]:
# Print a summary of the dataframe (columns, non-null counts, dtypes) so that
# columns with missing values stand out.
pcos_df.info()

In [None]:
# Count each distinct value in the 'Acne Severity' column, the one with
# missing/null values. dropna=False keeps NaN as its own row so the number of
# missing entries is visible.
pcos_df['Acne Severity'].value_counts(dropna=False)

In [None]:
# Replace missing 'Acne Severity' values with the label "No acne".
# The spelling must match the 'No acne' entry in the category list that the
# OrdinalEncoder is given later in this notebook; a differently-cased label
# would be rejected there as an unknown category.
# Assigning the result back (instead of a chained inplace fillna) guarantees
# the dataframe column itself is updated.
pcos_df['Acne Severity'] = pcos_df['Acne Severity'].fillna('No acne')

# Confirm that no NaN entries remain.
pcos_df['Acne Severity'].value_counts(dropna=False)

In [None]:
# Bar chart of how often each acne-severity level occurs, to look for trends.
severity_counts = pcos_df['Acne Severity'].value_counts()
plt.bar(x=severity_counts.index, height=severity_counts)

# Also print the raw counts behind the chart.
print(severity_counts)

# Label the figure.
plt.title(label='Counts of Acne Severity')
plt.xlabel(xlabel='Acne Severity')
plt.ylabel(ylabel='Counts')

In [None]:
# Look at each column's data type to decide which (non-numeric) columns need
# to be encoded before modelling.
pcos_df.info()

In [None]:
# Encode the categorical columns as numbers with scikit-learn's OrdinalEncoder.
from sklearn.preprocessing import OrdinalEncoder

# The values of each categorical column, listed in the ORDER in which they
# should be numbered (first entry -> 0.0, second -> 1.0, ...).
BMI_category = ['Underweight', 'Normal', 'Overweight', 'Obese']
MR_category = ['Regular', 'Irregular']
Hirsutism_category = ['No', 'Yes']
AS_category = ['No acne', 'Mild', 'Moderate', 'Severe']
FM_of_PCOS_category = ['No', 'Yes']
IR_category = ['No', 'Yes']
SL_category = ['Low', 'Medium', 'High']
UR_category = ['Urban', 'Rural']
SS_category = ['Low', 'Middle', 'High']
FC_category = ['No', 'Yes']
E_category = ['African', 'Asian', 'Caucasian', 'Hispanic', 'Other']

# Single source of truth pairing each column with its ordered categories, so
# the column list and the encoder's category list cannot drift out of step.
category_orders = {
    'BMI': BMI_category,
    'Menstrual Regularity': MR_category,
    'Hirsutism': Hirsutism_category,
    'Acne Severity': AS_category,
    'Family History of PCOS': FM_of_PCOS_category,
    'Insulin Resistance': IR_category,
    'Stress Levels': SL_category,
    'Urban/Rural': UR_category,
    'Socioeconomic Status': SS_category,
    'Fertility Concerns': FC_category,
    'Ethnicity': E_category,
}

# Columns to encode, and the names of the new columns that hold the results
# (ex. "BMI encoded" for the "BMI" column).
columns_to_encode = list(category_orders)
encoded_columns = [f'{column} encoded' for column in columns_to_encode]

# Create the encoder with one ordered category list per column.
encoder = OrdinalEncoder(categories=list(category_orders.values()))

# Fit and transform the data, storing the result in the new "... encoded" columns.
pcos_df[encoded_columns] = encoder.fit_transform(pcos_df[columns_to_encode])

# Check that the new columns were added to the dataframe.
pcos_df.info()

In [None]:
# Build the list of feature columns for the KNN model: every encoded column
# plus 'Age' and 'Lifestyle Score', which are used as they are.
# A new list is created here instead of aliasing encoded_columns and appending
# to it: that way encoded_columns is left untouched, and re-running this cell
# cannot add 'Age' / 'Lifestyle Score' a second time.
knn_columns = [*encoded_columns, 'Age', 'Lifestyle Score']
print(knn_columns)

In [None]:
# Separate the model inputs (features) from the label we want to predict.
features = pcos_df[knn_columns]
target = pcos_df['Diagnosis']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the shape of the training features: (number of rows, number of columns).
print(x_train.shape)

In [None]:
# Baseline model: a KNN classifier that votes among the 310 nearest neighbours.
# fit() returns the fitted estimator, so creation and training are chained.
model = KNeighborsClassifier(n_neighbors=310).fit(x_train, y_train)

# Predict labels for the same rows the model was trained on.
predictions = model.predict(x_train)

# Metric functions used by the evaluation cells.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
)

In [None]:
# Score the baseline model's predictions against the training labels.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
)

# Compute every metric first ...
accuracy = accuracy_score(y_train, predictions)
confusion_mat = confusion_matrix(y_train, predictions)
classification_rep = classification_report(y_train, predictions)

# ... then print each one under its heading.
for heading, result in (
    ("accuracy", accuracy),
    ("confusion matrix", confusion_mat),
    ("classification report", classification_rep),
):
    print(heading)
    print(result)

In [None]:
# Weighted KNN: same 310 neighbours, but closer neighbours count for more
# (weights="distance").
weighted_model = KNeighborsClassifier(n_neighbors=310, weights="distance")
weighted_model.fit(x_train, y_train)

# Evaluate on the training data first, then on the held-out test data.
# NOTE(review): with distance weighting each training row is presumably its
# own zero-distance neighbour, so training scores may look near-perfect;
# treat the test scores as the meaningful ones.
for features_split, labels_split in ((x_train, y_train), (x_test, y_test)):
    predictions = weighted_model.predict(features_split)

    print("accuracy")
    accuracy = accuracy_score(labels_split, predictions)
    print(accuracy)

    print("confusion matrix")
    confusion_mat = confusion_matrix(labels_split, predictions)
    print(confusion_mat)

    print("classification report")
    classification_rep = classification_report(labels_split, predictions)
    print(classification_rep)

In [None]:
# SMOTE = Synthetic Minority Oversampling Technique
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training data only; the test set is
# left untouched. The fixed random_state keeps the result reproducible.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# How many rows are in the resampled training set (features, then labels)?
for resampled in (x_train_resampled, y_train_resampled):
    print(resampled.shape)

In [None]:
# Train a distance-weighted KNN (415 neighbours) on the SMOTE-resampled data.
smote_model = KNeighborsClassifier(n_neighbors=415, weights="distance")
smote_model.fit(x_train_resampled, y_train_resampled)

# Evaluate on the original (not resampled) training data first, then on the
# held-out test data.
for features_split, labels_split in ((x_train, y_train), (x_test, y_test)):
    predictions = smote_model.predict(features_split)

    print("accuracy")
    accuracy = accuracy_score(labels_split, predictions)
    print(accuracy)

    print("confusion matrix")
    confusion_mat = confusion_matrix(labels_split, predictions)
    print(confusion_mat)

    print("classification report")
    classification_rep = classification_report(labels_split, predictions)
    print(classification_rep)

In [None]:
# Hyperparameter tuning: let GridSearchCV try every combination in the grid.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Start from a KNN model with no hyperparameters set; the search fills them in.
grid_search_knn = KNeighborsClassifier()

# Candidate values: neighbour counts 265, 315, ..., 565 (steps of 50) and both
# weighting schemes.
params = {
    'n_neighbors': list(range(265, 566, 50)),
    'weights': ['uniform', 'distance'],
}

# Run the search on the SMOTE-resampled training data.
grid_search_smote = GridSearchCV(
    estimator=grid_search_knn,
    param_grid=params,
)
grid_search_smote.fit(x_train_resampled, y_train_resampled)

# Keep the winning hyperparameters and the corresponding fitted model.
best_params = grid_search_smote.best_params_
best_model = grid_search_smote.best_estimator_

print(f"Best parameters: {best_params}")

# Predict on the held-out test set with the best model.
y_pred = best_model.predict(x_test)

# Compute the evaluation metrics, then print each under its heading.
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

for heading, result in (
    ("accuracy", accuracy),
    ("confusion matrix", confusion_mat),
    ("classification report", classification_rep),
):
    print(heading)
    print(result)


In [None]:
# Persist the tuned model to Google Drive so it can be reused outside this notebook.
import pickle

from google.colab import drive

# Make Google Drive available under /content/drive (Colab asks for authorization).
drive.mount('/content/drive')

# Serialize the best model from the grid search into a pickle file on Drive.
model_path = '/content/drive/MyDrive/model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)