# **Data Preprocessing & Encoding**

In [6]:
import pandas as pd
from google.colab import files
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
# Load your dataset
file_path = '/content/diabetes_prediction_dataset.xlsx'
df = pd.read_excel(file_path)

#gender_mapping = {'Male': 0, 'Female': 1}
#df['gender'] = df['gender'].map(gender_mapping)

# Apply one hot encoding to gender column
encoder= OneHotEncoder(sparse_output=False)
gender_encoded = encoder.fit_transform(df[['gender']])
gender_encoded_df= pd.DataFrame(gender_encoded, columns=encoder.get_feature_names_out(['gender']))
df =pd.concat([df.drop('gender',axis=1),gender_encoded_df],axis=1)

# Separate the rows with 'No Info' in smoking history
no_info_rows = df[df['smoking_history'].isnull()]
known_rows = df[~df['smoking_history'].isnull()]

# Impute missing values for the entire dataset
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df.drop('smoking_history', axis=1)), columns=df.drop('smoking_history', axis=1).columns)

# Re-attach the smoking history column
df_imputed['smoking_history'] = df['smoking_history']

# Separate the rows again after imputation
no_info_rows = df_imputed[df_imputed['smoking_history'] == 'No Info']
known_rows = df_imputed[df_imputed['smoking_history'] != 'No Info']

# Drop any remaining NaN values in smoking_history column
known_rows = known_rows.dropna(subset=['smoking_history'])

# Prepare the data for KNN
X = known_rows.drop('smoking_history', axis=1)
y = known_rows['smoking_history']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Predict the smoking history for the 'No Info' rows
no_info_X = no_info_rows.drop('smoking_history', axis=1)
no_info_X_scaled = scaler.transform(no_info_X)
no_info_predictions = knn.predict(no_info_X_scaled)

# Assign the predicted values to the 'No Info' rows
no_info_rows['smoking_history'] = no_info_predictions

# Combine all the rows back into a single DataFrame
processed_df = pd.concat([known_rows, no_info_rows])

# Apply one-hot encoding to the smoking_history column
encoder = OneHotEncoder(sparse_output=False)
smoking_history_encoded = encoder.fit_transform(processed_df[['smoking_history']])



# Create a DataFrame with the encoded smoking history
smoking_history_encoded_df = pd.DataFrame(smoking_history_encoded, columns=encoder.get_feature_names_out(['smoking_history']))

# Combine the encoded columns with the original DataFrame (excluding the original smoking_history column)
final_df =pd.concat([processed_df.drop('smoking_history', axis=1), smoking_history_encoded_df], axis=1)

# Step 4: Save the processed DataFrame to a new Excel file
processed_file_name = 'processed_diabetes_data.xlsx'
final_df.to_excel(processed_file_name, index=False)

print(final_df.head(10))
#  Step 5: Download the processed file
# files.download(processed_file_name)

print("Data preprocessing and one-hot encoding completed and saved to", processed_file_name)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_info_rows['smoking_history'] = no_info_predictions


     age  hypertension  heart_disease    bmi  HbA1c_level  \
0   80.0           0.0            1.0  25.19          6.6   
2   28.0           0.0            0.0  27.32          5.7   
3   36.0           0.0            0.0  23.45          5.0   
4   76.0           1.0            1.0  20.14          4.8   
5   20.0           0.0            0.0  27.32          6.6   
6   44.0           0.0            0.0  19.31          6.5   
8   42.0           0.0            0.0  33.64          4.8   
9   32.0           0.0            0.0  27.32          5.0   
10  53.0           0.0            0.0  27.32          6.1   
11  54.0           0.0            0.0  54.70          6.0   

    blood_glucose_level  diabetes  gender_Female  gender_Male  gender_Other  \
0                 140.0       0.0            1.0          0.0           0.0   
2                 158.0       0.0            0.0          1.0           0.0   
3                 155.0       0.0            1.0          0.0           0.0   
4           

In [None]:
#Step 5: Download the processed file
files.download(processed_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Over Sampling Using Smote**

In [None]:
#Oversampling

import pandas as pd
from google.colab import files
from imblearn.over_sampling import SMOTE

# Load the data
file_path = '/content/drive/MyDrive/Diabetes Prediction/processed_diabetes_data.xlsx'
data = pd.read_excel(file_path)

# Assuming 'class' is the target column
X = data.drop('diabetes', axis=1)
y = data['diabetes']

# Apply SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine resampled X and y
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)
resampled_data.to_excel('oversampled_data.xlsx', index=False)
files.download('oversampled_data.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **OverSampling Using ADASYN**

In [None]:
import pandas as pd
from google.colab import files
from imblearn.over_sampling import ADASYN

# Load the data
file_path = '/content/processed_diabetes_data.xlsx'
data = pd.read_excel(file_path)

# Assuming 'diabetes' is the target column
X = data.drop('diabetes', axis=1)
y = data['diabetes']

# Apply ADASYN
adasyn = ADASYN()
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# Combine resampled X and y
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)
resampled_data.to_excel('adasyn_oversampled_data.xlsx', index=False)

# Download the oversampled data
files.download('adasyn_oversampled_data.xlsx')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Oversampling using Borderline-SMOTE**

In [None]:
import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
from google.colab import files

# Load the data
file_path = '/content/processed_diabetes_data.xlsx'
data = pd.read_excel(file_path)

# Assuming 'diabetes' is the target column
X = data.drop('diabetes', axis=1)
y = data['diabetes']

# Apply Borderline-SMOTE
borderline_smote = BorderlineSMOTE()
X_resampled, y_resampled = borderline_smote.fit_resample(X, y)

# Combine resampled X and y
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)

# Save the resampled data to an Excel file
resampled_data.to_excel('borderline_oversampled_data.xlsx', index=False)

# Download the file
files.download('borderline_oversampled_data.xlsx')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Undersampling**

In [None]:
# Undersampling
from imblearn.under_sampling import RandomUnderSampler

# Apply Random Undersampling
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(X, y)

# Combine resampled X and y
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)
resampled_data.to_excel('undersampled_data.xlsx', index=False)
files.download('undersampled_data.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>