In [4]:
import pandas as pd

# Location of the cleaned BRFSS 2015 diabetes health-indicators CSV
# (a relative path, so it is resolved against the current working directory).
file_path = 'cleaned_diabetes_012_health_indicators_BRFSS2015.csv'

# Read the whole file into a DataFrame.
data = pd.read_csv(file_path)

# Tally the rows per value of the 'Diabetes_012' target column; a heavily
# skewed tally means the classes are imbalanced.
class_distribution = data.loc[:, 'Diabetes_012'].value_counts()

# Bare last expression so the notebook displays the counts.
class_distribution


Diabetes_012
0.0    190055
2.0     35097
1.0      4629
Name: count, dtype: int64

In [5]:
# Build a binary target: code 2.0 is folded into 1.0, so 'Diabetes_01' is 1.0
# for the merged diabetes/prediabetes group and 0.0 for no diabetes.
# The original three-level 'Diabetes_012' column is left as it was.
data['Diabetes_01'] = data['Diabetes_012'].replace(to_replace=2.0, value=1.0)

# Recount the rows per class on the merged target.
new_class_distribution = data.loc[:, 'Diabetes_01'].value_counts()

# Bare last expression so the notebook displays the counts.
new_class_distribution

Diabetes_01
0.0    190055
1.0     39726
Name: count, dtype: int64

In [7]:
from imblearn.over_sampling import RandomOverSampler

# Split the frame into predictors (X) and the binary target (y). Both label
# columns are removed from X so neither version of the target is a feature.
X = data.drop(['Diabetes_012', 'Diabetes_01'], axis=1)
y = data.loc[:, 'Diabetes_01']

# Randomly oversample so the classes end up with equal row counts; the fixed
# seed keeps the draw reproducible.
# NOTE(review): this resamples the full dataset. If a train/test split is made
# from the result later, copies of the same row may land on both sides of the
# split -- confirm how the saved file is used downstream.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Print the per-class row counts of the resampled target.
resampled_class_distribution = pd.Series(y_resampled).value_counts()
print(resampled_class_distribution)

# Reassemble the resampled features and target into a single DataFrame.
resampled_data = pd.DataFrame(X_resampled, columns=X.columns).assign(
    Diabetes_01=y_resampled
)

# The export stays commented out because the CSV has already been written.
#resampled_data.to_csv('overesampled_diabetes_data.csv', index=False)

Diabetes_01
0.0    190055
1.0    190055
Name: count, dtype: int64
