### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

### Load Data

In [2]:
data = pd.read_csv('FedCycleData.csv')

### Check dataset

In [3]:
data.shape

(1665, 80)

In [4]:
data =data.replace(' ', np.nan)

In [5]:
cols_with_missing_cols= []
cols_without_missing_values = []
for cols in data.columns:
    if data[cols].isnull().sum() == 0:
        cols_without_missing_values.append(cols)

    else:
        cols_with_missing_cols.append(cols)

print(f'Number of Columns without any missing values:  {len(cols_without_missing_values)}\n')
print(f'Number of Columns with missing values :  {len(cols_with_missing_cols)}')

Number of Columns without any missing values:  6

Number of Columns with missing values :  74


In [6]:
cols_without_missing_values

['ClientID',
 'CycleNumber',
 'Group',
 'CycleWithPeakorNot',
 'ReproductiveCategory',
 'LengthofCycle']

In [7]:
null = data.isnull().sum().sort_values(ascending=False)
null_per = ((data.isnull().sum()) / (data.shape[0])).sort_values(ascending=False)*100.
null_values = pd.DataFrame({
    "Column Name": null.index,
    "Total Number of Missing Values": null.values,
    "Missing Values in Percentage": null_per.values
})

In [8]:
remove_cols = null_values[null_values["Missing Values in Percentage"] >= 50]["Column Name"].tolist()
data = data.drop(columns=remove_cols)

In [9]:
data.shape

(1665, 24)

In [10]:
data.dtypes

ClientID                      object
CycleNumber                    int64
Group                          int64
CycleWithPeakorNot             int64
ReproductiveCategory           int64
LengthofCycle                  int64
EstimatedDayofOvulation       object
LengthofLutealPhase           object
FirstDayofHigh                object
TotalNumberofHighDays         object
TotalHighPostPeak             object
TotalNumberofPeakDays         object
TotalDaysofFertility          object
TotalFertilityFormula         object
LengthofMenses                object
MensesScoreDayOne             object
MensesScoreDayTwo             object
MensesScoreDayThree           object
MensesScoreDayFour            object
MensesScoreDayFive            object
TotalMensesScore              object
NumberofDaysofIntercourse     object
IntercourseInFertileWindow    object
UnusualBleeding               object
dtype: object

### Select Data to keep

In [11]:
# List of columns to keep
columns_to_keep = ['ReproductiveCategory', 'Group', 'CycleWithPeakorNot', 'LengthofCycle',
                   'EstimatedDayofOvulation', 'LengthofLutealPhase',
                   'LengthofMenses', 'TotalMensesScore', 'NumberofDaysofIntercourse',
                   'IntercourseInFertileWindow','UnusualBleeding']

In [12]:
data = data[columns_to_keep]

In [13]:
data.shape

(1665, 11)

### Convert data type to numeric

In [14]:
data.dtypes

ReproductiveCategory           int64
Group                          int64
CycleWithPeakorNot             int64
LengthofCycle                  int64
EstimatedDayofOvulation       object
LengthofLutealPhase           object
LengthofMenses                object
TotalMensesScore              object
NumberofDaysofIntercourse     object
IntercourseInFertileWindow    object
UnusualBleeding               object
dtype: object

In [15]:
categorical_columns = data.select_dtypes(include= object).columns

In [16]:
encoder = LabelEncoder()

for cols in categorical_columns:
        data[cols] = encoder.fit_transform(data[cols])

### Check Selected Data

In [17]:
for i in data.columns:
    if ( data[i].isnull().sum() ) > 100:
        data[i]= data[i].fillna(data[i].mean())

In [18]:
data.isnull().sum()

ReproductiveCategory          0
Group                         0
CycleWithPeakorNot            0
LengthofCycle                 0
EstimatedDayofOvulation       0
LengthofLutealPhase           0
LengthofMenses                0
TotalMensesScore              0
NumberofDaysofIntercourse     0
IntercourseInFertileWindow    0
UnusualBleeding               0
dtype: int64

### Identifying and Handling Outliers

In [19]:
outliers_percentages = []

for col in data.columns:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers_count = ((data[col] < lower_bound) | (data[col] > upper_bound)).sum()
    outliers_percentage = (outliers_count / len(data[col])) * 100

    outliers_percentages.append({'Column': col, 'Outlier Percentage': outliers_percentage})

outliers_data = pd.DataFrame(outliers_percentages)
outliers_data = outliers_data.sort_values(by='Outlier Percentage', ascending=False)

In [20]:
for col in data.columns:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Calculate outliers
    outliers = (data[col] < lower_bound) | (data[col] > upper_bound)
    outlier_percentage = outliers.mean() * 100


    data[col] = data[col].where(~outliers, np.mean(data[col]))

### Random Forest Regression Model

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd
import joblib

In [22]:
# Define features and targets
x = data.drop(['LengthofCycle', 'EstimatedDayofOvulation'], axis=1)
y = data[['LengthofCycle', 'EstimatedDayofOvulation']]

In [23]:
# Scale features
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [24]:
# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8, random_state=42)

### Model Initialization and Training

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [26]:
# Train MultiOutput Regressor with Random Forest
rf_model = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=49, criterion="squared_error", max_depth=67)
)
rf_model.fit(x_train, y_train)


In [27]:
# Predictions
y_pred = rf_model.predict(x_test)

### Model Prediction and Evaluation

In [28]:
# Metrics for LengthofCycle
mae_length = mean_absolute_error(y_test.iloc[:, 0], y_pred[:, 0])
rmse_length = np.sqrt(mean_squared_error(y_test.iloc[:, 0], y_pred[:, 0]))

print(f'Mean Absolute Error (Length of Cycle): {mae_length}')
print(f'Root Mean Squared Error (Length of Cycle): {rmse_length}')

Mean Absolute Error (Length of Cycle): 2.4512431458722137
Root Mean Squared Error (Length of Cycle): 3.10900431568746


In [29]:
# Metrics for EstimatedDayofOvulation
mae_ovulation = mean_absolute_error(y_test.iloc[:, 1], y_pred[:, 1])
rmse_ovulation = np.sqrt(mean_squared_error(y_test.iloc[:, 1], y_pred[:, 1]))

print(f'Mean Absolute Error (Estimated Day of Ovulation): {mae_ovulation}')
print(f'Root Mean Squared Error (Estimated Day of Ovulation): {rmse_ovulation}')

Mean Absolute Error (Estimated Day of Ovulation): 2.3639230988927626
Root Mean Squared Error (Estimated Day of Ovulation): 3.128102858409152


### Save Model and Scaler

In [30]:
# Save the model and scaler
joblib.dump(rf_model, 'trained_rf_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [None]:
# Prediction Function
def predict_cycle_length_and_ovulation():
    # Load the model and scaler
    model = joblib.load('trained_rf_model.pkl')
    scaler = joblib.load('scaler.pkl')

In [32]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

### Test Model

In [34]:
# Prediction Function
def predict_cycle_length_and_ovulation():
    # Load the model and scaler
    model = joblib.load('trained_rf_model.pkl')
    scaler = joblib.load('scaler.pkl')

    # Prompt user for input
    user_input = {
        'ReproductiveCategory': int(input("Enter Reproductive Category (0 for regular, 1 for irregular): ")),
        'Group': int(input("Enter Group (0 for control, 1 for experimental): ")),
        'CycleWithPeakorNot': int(input("Enter Cycle with peak (0 for No, 1 for Yes): ")),
        'LengthofLutealPhase': int(input("Enter Length of Luteal Phase (in days): ")),
        'LengthofMenses': int(input("Enter Length of Menses (in days): ")),
        'TotalMensesScore': int(input("Enter Total Menses Score: ")),
        'NumberofDaysofIntercourse': int(input("Enter Number of Days of Intercourse: ")),
        'IntercourseInFertileWindow': int(input("Enter Intercourse In Fertile Window (0 for No, 1 for Yes): ")),
        'UnusualBleeding': int(input("Enter Unusual Bleeding (0 for No, 1 for Yes): "))
    }

    # Convert to DataFrame
    input_data = pd.DataFrame([user_input])

    # Scale the input data
    input_data_scaled = scaler.transform(input_data)

    # Predict
    predictions = model.predict(input_data_scaled)
    predicted_length, predicted_ovulation = predictions[0]

    print(f'Predicted Menstrual Cycle Length: {round(predicted_length)} days')
    print(f'Predicted Estimated Day of Ovulation: {round(predicted_ovulation)}')

# Call the prediction function
predict_cycle_length_and_ovulation()

Predicted Menstrual Cycle Length: 29 days
Predicted Estimated Day of Ovulation: 2


In [35]:
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the model
model = joblib.load('trained_rf_model.pkl')

# Prompt user for input
reproductive_category = int(input("Enter Reproductive Category (0 for regular, 1 for irregular): "))
group = int(input("Enter Group (0 for control, 1 for experimental): "))
cycle_with_peak_or_not = int (input("Enter Cycle with peak (0 for No, 1 for Yes): "))
estimated_day_of_ovulation = int(input("Enter Estimated Day of Ovulation: "))
length_of_luteal_phase = int(input("Enter Length of Luteal Phase (in days): "))
length_of_menses = int(input("Enter Length of Menses (in days): "))
total_menses_score = int(input("Enter Total Menses Score: "))
number_of_days_of_intercourse = int(input("Enter Number of Days of Intercourse: "))
intercourse_in_fertile_window = int(input("Enter Intercourse In Fertile Window (0 for No, 1 for Yes): "))
unusual_bleeding = int(input("Enter Unusual Bleeding (0 for no, 1 for yes): "))

# Create DataFrame with user input
input_data = pd.DataFrame({
    'ReproductiveCategory': [reproductive_category],
    'Group': [group],
    'CycleWithPeakorNot': [cycle_with_peak_or_not],
    'EstimatedDayofOvulation': [estimated_day_of_ovulation],
    'LengthofLutealPhase': [length_of_luteal_phase],
    'LengthofMenses': [length_of_menses],
    'TotalMensesScore': [total_menses_score],
    'NumberofDaysofIntercourse': [number_of_days_of_intercourse],
    'IntercourseInFertileWindow': [intercourse_in_fertile_window],
    'UnusualBleeding': [unusual_bleeding]
})

# Prediction with the model
predicted_length = model.predict(input_data)
rounded_length = round(predicted_length.item())  # Round the prediction

print(f'Predicted Menstrual Cycle Length: {rounded_length} days')



Predicted Menstrual Cycle Length: 33 days


