In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from datetime import datetime, timedelta

In [3]:
# Load data from CSV file
data = pd.read_csv('FedCycleData071012_edited.csv')

print(data.head())

   LengthofCycle EstimatedDayofOvulation LengthofLutealPhase LengthofMenses  \
0             29                      17                  12              2   
1             27                      15                  12              2   
2             29                      15                  14              2   
3             27                      15                  12              2   
4             28                      16                  12              2   

  MensesScoreDayOne MensesScoreDayTwo MensesScoreDayThree MensesScoreDayFour  \
0                 3                 3                   2                  1   
1                 3                 3                   2                  1   
2                 3                 3                   2                  1   
3                 3                 3                   3                  2   
4                 3                 3                   2                  2   

  MensesScoreDayFive MensesScoreDaySix Mense

In [4]:
# Convert all columns to numeric where applicable
data = data.apply(pd.to_numeric, errors='coerce')
# Verify the conversion by checking the data types
print(data.dtypes)

LengthofCycle                int64
EstimatedDayofOvulation    float64
LengthofLutealPhase        float64
LengthofMenses             float64
MensesScoreDayOne          float64
MensesScoreDayTwo          float64
MensesScoreDayThree        float64
MensesScoreDayFour         float64
MensesScoreDayFive         float64
MensesScoreDaySix          float64
MensesScoreDaySeven        float64
MensesScoreDayEight        float64
MensesScoreDayNine         float64
MensesScoreDayTen          float64
MensesScoreDay11           float64
MensesScoreDay12           float64
MensesScoreDay13           float64
MensesScoreDay14           float64
MensesScoreDay15           float64
TotalMensesScore           float64
dtype: object


In [5]:
# Assume today's date for the entire dataset
data['current_date'] = pd.Timestamp(datetime.today())

# Assume last period start date is a random value between 0 and 28 days ago for everyone
np.random.seed(42)  # For reproducibility
data['last_period_start_date'] = data['current_date'] - pd.to_timedelta(np.random.randint(0, 30, size=len(data)), unit='D')

# Convert date columns to datetime
data['current_date'] = pd.to_datetime(data['current_date'])
data['last_period_start_date'] = pd.to_datetime(data['last_period_start_date'])

# Calculate the cycle day using menstrual_length
data['cycle_day'] = (data['current_date'] - data['last_period_start_date']).dt.days

In [6]:
# Label the phases based on cycle day and duration
def label_phase(row):
    if 0 <= row['cycle_day'] < row['LengthofMenses']:
        return 'Menstrual'
    elif row['LengthofMenses'] <= row['cycle_day'] < (row['LengthofCycle'] - 14):
        return 'Follicular'
    elif (row['cycle_day'] == row['EstimatedDayofOvulation']):
        # row['LengthofCycle'] - 14) <= row['cycle_day'] < (row['LengthofCycle'] - 15)
        return 'Ovulation'
    else:
        return 'Luteal'

# Apply the label_phase function to create the 'phase' column
data['phase'] = data.apply(label_phase, axis=1)

print(data['cycle_day'])

phase_counts = data['phase'].value_counts()
print(phase_counts)

print(data.head())

0        6
1       19
2       28
3       14
4       10
        ..
1660    27
1661    14
1662     7
1663     1
1664     2
Name: cycle_day, Length: 1665, dtype: int64
phase
Luteal        805
Follicular    499
Menstrual     324
Ovulation      37
Name: count, dtype: int64
   LengthofCycle  EstimatedDayofOvulation  LengthofLutealPhase  \
0             29                     17.0                 12.0   
1             27                     15.0                 12.0   
2             29                     15.0                 14.0   
3             27                     15.0                 12.0   
4             28                     16.0                 12.0   

   LengthofMenses  MensesScoreDayOne  MensesScoreDayTwo  MensesScoreDayThree  \
0             2.0                3.0                3.0                  2.0   
1             2.0                3.0                3.0                  2.0   
2             2.0                3.0                3.0                  2.0   
3             

In [7]:
print(data.columns)

Index(['LengthofCycle', 'EstimatedDayofOvulation', 'LengthofLutealPhase',
       'LengthofMenses', 'MensesScoreDayOne', 'MensesScoreDayTwo',
       'MensesScoreDayThree', 'MensesScoreDayFour', 'MensesScoreDayFive',
       'MensesScoreDaySix', 'MensesScoreDaySeven', 'MensesScoreDayEight',
       'MensesScoreDayNine', 'MensesScoreDayTen', 'MensesScoreDay11',
       'MensesScoreDay12', 'MensesScoreDay13', 'MensesScoreDay14',
       'MensesScoreDay15', 'TotalMensesScore', 'current_date',
       'last_period_start_date', 'cycle_day', 'phase'],
      dtype='object')


In [8]:
# Prepare features and labels
X = data.drop(['phase', 'MensesScoreDayOne', 'MensesScoreDayTwo', 
               'MensesScoreDayThree', 'MensesScoreDayFour', 'MensesScoreDayFive', 
               'MensesScoreDaySix', 'MensesScoreDaySeven', 'MensesScoreDayEight', 
               'MensesScoreDayNine', 'MensesScoreDayTen', 'MensesScoreDay11', 
               'MensesScoreDay12', 'MensesScoreDay13', 'MensesScoreDay14', 'MensesScoreDay15',
               'TotalMensesScore', 'LengthofLutealPhase'], axis=1)
y = data['phase']

In [14]:
print(X.columns)

Index(['LengthofCycle', 'EstimatedDayofOvulation', 'LengthofMenses',
       'current_date', 'last_period_start_date', 'cycle_day'],
      dtype='object')


In [9]:
# Check data types and fix if necessary
print(X.dtypes)
X = X.apply(pd.to_numeric, errors='coerce')  # Convert columns to numeric, forcing errors to NaN

# Drop rows with NaN values if any
X = X.dropna()
y = y[X.index]  # Ensure y matches the indices of X

LengthofCycle                       int64
EstimatedDayofOvulation           float64
LengthofMenses                    float64
current_date               datetime64[us]
last_period_start_date     datetime64[ns]
cycle_day                           int64
dtype: object


In [10]:
print(X)

      LengthofCycle  EstimatedDayofOvulation  LengthofMenses  \
0                29                     17.0             2.0   
1                27                     15.0             2.0   
2                29                     15.0             2.0   
3                27                     15.0             2.0   
4                28                     16.0             2.0   
...             ...                      ...             ...   
1656             29                     17.0            10.0   
1657             30                     19.0            10.0   
1658             30                     19.0            10.0   
1659             32                     19.0            11.0   
1660             29                     19.0            15.0   

          current_date  last_period_start_date  cycle_day  
0     1725103722208017     1724585322208017000          6  
1     1725103722208017     1723462122208017000         19  
2     1725103722208017     1722684522208017000     

In [11]:
# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)

# Evaluate the model
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

  Follicular       0.96      0.98      0.97       128
      Luteal       0.96      0.95      0.95       227
   Menstrual       1.00      1.00      1.00        87
   Ovulation       0.46      0.50      0.48        12

    accuracy                           0.95       454
   macro avg       0.85      0.86      0.85       454
weighted avg       0.95      0.95      0.95       454



In [12]:
# Assuming 'model' and 'X' are already defined

# Create new data
new_data = pd.DataFrame({
    'LengthofCycle': [28],
    'LengthofMenses': [6],
    'current_date': [pd.Timestamp(datetime.today())],
    'last_period_start_date': [pd.Timestamp(datetime.today() - timedelta(days=0))]
})

# Feature engineering for new data
new_data['cycle_day'] = (new_data['current_date'] - new_data['last_period_start_date']).dt.days

# Drop original DateTime columns as they are not needed for prediction
new_data = new_data.drop(columns=['current_date', 'last_period_start_date'])

# Ensure the new data has the same columns as the training data
new_data = new_data.reindex(columns=X.columns, fill_value=0)

# Predict the phase
predicted_phase = model.predict(new_data)
print(f'Predicted Phase: {predicted_phase}')


# Recommend food based on the phase
food_recommendations = {
    'Menstrual': ['Iron-rich foods', 'Hydration'],
    'Follicular': ['Folate-rich foods', 'Protein'],
    'Ovulation': ['Anti-inflammatory foods', 'Healthy fats'],
    'Luteal': ['Magnesium-rich foods', 'Complex carbs']
}

print(f'Recommended Foods: {food_recommendations[predicted_phase[0]]}')

Predicted Phase: ['Luteal']
Recommended Foods: ['Magnesium-rich foods', 'Complex carbs']
