In [None]:
%run preprocessing_3.ipynb

In [2]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from datetime import datetime, timedelta

# Feature matrix: every column except the two label columns and the raw date columns.
x = df.drop(
    columns=['phase', 'phase_encoded', 'current_date', 'last_period_start_date']
)

# Target: the encoded phase, selected by x's index so features and labels stay row-aligned.
y = df['phase_encoded'][x.index]

In [3]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the two parts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

In [4]:
# Baseline: XGBoost with default hyper-parameters, fitted on the training part of the hold-out split.
model_xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')
model_xgb.fit(x_train, y_train)

# Score the held-out rows; the report is labelled with the encoder's class names.
y_pred_xgb = model_xgb.predict(x_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
report_xgb = classification_report(
    y_test,
    y_pred_xgb,
    target_names=label_encoder.classes_,
)

print("XGBoost Classification Report:\n", report_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb}")

XGBoost Classification Report:
               precision    recall  f1-score   support

  Follicular       0.99      0.98      0.98       161
      Luteal       0.99      0.97      0.98       169
   Menstrual       0.99      0.99      0.99       104
   Ovulation       0.90      0.97      0.93        66

    accuracy                           0.98       500
   macro avg       0.97      0.98      0.97       500
weighted avg       0.98      0.98      0.98       500

XGBoost Accuracy: 0.976


In [5]:
# check for overfitting
# Compare accuracy on the rows the model was fitted on with accuracy on the
# held-out rows; a large gap between the two would point to overfitting.
train_accuracy = accuracy_score(y_train, model_xgb.predict(x_train))
test_accuracy = accuracy_score(y_test, model_xgb.predict(x_test))
print(f'Training Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Training Accuracy: 1.0
Test Accuracy: 0.976


In [7]:
# from xgboost import plot_importance

# # Plot feature importance
# plot_importance(model_xgb)
# plt.show()

# # Get feature importances
# importance = model_xgb.feature_importances_
# importance_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': importance})
# importance_df = importance_df.sort_values(by='Importance', ascending=False)
# print(importance_df)

In [8]:
# # dropping features with importance less than 0.03
# threshold = 0.03
# features_to_drop = importance_df[importance_df['Importance'] < threshold]['Feature'].tolist()
# print("Features to drop:", features_to_drop)

# # Dropping features from the training and test datasets
# x_train_reduced = x_train.drop(columns=features_to_drop)
# x_test_reduced = x_test.drop(columns=features_to_drop)

# # Train the model again with reduced features
# model_retrained = XGBClassifier()
# model_retrained.fit(x_train_reduced, y_train)

# # Predict and evaluate again
# y_pred_retrained = model_retrained.predict(x_test_reduced)

# print("Classification Report:\n", classification_report(y_test, y_pred_retrained))
# print(f"Accuracy: {accuracy_score(y_test, y_pred_retrained)}")

In [9]:
# # CLASS WEIGHTS METHOD
# # to increase frequency of ovulation
# # Compute class weights
# from sklearn.utils.class_weight import compute_class_weight
# class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weights_dict = dict(enumerate(class_weights))

# # Map sample weights to each class in y_train
# sample_weights = np.array([class_weights_dict[label] for label in y_train])

# # Initialize XGBoost model
# model_xgb_wts = XGBClassifier(random_state=42, eval_metric='mlogloss')

# # Fit the model with sample weights
# model_xgb_wts.fit(x_train, y_train, sample_weight=sample_weights)

# # Predict on the test data
# y_pred_xgb_wts = model_xgb_wts.predict(x_test)

# # Generate classification report and accuracy score
# report_xgb = classification_report(y_test, y_pred_xgb_wts, target_names=label_encoder.classes_)
# accuracy_xgb = accuracy_score(y_test, y_pred_xgb_wts)

# print("XGBoost Classification Report:\n", report_xgb)
# print(f"XGBoost Accuracy (With class weights): {accuracy_xgb}")

In [6]:
from sklearn.model_selection import cross_val_score

# Draw a separate 70/30 split with its own seed; it is independent of x_train/x_test from the earlier cell.
# NOTE(review): x_test_crossval / y_test_crossval are created here but not used in this cell.
x_train_crossval, x_test_crossval, y_train_crossval, y_test_crossval = train_test_split(x, y, test_size=0.3, random_state=42)

# Initialize the model
model_crossval = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Perform k-fold cross-validation
k = 5  # Choose the number of folds
# No `scoring=` is passed, so the estimator's default scorer decides what each fold reports.
cv_scores = cross_val_score(model_crossval, x_train_crossval, y_train_crossval, cv=k)

# Print the cross-validation scores
# NOTE(review): `np` is not imported in the cells shown here — presumably it is
# provided by the %run'd preprocessing notebook; confirm.
print(f"Cross-Validation Scores for {k} folds: {cv_scores}")
print(f"Mean CV Score: {np.mean(cv_scores)}")

Cross-Validation Scores for 5 folds: [0.96995708 0.95708155 0.98712446 0.96137339 0.98283262]
Mean CV Score: 0.9716738197424892


In [7]:
# SMOTE METHOD

from imblearn.over_sampling import SMOTE

# Oversample the training rows only; x_test / y_test are left as they are.
smote = SMOTE(random_state=41)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# NOTE: this refits the existing `model_xgb` object, so the baseline fit from the
# earlier cell is replaced by the SMOTE-trained one from here on.
model_xgb.fit(x_train_smote, y_train_smote)

# y_pred_xgb, report_xgb and accuracy_xgb are overwritten with the SMOTE results.
y_pred_xgb = model_xgb.predict(x_test)

report_xgb = classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Classification Report after SMOTE:\n", report_xgb)
print(f"XGBoost Accuracy after SMOTE: {accuracy_xgb}")

XGBoost Classification Report after SMOTE:
               precision    recall  f1-score   support

  Follicular       0.99      0.96      0.97       161
      Luteal       1.00      0.95      0.98       169
   Menstrual       0.99      1.00      1.00       104
   Ovulation       0.84      0.98      0.91        66

    accuracy                           0.97       500
   macro avg       0.96      0.98      0.96       500
weighted avg       0.97      0.97      0.97       500

XGBoost Accuracy after SMOTE: 0.97


In [8]:
# Stratified K-fold cross validation Method
from sklearn.model_selection import StratifiedKFold

# Set up cross-validation
# Five shuffled, stratified folds over the full x / y (not just the earlier training split).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store metrics
accuracies = []
reports = []

for train_index, test_index in skf.split(x, y):
    # NOTE(review): these assignments rebind the notebook-level x_train / x_test /
    # y_train / y_test. After the loop they hold the LAST fold's split, and the
    # cells that follow (grid search, final_model, best_model) train and evaluate
    # on that split rather than on the original 70/30 hold-out.
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE on the training set
    # (only the fold's training rows are resampled; its test rows are untouched)
    smote = SMOTE(random_state=42)
    x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

    # Train the model
    # The same `model_xgb` object is refitted every fold, so after the loop it
    # carries the fit from the last fold.
    model_xgb.fit(x_train_resampled, y_train_resampled)

    # Make predictions
    y_pred = model_xgb.predict(x_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

    accuracies.append(accuracy)
    reports.append(report)

# Print average accuracy and reports
# The average is the unweighted mean of the per-fold accuracies.
print(f"Average Accuracy: {sum(accuracies) / len(accuracies)}")
for i, report in enumerate(reports):
    print(f"\nClassification Report for Fold {i+1}:\n{report}")

Average Accuracy: 0.9819819819819819

Classification Report for Fold 1:
              precision    recall  f1-score   support

  Follicular       0.98      0.99      0.98        99
      Luteal       1.00      0.98      0.99       123
   Menstrual       1.00      0.98      0.99        62
   Ovulation       0.94      0.98      0.96        49

    accuracy                           0.98       333
   macro avg       0.98      0.98      0.98       333
weighted avg       0.99      0.98      0.99       333


Classification Report for Fold 2:
              precision    recall  f1-score   support

  Follicular       0.98      0.95      0.96        99
      Luteal       0.98      0.98      0.98       122
   Menstrual       0.97      0.98      0.98        62
   Ovulation       0.91      0.96      0.93        50

    accuracy                           0.97       333
   macro avg       0.96      0.97      0.96       333
weighted avg       0.97      0.97      0.97       333


Classification Report 

In [9]:
# HYPERPARAMETER TUNING: GRID SEARCH FOR THE BEST PARAMETERS
from sklearn.model_selection import GridSearchCV

# Candidate values: 2 x 3 x 3 = 18 combinations, each scored with 5-fold CV below.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'learning_rate': [0.01, 0.1, 1]
}
# NOTE(review): unlike the other cells, this XGBClassifier() is created without
# random_state or eval_metric.
grid_search = GridSearchCV(XGBClassifier(), param_grid, cv=5)
# x_train / y_train are whatever the preceding cells left bound to those names
# (the last stratified fold when the notebook is run top to bottom).
grid_search.fit(x_train, y_train)
print("Best parameters:", grid_search.best_params_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 100}


In [10]:
# Hard-coded copy of the best parameters printed by the grid search above.
# NOTE(review): this will not follow the search if it is re-run with a different
# result — `grid_search.best_params_` holds the live value.
best_params = {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 100}
final_model = XGBClassifier(**best_params, eval_metric='mlogloss')
final_model.fit(x_train, y_train)

In [11]:
# Score the tuned model on the held-out rows.
y_pred_final = final_model.predict(x_test)
accuracy_final = accuracy_score(y_test, y_pred_final)
report_final = classification_report(
    y_test,
    y_pred_final,
    target_names=label_encoder.classes_,
)

print("Final XGBoost Classification Report:\n", report_final)
print(f"Final XGBoost Accuracy: {accuracy_final}")

# Overfitting check: accuracy on the fitted rows versus the held-out rows.
# The held-out predictions computed above are reused instead of predicting again.
train_accuracy = accuracy_score(y_train, final_model.predict(x_train))
test_accuracy = accuracy_score(y_test, y_pred_final)
print(f'Training Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Final XGBoost Classification Report:
               precision    recall  f1-score   support

  Follicular       0.98      0.99      0.98        98
      Luteal       0.97      0.99      0.98       123
   Menstrual       1.00      1.00      1.00        63
   Ovulation       0.96      0.88      0.91        49

    accuracy                           0.98       333
   macro avg       0.98      0.96      0.97       333
weighted avg       0.98      0.98      0.98       333

Final XGBoost Accuracy: 0.975975975975976
Training Accuracy: 1.0
Test Accuracy: 0.975975975975976


In [12]:
from sklearn.model_selection import GridSearchCV

# Define your model with regularization parameters
# NOTE(review): this rebinds `model_xgb` to a brand-new estimator. It is only
# handed to GridSearchCV below and never fitted directly in this cell, so later
# code calling `model_xgb.predict(...)` should confirm it is actually fitted.
model_xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Define the parameter grid
# 4 x 4 x 3 x 3 = 144 candidates.
# NOTE(review): 'alpha' / 'lambda' look like XGBoost's native parameter names; the
# scikit-learn wrapper usually spells them reg_alpha / reg_lambda — confirm they
# are being applied as intended.
param_grid = {
    'alpha': [0, 0.1, 0.5, 1],   # L1 regularization
    'lambda': [0, 0.1, 0.5, 1],  # L2 regularization
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'n_estimators': [50, 100, 200]  # Number of trees
}

# Perform grid search to find the best parameters
# This reuses (and replaces) the name `grid_search` from the earlier tuning cell.
grid_search = GridSearchCV(estimator=model_xgb, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1)
grid_search.fit(x_train, y_train)

# Print the best parameters found
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best parameters found:  {'alpha': 0, 'lambda': 0, 'learning_rate': 0.1, 'n_estimators': 50}


In [13]:
# Take the estimator selected by the regularisation grid search and fit it.
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV refits best_estimator_ on the data passed to
# grid_search.fit by default (refit=True), so this extra fit looks redundant;
# it is kept so the cell behaves exactly as before.
best_model.fit(x_train, y_train)

# Predict and evaluate
y_pred = best_model.predict(x_test)

# Print the classification report
# target_names is passed so the rows show phase names (as in every other report
# in this notebook) instead of the encoded integers.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Overfitting check: accuracy on the fitted rows versus the held-out rows.
train_accuracy = accuracy_score(y_train, best_model.predict(x_train))
test_accuracy = accuracy_score(y_test, best_model.predict(x_test))
print(f'Training Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

              precision    recall  f1-score   support

           0       0.99      0.99      0.99        98
           1       0.97      0.98      0.98       123
           2       1.00      1.00      1.00        63
           3       0.94      0.90      0.92        49

    accuracy                           0.98       333
   macro avg       0.97      0.97      0.97       333
weighted avg       0.98      0.98      0.98       333

Training Accuracy: 1.0
Test Accuracy: 0.975975975975976


In [None]:
from sklearn.linear_model import LogisticRegression

# Experimental "two model" approach: one model for the Ovulation rows and one
# for all other rows, each trained and tested on its own slice of df.
#
# NOTE(review): two problems to resolve before trusting this cell:
#   1. y_minority contains only the label 3 (the rows are filtered on
#      phase_encoded == 3), so LogisticRegression is fitted on a single class —
#      scikit-learn normally rejects that with a ValueError; confirm.
#   2. Test rows are sent to a model according to their TRUE label, which is not
#      available at prediction time, so the combined report cannot reflect
#      real-world performance.

# Separate the minority class (Ovulation) from the rest
# (3 is assumed to be the encoded value for Ovulation — verify against label_encoder.classes_)
minority_class_data = df[df['phase_encoded'] == 3]
majority_class_data = df[df['phase_encoded'] != 3]

# Split the data into features and labels
# Same columns are dropped as for the main feature matrix `x`.
x_minority = minority_class_data.drop(['phase', 'phase_encoded', 'current_date', 'last_period_start_date'], axis=1)
y_minority = minority_class_data['phase_encoded']

x_majority = majority_class_data.drop(['phase', 'phase_encoded', 'current_date', 'last_period_start_date'], axis=1)
y_majority = majority_class_data['phase_encoded']

# Split the majority class data into training and testing sets
x_train_maj, x_test_maj, y_train_maj, y_test_maj = train_test_split(
    x_majority, y_majority, test_size=0.3, random_state=42
)

# Train the XGBoost model for majority classes
# This model never sees label 3, so it cannot predict Ovulation.
xgb_model = XGBClassifier()
xgb_model.fit(x_train_maj, y_train_maj)

# Split the minority class data into training and testing sets
x_train_min, x_test_min, y_train_min, y_test_min = train_test_split(
    x_minority, y_minority, test_size=0.3, random_state=42
)

# Train a Logistic Regression model for the minority class (Ovulation)
# See review note 1 above: y_train_min holds a single class.
logistic_model = LogisticRegression()
logistic_model.fit(x_train_min, y_train_min)

# Predict on the majority test set using XGBoost
majority_predictions = xgb_model.predict(x_test_maj)

# Predict on the minority test set using Logistic Regression
minority_predictions = logistic_model.predict(x_test_min)

# Combine predictions
# Create an array for final predictions
# Ensure they are in the correct order based on the test data
# (majority rows first, then minority rows — the same order as y_test_combined below)
final_predictions = np.concatenate([majority_predictions, minority_predictions])

# Create an array for actual values for the combined dataset
y_test_combined = np.concatenate([y_test_maj, y_test_min])

# Print classification report
# No target_names are passed, so classes are shown as encoded integers.
print(classification_report(y_test_combined, final_predictions))


In [None]:
# Show the feature columns the models were trained on; the inference cells below
# reindex new data to this column order.
print(x.columns)

In [None]:
# Create new data
# Take "now" once so both dates share the same time of day. Calling
# datetime.today() twice can leave the difference a few microseconds short of
# four whole days, which `.dt.days` would floor to 3.
today = pd.Timestamp(datetime.today())
new_data = pd.DataFrame({
    'LengthofCycle': [34],
    'LengthofMenses': [5],
    'current_date': [today],
    'last_period_start_date': [today - timedelta(days=4)]
})

# Feature engineering for new data
# Days since the last period started, wrapped into the cycle length.
new_data['cycle_day'] = (new_data['current_date'] - new_data['last_period_start_date']).dt.days % new_data['LengthofCycle']

# Drop original DateTime columns as they are not needed for prediction
new_data = new_data.drop(columns=['current_date', 'last_period_start_date'])

# Ensure the new data has the same columns as the training data
# (any training column missing from new_data is filled with 0)
new_data = new_data.reindex(columns=x.columns, fill_value=0)

# Choose which fitted estimator to use for inference by changing this one
# assignment. `final_model` is the tuned model fitted after the stratified
# k-fold cell; swap in another fitted model (e.g. the plain XGBoost one) here.
model = final_model

# Predict the phase with XGBoost
# Use the estimator selected above. The original cell called `model_xgb`
# directly, which ignored this selection.
predicted_phase_xgb = model.predict(new_data)
predicted_phase_label = label_encoder.inverse_transform(predicted_phase_xgb)
print(f'Predicted Phase with XGBoost: {predicted_phase_label[0]}')

# Recommend food based on the phase
food_recommendations = {
    'Menstrual': ['Iron-rich foods', 'Hydration'],
    'Follicular': ['Folate-rich foods', 'Protein'],
    'Ovulation': ['Anti-inflammatory foods', 'Healthy fats'],
    'Luteal': ['Magnesium-rich foods', 'Complex carbs']
}

print(f'Recommended Foods: {food_recommendations[predicted_phase_label[0]]}')

In [19]:
# Define function to manually assign phases and visualize predictions
def manual_phase_predictions_and_visualization(cycle_length, menses_length):
    """
    Label every day of one cycle with a rule-based phase, plot it, and return the table.

    Days are 0-indexed. A day before ``menses_length`` is 'Menstrual'; a later day
    before ``cycle_length - 14`` is 'Follicular'; day ``cycle_length - 14`` itself
    is 'Ovulation'; every remaining day is 'Luteal'.

    Returns a DataFrame with the columns 'Day' and 'Predicted Phase'. The plot is
    shown and the table printed as side effects.
    """
    ovulation_day = cycle_length - 14

    def phase_for(day):
        # Rules are checked in order; the first match wins.
        if 0 <= day < menses_length:
            return 'Menstrual'
        if menses_length <= day < ovulation_day:
            return 'Follicular'
        if ovulation_day <= day < (cycle_length - 13):
            return 'Ovulation'
        return 'Luteal'

    # Step 1: one rule-based label per day of the cycle
    phases = [phase_for(day) for day in range(cycle_length)]

    # Step 2: tabulate and plot the labels
    predictions_df_manual = pd.DataFrame({
        'Day': np.arange(cycle_length),
        'Predicted Phase': phases
    })
    plt.figure(figsize=(12, 6))
    plt.plot(
        predictions_df_manual['Day'],
        predictions_df_manual['Predicted Phase'],
        marker='o', linestyle='-', markersize=8, color='b',
    )
    plt.title('Predicted Phases Across the Menstrual Cycle')
    plt.xlabel('Day of Cycle')
    plt.ylabel('Predicted Phase')
    plt.xticks(np.arange(0, cycle_length, 1))
    plt.yticks(sorted(predictions_df_manual['Predicted Phase'].unique()))
    plt.grid()
    plt.show()

    # Print the table for reference as well
    print(predictions_df_manual)

    return predictions_df_manual

In [20]:
# Create new data
# Take "now" once so both dates share the same time of day. Calling
# datetime.today() twice can leave the difference a few microseconds short of
# four whole days, which `.dt.days` would floor to 3.
today = pd.Timestamp(datetime.today())
new_data = pd.DataFrame({
    'LengthofCycle': [34],
    'LengthofMenses': [5],
    'current_date': [today],
    'last_period_start_date': [today - timedelta(days=4)]
})

# Feature engineering for new data
# Days since the last period started, wrapped into the cycle length.
new_data['cycle_day'] = (new_data['current_date'] - new_data['last_period_start_date']).dt.days % new_data['LengthofCycle']

# Drop original DateTime columns as they are not needed for prediction
new_data = new_data.drop(columns=['current_date', 'last_period_start_date'])

# Get length_of_cycle and length_of_menses for the next code block
length_of_cycle = new_data['LengthofCycle'].values[0]
length_of_menses = new_data['LengthofMenses'].values[0]

In [None]:
# Call the manual phase prediction and visualization function
# Uses the cycle and menses lengths extracted in the previous cell. The returned
# table is the cell's last expression, so it is also rendered as the cell output.
manual_phase_predictions_and_visualization(length_of_cycle, length_of_menses)

In [None]:
# Nutrition guidance per cycle phase. Top-level keys are lower-case phase names.
# Each phase maps the three diet keys ("veg", "non_veg", "vegan") to a list of
# suggested foods, plus a "hydration" tip (string) and a "foods_to_avoid" list.
menstrual_tips = {
    "menstrual": {
        "veg": ["Spinach", "Broccoli", "Sweet Potatoes", "Beetroot"],
        "non_veg": ["Chicken", "Salmon", "Eggs"],
        "vegan": ["Kale", "Tofu", "Quinoa", "Avocado"],
        "hydration": "Stay hydrated by drinking plenty of water and herbal teas like ginger or chamomile to ease cramps.",
        "foods_to_avoid": ["Caffeine", "Processed foods", "Sugary snacks", "Salty foods", "Red Meat"]
    },
    "follicular": {
        "veg": ["Lentils", "Zucchini", "Bell Peppers", "Pumpkin Seeds"],
        "non_veg": ["Turkey", "Tuna", "Lean Beef"],
        "vegan": ["Chia Seeds", "Oats", "Flaxseeds", "Almonds"],
        "hydration": "Focus on hydrating with electrolyte-rich drinks such as coconut water and green juices.",
        "foods_to_avoid": ["Refined carbs", "Excessive sugar", "Heavy fats"]
    },
    "ovulation": {
        "veg": ["Asparagus", "Brussels Sprouts", "Peas"],
        "non_veg": ["Chicken", "Shrimp", "Eggs"],
        "vegan": ["Hemp Seeds", "Tempeh", "Chickpeas"],
        "hydration": "Drink plenty of water and consider adding fruit-infused water to boost hydration during this phase.",
        "foods_to_avoid": ["Processed foods", "Fried foods", "Caffeine"]
    },
    "luteal": {
        "veg": ["Carrots", "Sweet Potatoes", "Cauliflower"],
        "non_veg": ["Lamb", "Salmon", "Turkey"],
        "vegan": ["Brown Rice", "Walnuts", "Chia Seeds"],
        "hydration": "Herbal teas like peppermint or chamomile can help reduce bloating. Continue with adequate water intake.",
        "foods_to_avoid": ["Excess salt", "Alcohol", "Caffeine", "Greasy foods"]
    }
}

# The only keys of a phase entry that name a diet. "hydration" and
# "foods_to_avoid" live in the same dict but are not diets.
_DIET_KEYS = ("veg", "non_veg", "vegan")


def get_tips(phase, diet):
    """Look up the food guidance for a phase and diet.

    Parameters:
        phase: lower-case phase name, i.e. a key of ``menstrual_tips``.
        diet: one of "veg", "non_veg" or "vegan".

    Returns:
        A ``(recommendations, hydration, foods_to_avoid)`` tuple, or
        ``(None, None, None)`` when the phase is unknown or ``diet`` is not a
        diet key. "hydration" and "foods_to_avoid" are rejected as diets even
        though they are keys of each phase entry.
    """
    phase_tips = menstrual_tips.get(phase)
    if phase_tips is None or diet not in _DIET_KEYS:
        return None, None, None
    return phase_tips[diet], phase_tips["hydration"], phase_tips["foods_to_avoid"]

def get_diet_from_choice(choice):
    """Translate a numeric menu choice into a diet key.

    1 -> "non_veg", 2 -> "veg", 3 -> "vegan"; any other value gives None.
    """
    menu = ((1, "non_veg"), (2, "veg"), (3, "vegan"))
    for option, diet_key in menu:
        if choice == option:
            return diet_key
    return None

def predict_phase(cycle_length, menses_length, model=None):
    """Use the XGBoost model to predict the phase for each day of the cycle.

    Parameters:
        cycle_length: number of days in the cycle; one row is built for each
            0-indexed cycle day ``0 .. cycle_length - 1``.
        menses_length: number of menstruation days, repeated on every row.
        model: optional fitted estimator with a ``predict`` method. Defaults to
            the notebook-level ``model_xgb``.

    Returns:
        Array of phase labels with one entry per cycle day, so it can be indexed
        by cycle day and plotted against ``np.arange(cycle_length)``.
    """
    # NOTE(review): an earlier cell rebinds `model_xgb` to a new XGBClassifier that
    # is only handed to GridSearchCV; confirm it is fitted before relying on the
    # default, or pass a fitted model explicitly.
    estimator = model_xgb if model is None else model

    # One row per day of the cycle. The previous version built a single row (for
    # the day `menses_length` days after the period start), so callers received
    # a length-1 array instead of a prediction for each day.
    new_data = pd.DataFrame({
        'LengthofCycle': cycle_length,
        'LengthofMenses': menses_length,
        'cycle_day': np.arange(cycle_length),
    })

    # Ensure the new data has the same columns as the training data
    new_data = new_data.reindex(columns=['LengthofCycle', 'LengthofMenses', 'cycle_day'], fill_value=0)

    # Predict phases with the model and map the encoded classes back to names
    predicted_phase_xgb = estimator.predict(new_data)
    predicted_phase_label = label_encoder.inverse_transform(predicted_phase_xgb)

    return predicted_phase_label

def visualize_predictions(predictions, cycle_length):
    """Plot one predicted phase per cycle day and return the plotted table.

    ``predictions`` must supply one label for each of the ``cycle_length`` days
    (0-indexed). The returned DataFrame has the columns 'Day' and
    'Predicted Phase'.
    """
    days = np.arange(cycle_length)
    predictions_df = pd.DataFrame({
        'Day': days,
        'Predicted Phase': predictions
    })

    # Draw on an explicit Axes instead of the implicit pyplot state.
    _, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        predictions_df['Day'],
        predictions_df['Predicted Phase'],
        marker='o', linestyle='-', markersize=8, color='b',
    )
    ax.set_title('Predicted Phases Across the Menstrual Cycle')
    ax.set_xlabel('Day of Cycle')
    ax.set_ylabel('Predicted Phase')
    ax.set_xticks(np.arange(0, cycle_length, 1))
    ax.set_yticks(sorted(predictions_df['Predicted Phase'].unique()))
    ax.grid()
    plt.show()

    return predictions_df

def combined_prediction_and_recommendation():
    """Interactive flow: ask for cycle details, predict phases, and print diet tips.

    Prompts for the cycle length and menses length, predicts and plots a phase
    for each day of the cycle, picks today's phase, then asks for a diet choice
    and prints the matching food, hydration and foods-to-avoid guidance.
    Non-numeric input, or a cycle length that is not positive, prints an
    "Invalid input" message. Returns None.
    """
    # Get user input for cycle length and menses length
    try:
        cycle_length = int(input("Enter your cycle length (in days): "))
        menses_length = int(input("Enter your menses length (in days): "))

        # A zero or negative cycle length has no days to predict and would break
        # the modulo below, so treat it like any other invalid number.
        if cycle_length <= 0:
            raise ValueError("cycle length must be a positive number of days")

        # Predict phases for the cycle using the model
        predictions = predict_phase(cycle_length, menses_length)

        # Visualize predictions
        visualize_predictions(predictions, cycle_length)

        # Get today's predicted phase
        # Read the clock once: with two separate datetime.today() calls the
        # difference can fall a few microseconds short of a whole day, and
        # `.days` then floors to one day too few (or to -1 on the 1st).
        # NOTE(review): this uses "days since the 1st of the current month" as the
        # cycle day, i.e. it assumes the cycle started on the 1st — confirm.
        today = datetime.today()
        current_day = (today - today.replace(day=1)).days % cycle_length
        predicted_phase = predictions[current_day].lower()

        print(f"\nToday's predicted phase is: {predicted_phase.capitalize()}")

        # Get diet choice from user
        print("\nChoose your diet:")
        print("1. Non-veg")
        print("2. Veg")
        print("3. Vegan")

        diet_choice = int(input("Enter your choice (1/2/3): "))
        diet = get_diet_from_choice(diet_choice)

        if not diet:
            print("Invalid choice! Please select 1 for Non-veg, 2 for Veg, or 3 for Vegan.")
            return

        recommendations, hydration, foods_to_avoid = get_tips(predicted_phase, diet)

        if not recommendations:
            print("Invalid diet choice for this phase.")
            return

        print(f"\nRecommended foods for the {predicted_phase} phase (Diet: {diet}):")
        for food in recommendations:
            print(f"- {food}")

        print(f"\nHydration tip for the {predicted_phase} phase:")
        print(hydration)

        print(f"\nFoods to avoid during the {predicted_phase} phase:")
        for food in foods_to_avoid:
            print(f"- {food}")
    except ValueError:
        print("Invalid input. Please enter a valid number.")

# Start the interactive flow; it blocks on input() prompts, so this cell needs a
# live kernel with someone to answer them.
combined_prediction_and_recommendation()