In [7]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta

df_raw_cycle = pd.read_csv('cycle_data.csv')

df_raw_cycle.head()


Unnamed: 0,date,cycle_flg,black_flg
0,2015-07-08,1,0
1,2015-07-09,1,0
2,2015-07-10,1,0
3,2015-07-11,1,0
4,2015-07-12,1,0


In [13]:
df_raw_cycle['date'] = pd.to_datetime(df_raw_cycle['date'], errors='coerce')

# Add row number, day, and month columns
df_raw_cycle['row'] = np.arange(1, len(df_raw_cycle) + 1)
df_raw_cycle['day'] = df_raw_cycle['date'].dt.day
df_raw_cycle['month'] = df_raw_cycle['date'].dt.month

# Reorder columns
df_raw_cycle = df_raw_cycle[['row', 'date', 'day', 'month', 'cycle_flg', 'black_flg']]

df_raw_cycle.head()

Unnamed: 0,row,date,day,month,cycle_flg,black_flg
0,1,2015-07-08,8,7,1,0
1,2,2015-07-09,9,7,1,0
2,3,2015-07-10,10,7,1,0
3,4,2015-07-11,11,7,1,0
4,5,2015-07-12,12,7,1,0


In [16]:
# Initialize the transformed dataset
df_refine_cycle = []

# Iterate over the dataset to create the transformed rows
for i in range(len(df_raw_cycle) - 60):
    # Extract the 60 consecutive days of labels
    labels_60_days = df_raw_cycle['cycle_flg'].iloc[i:i + 60].tolist()
    
    # Extract the ending month
    ending_month = df_raw_cycle['date'].iloc[i + 59].month
    
    # Extract the label of the 61st day
    label_day_61 = df_raw_cycle['cycle_flg'].iloc[i + 60]
    
    # Create a row with the 60 labels, ending month, and day-61 label
    row = labels_60_days + [ending_month, label_day_61]
    df_refine_cycle.append(row)

# Convert the transformed data into a DataFrame
column_names = [f'Day_{i+1}' for i in range(60)] + ['EndingMonth', 'CycleDay_61']
df_refine_cycle = pd.DataFrame(df_refine_cycle, columns=column_names)

print(df_refine_cycle.head())

df_refine_cycle.to_csv('cycle_data_refine.csv', index=False)

   Day_1  Day_2  Day_3  Day_4  Day_5  Day_6  Day_7  Day_8  Day_9  Day_10  ...  \
0      1      1      1      1      1      1      1      0      0       0  ...   
1      1      1      1      1      1      1      0      0      0       0  ...   
2      1      1      1      1      1      0      0      0      0       0  ...   
3      1      1      1      1      0      0      0      0      0       0  ...   
4      1      1      1      0      0      0      0      0      0       0  ...   

   Day_53  Day_54  Day_55  Day_56  Day_57  Day_58  Day_59  Day_60  \
0       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       0   
2       0       0       0       0       0       0       0       0   
3       0       0       0       0       0       0       0       0   
4       0       0       0       0       0       0       0       0   

   EndingMonth  CycleDay_61  
0            9            0  
1            9            0  
2       

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load the transformed dataset
data = pd.read_csv('cycle_data_refine.csv')

# Split the data into input features (X) and target variable (y)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training, validation (cross-validation), and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)#, stratify=y)
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)#, stratify=y_temp)

# Standardize the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_cv = scaler.transform(X_cv)
X_test = scaler.transform(X_test)

# Define the neural network model
model = Sequential()
model.add(Dense(61, input_dim=61, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model using Adam optimizer
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_cv, y_cv))

# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Print the classification report
cr = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(cr)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7457 - loss: 0.4978 - val_accuracy: 0.9149 - val_loss: 0.3029
Epoch 2/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9075 - loss: 0.2677 - val_accuracy: 0.9357 - val_loss: 0.2149
Epoch 3/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9433 - loss: 0.1892 - val_accuracy: 0.9398 - val_loss: 0.1992
Epoch 4/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9445 - loss: 0.1718 - val_accuracy: 0.9440 - val_loss: 0.2032
Epoch 5/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9550 - loss: 0.1338 - val_accuracy: 0.9461 - val_loss: 0.2021
Epoch 6/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9420 - loss: 0.1543 - val_accuracy: 0.9461 - val_loss: 0.2067
Epoch 7/50
[1m71/71[0m [32m━━━━━━━━━━

In [30]:
data = pd.read_csv('cycle_data_refine.csv')
y = data.iloc[:, -1].values
# Create an empty array to store the predicted values
y_pred_rule_based = np.zeros_like(y)
i = 0
# Apply the rule to each row in the original dataset
while i < len(y)-5:
    if i >= 33 and y[i-28]==1 and y[i-29]==0 and all(element == 0 for element in y[i-20:i]):
        y_pred_rule_based[i] = 1
        y_pred_rule_based[i+1] = 1
        y_pred_rule_based[i+2] = 1
        y_pred_rule_based[i+3] = 1
        y_pred_rule_based[i+4] = 1   
        i+=4

    if y[i-1] and not(all(element == 1 for element in y[i-5:i])):
        y_pred_rule_based[i] = 1
 

    i+=1 

# Combine the arrays into a DataFrame
df = pd.DataFrame({
    'y': y,
    'y_pred_rule_based': y_pred_rule_based
})

df.to_csv('combined_output.csv', index=False)

In [31]:
y_pred_rule_based_train, y_pred_rule_based_temp = train_test_split(y_pred_rule_based, test_size=0.3, random_state=42)
y_pred_rule_based_cv, y_pred_rule_based_test = train_test_split(y_pred_rule_based_temp, test_size=0.5, random_state=42)

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred_rule_based = y_pred_rule_based_test

# Evaluate the rule-based model
accuracy_rule_based = accuracy_score(y_test, y_pred_rule_based)
precision_rule_based = precision_score(y_test, y_pred_rule_based)
recall_rule_based = recall_score(y_test, y_pred_rule_based)
f1_score_rule_based = f1_score(y_test, y_pred_rule_based)

# Print evaluation metrics for the rule-based model
print("Rule-Based Model Performance:")
print(f"Accuracy: {accuracy_rule_based:.4f}")
print(f"Precision: {precision_rule_based:.4f}")
print(f"Recall: {recall_rule_based:.4f}")
print(f"F1 Score: {f1_score_rule_based:.4f}")

# Evaluate the neural network model (already trained)
y_pred_nn = (y_pred_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
precision_nn = precision_score(y_test, y_pred_nn)
recall_nn = recall_score(y_test, y_pred_nn)
f1_score_nn = f1_score(y_test, y_pred_nn)

# Print evaluation metrics for the neural network model
print("\nNeural Network Model Performance:")
print(f"Accuracy: {accuracy_nn:.4f}")
print(f"Precision: {precision_nn:.4f}")
print(f"Recall: {recall_nn:.4f}")
print(f"F1 Score: {f1_score_nn:.4f}")

Rule-Based Model Performance:
Accuracy: 0.8734
Precision: 0.7327
Recall: 0.6852
F1 Score: 0.7081

Neural Network Model Performance:
Accuracy: 0.9232
Precision: 0.8257
Recall: 0.8333
F1 Score: 0.8295


In [36]:
#Check Overfitting
y_pred_prob = model.predict(X_train)
y_pred = (y_pred_prob > 0.5).astype(int)
y_pred_nn = (y_pred_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_train, y_pred_nn)
precision_nn = precision_score(y_train, y_pred_nn)
recall_nn = recall_score(y_train, y_pred_nn)
f1_score_nn = f1_score(y_train, y_pred_nn)

# Print evaluation metrics for the neural network model
print("\nNeural Network Model Performance on Train Set:")
print(f"Accuracy: {accuracy_nn:.4f}")
print(f"Precision: {precision_nn:.4f}")
print(f"Recall: {recall_nn:.4f}")
print(f"F1 Score: {f1_score_nn:.4f}")

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Neural Network Model Performance on Train Set:
Accuracy: 0.9924
Precision: 0.9901
Recall: 0.9765
F1 Score: 0.9832


In [None]:
# A common way to reduce overfitting is to simplify the model by using fewer parameters. 
# Maybe considering less than 60 past days is a good strategy.
# How much should we go in the past? 