In [8]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta

df_raw_cycle = pd.read_csv('cycle_data.csv')

df_raw_cycle.head()


Unnamed: 0,date,cycle_flg,black_flg
0,2015-07-08,1,0
1,2015-07-09,1,0
2,2015-07-10,1,0
3,2015-07-11,1,0
4,2015-07-12,1,0


In [9]:
df_raw_cycle['date'] = pd.to_datetime(df_raw_cycle['date'], errors='coerce')

# Add row number, day, and month columns
df_raw_cycle['row'] = np.arange(1, len(df_raw_cycle) + 1)
df_raw_cycle['day'] = df_raw_cycle['date'].dt.day
df_raw_cycle['month'] = df_raw_cycle['date'].dt.month

# Reorder columns
df_raw_cycle = df_raw_cycle[['row', 'date', 'day', 'month', 'cycle_flg', 'black_flg']]

df_raw_cycle.head()

Unnamed: 0,row,date,day,month,cycle_flg,black_flg
0,1,2015-07-08,8,7,1,0
1,2,2015-07-09,9,7,1,0
2,3,2015-07-10,10,7,1,0
3,4,2015-07-11,11,7,1,0
4,5,2015-07-12,12,7,1,0


In [11]:
# Initialize the transformed dataset
df_refine_cycle = []

# Iterate over the dataset to create the transformed rows
for i in range(len(df_raw_cycle) - 60):
    # Extract the 60 consecutive days of labels
    labels_60_days = df_raw_cycle['cycle_flg'].iloc[i:i + 60].tolist()
    
    # Extract the ending month
    ending_month = df_raw_cycle['date'].iloc[i + 59].month
    
    # Extract the label of the 61st day
    label_day_61 = df_raw_cycle['cycle_flg'].iloc[i + 60]
    
    # Create a row with the 60 labels, ending month, and day-61 label
    row = labels_60_days + [ending_month, label_day_61]
    df_refine_cycle.append(row)

# Convert the transformed data into a DataFrame
column_names = [f'Day_{i+1}' for i in range(60)] + ['EndingMonth', 'CycleDay_61']
df_refine_cycle = pd.DataFrame(df_refine_cycle, columns=column_names)

print(df_refine_cycle.head())

df_refine_cycle.to_csv('cycle_data_refine.csv', index=False)

   Day_1  Day_2  Day_3  Day_4  Day_5  Day_6  Day_7  Day_8  Day_9  Day_10  ...  \
0      1      1      1      1      1      1      1      0      0       0  ...   
1      1      1      1      1      1      1      0      0      0       0  ...   
2      1      1      1      1      1      0      0      0      0       0  ...   
3      1      1      1      1      0      0      0      0      0       0  ...   
4      1      1      1      0      0      0      0      0      0       0  ...   

   Day_53  Day_54  Day_55  Day_56  Day_57  Day_58  Day_59  Day_60  \
0       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       0   
2       0       0       0       0       0       0       0       0   
3       0       0       0       0       0       0       0       0   
4       0       0       0       0       0       0       0       0   

   EndingMonth  CycleDay_61  
0            9            0  
1            9            0  
2       

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# Load the transformed dataset
data = pd.read_csv('cycle_data_refine.csv')

# Split the data into input features (X) and target variable (y)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training, validation (cross-validation), and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)#, stratify=y)
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)#, stratify=y_temp)

# Standardize the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_cv = scaler.transform(X_cv)
X_test = scaler.transform(X_test)

# Define the neural network model
model = Sequential()
model.add(Dense(61, input_dim=61, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model using Adam optimizer
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) #stop training when the validation loss stops improving or starts to increase (prevents overfitting)
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_cv, y_cv), callbacks=[early_stopping])

# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Print the classification report
cr = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(cr)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6378 - loss: 0.6043 - val_accuracy: 0.8963 - val_loss: 0.3087
Epoch 2/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9226 - loss: 0.2329 - val_accuracy: 0.9336 - val_loss: 0.2227
Epoch 3/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9447 - loss: 0.1780 - val_accuracy: 0.9461 - val_loss: 0.2130
Epoch 4/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9439 - loss: 0.1709 - val_accuracy: 0.9481 - val_loss: 0.1997
Epoch 5/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9432 - loss: 0.1543 - val_accuracy: 0.9481 - val_loss: 0.2099
Epoch 6/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9537 - loss: 0.1392 - val_accuracy: 0.9398 - val_loss: 0.2176
Epoch 7/50
[1m71/71[0m [32m━━━━━━━━━━

In [25]:
data = pd.read_csv('cycle_data_refine.csv')
y = data.iloc[:, -1].values
# Create an empty array to store the predicted values
y_pred_rule_based = np.zeros_like(y)
i = 0
# Apply the rule to each row in the original dataset
while i < len(y):
    if y[i-1]==1 and y[i-2]==0:
        y_pred_rule_based[i] = 1
        if y[i]==1:
            y_pred_rule_based[i+1] = 1
        else:
            i+=2
        if y[i+1]==1:
            y_pred_rule_based[i+2] = 1
        else:
            i+=3
        if y[i+2]==1:
            y_pred_rule_based[i+3] = 1
        else:
            i+=4
        if y[i+3]==1:
            y_pred_rule_based[i+4] = 1
        else:
            i+=5
        if y[i+4]==1:
            y_pred_rule_based[i+5] = 1
        else:
            i+=6
 

    i+=1 
# Combine the arrays into a DataFrame
df = pd.DataFrame({
    'y': y,
    'y_pred_rule_based': y_pred_rule_based
})

df.to_csv('combined_output.csv', index=False)

In [14]:
y_pred_rule_based_train, y_pred_rule_based_temp = train_test_split(y_pred_rule_based, test_size=0.3, random_state=42)
y_pred_rule_based_cv, y_pred_rule_based_test = train_test_split(y_pred_rule_based_temp, test_size=0.5, random_state=42)

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred_rule_based = y_pred_rule_based_test

# Evaluate the rule-based model
accuracy_rule_based = accuracy_score(y_test, y_pred_rule_based)
precision_rule_based = precision_score(y_test, y_pred_rule_based)
recall_rule_based = recall_score(y_test, y_pred_rule_based)
f1_score_rule_based = f1_score(y_test, y_pred_rule_based)

# Print evaluation metrics for the rule-based model
print("Rule-Based Model Performance:")
print(f"Accuracy: {accuracy_rule_based:.4f}")
print(f"Precision: {precision_rule_based:.4f}")
print(f"Recall: {recall_rule_based:.4f}")
print(f"F1 Score: {f1_score_rule_based:.4f}")

# Evaluate the neural network model (already trained)
y_pred_nn = (y_pred_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
precision_nn = precision_score(y_test, y_pred_nn)
recall_nn = recall_score(y_test, y_pred_nn)
f1_score_nn = f1_score(y_test, y_pred_nn)

# Print evaluation metrics for the neural network model
print("\nNeural Network Model Performance:")
print(f"Accuracy: {accuracy_nn:.4f}")
print(f"Precision: {precision_nn:.4f}")
print(f"Recall: {recall_nn:.4f}")
print(f"F1 Score: {f1_score_nn:.4f}")

Rule-Based Model Performance:
Accuracy: 0.9398
Precision: 0.8376
Recall: 0.9074
F1 Score: 0.8711

Neural Network Model Performance:
Accuracy: 0.9398
Precision: 0.8911
Recall: 0.8333
F1 Score: 0.8612


In [28]:
# Combine the arrays into a DataFrame
y_pred_prob = model.predict(X)
y_pred = (y_pred_prob > 0.5).astype(int)
len(y_pred_rule_based)
df = pd.DataFrame({
    'y': y,
    'y_pred': y_pred.flatten()
})

df.to_csv('combined_output_nn.csv', index=False)

[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [42]:
#Check Overfitting
y_pred_prob = model.predict(X_train)
y_pred = (y_pred_prob > 0.5).astype(int)
y_pred_nn = (y_pred_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_train, y_pred_nn)
precision_nn = precision_score(y_train, y_pred_nn)
recall_nn = recall_score(y_train, y_pred_nn)
f1_score_nn = f1_score(y_train, y_pred_nn)

# Print evaluation metrics for the neural network model
print("\nNeural Network Model Performance on Train Set:")
print(f"Accuracy: {accuracy_nn:.4f}")
print(f"Precision: {precision_nn:.4f}")
print(f"Recall: {recall_nn:.4f}")
print(f"F1 Score: {f1_score_nn:.4f}")

# Evaluate the neural network model (already trained)
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)
y_pred_nn = (y_pred_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
precision_nn = precision_score(y_test, y_pred_nn)
recall_nn = recall_score(y_test, y_pred_nn)
f1_score_nn = f1_score(y_test, y_pred_nn)

# Print evaluation metrics for the neural network model
print("\nNeural Network Model Performance on Test Set:")
print(f"Accuracy: {accuracy_nn:.4f}")
print(f"Precision: {precision_nn:.4f}")
print(f"Recall: {recall_nn:.4f}")
print(f"F1 Score: {f1_score_nn:.4f}")

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

Neural Network Model Performance on Train Set:
Accuracy: 0.9440
Precision: 0.9615
Recall: 0.7843
F1 Score: 0.8639
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Neural Network Model Performance on Test Set:
Accuracy: 0.9378
Precision: 0.8900
Recall: 0.8241
F1 Score: 0.8558


In [25]:
# A common way to reduce overfitting is to simplify the model by using fewer parameters. 
# Maybe considering less than 60 past days is a good strategy.
# How much should we go in the past? Let's find the maximum distance of ones.
indices_of_ones = [i for i, x in enumerate(y) if x == 1]

# Calculate the distances between consecutive ones
distances = [indices_of_ones[i] - indices_of_ones[i - 1] for i in range(1, len(indices_of_ones))]
    
# Find the maximum distance
max_distance = max(distances)

print("The maximum distance between ones is:", max_distance)

The maximum distance between ones is: 46


In [26]:
# This means that reducing the number of passt days to 50 we handle most of the situations
# Initialize the transformed dataset
df_refine_cycle = []

# Iterate over the dataset to create the transformed rows
for i in range(len(df_raw_cycle) - 50):
    # Extract the 50 consecutive days of labels
    labels_50_days = df_raw_cycle['cycle_flg'].iloc[i:i + 50].tolist()
    
    # Extract the ending month
    ending_month = df_raw_cycle['date'].iloc[i + 49].month
    
    # Extract the label of the 51st day
    label_day_51 = df_raw_cycle['cycle_flg'].iloc[i + 50]
    
    # Create a row with the 50 labels, ending month, and day-51 label
    row = labels_50_days + [ending_month, label_day_51]
    df_refine_cycle.append(row)

# Convert the transformed data into a DataFrame
column_names = [f'Day_{i+1}' for i in range(50)] + ['EndingMonth', 'CycleDay_51']
df_refine_cycle = pd.DataFrame(df_refine_cycle, columns=column_names)

print(df_refine_cycle.head())

df_refine_cycle.to_csv('cycle_data_refine.csv', index=False)

   Day_1  Day_2  Day_3  Day_4  Day_5  Day_6  Day_7  Day_8  Day_9  Day_10  ...  \
0      1      1      1      1      1      1      1      0      0       0  ...   
1      1      1      1      1      1      1      0      0      0       0  ...   
2      1      1      1      1      1      0      0      0      0       0  ...   
3      1      1      1      1      0      0      0      0      0       0  ...   
4      1      1      1      0      0      0      0      0      0       0  ...   

   Day_43  Day_44  Day_45  Day_46  Day_47  Day_48  Day_49  Day_50  \
0       1       1       1       1       1       1       1       0   
1       1       1       1       1       1       1       0       0   
2       1       1       1       1       1       0       0       0   
3       1       1       1       1       0       0       0       0   
4       1       1       1       0       0       0       0       0   

   EndingMonth  CycleDay_51  
0            8            0  
1            8            0  
2       

In [27]:
# We also simplify the hidden layers of the neural network
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load the transformed dataset
data = pd.read_csv('cycle_data_refine.csv')

# Split the data into input features (X) and target variable (y)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training, validation (cross-validation), and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)#, stratify=y)
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)#, stratify=y_temp)

# Standardize the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_cv = scaler.transform(X_cv)
X_test = scaler.transform(X_test)

# Define the neural network model
model = Sequential()
model.add(Dense(51, input_dim=51, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model using Adam optimizer
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_cv, y_cv))

# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Print the classification report
cr = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(cr)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5177 - loss: 0.6891 - val_accuracy: 0.8344 - val_loss: 0.3842
Epoch 2/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9123 - loss: 0.2720 - val_accuracy: 0.9027 - val_loss: 0.2689
Epoch 3/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9347 - loss: 0.2065 - val_accuracy: 0.9151 - val_loss: 0.2583
Epoch 4/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9408 - loss: 0.1750 - val_accuracy: 0.9193 - val_loss: 0.2500
Epoch 5/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9392 - loss: 0.1743 - val_accuracy: 0.9193 - val_loss: 0.2558
Epoch 6/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9441 - loss: 0.1545 - val_accuracy: 0.9193 - val_loss: 0.2520
Epoch 7/50
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━

In [28]:
#Check Overfitting
y_pred_prob = model.predict(X_train)
y_pred = (y_pred_prob > 0.5).astype(int)
y_pred_nn = (y_pred_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_train, y_pred_nn)
precision_nn = precision_score(y_train, y_pred_nn)
recall_nn = recall_score(y_train, y_pred_nn)
f1_score_nn = f1_score(y_train, y_pred_nn)

# Print evaluation metrics for the neural network model
print("\nNeural Network Model Performance on Train Set:")
print(f"Accuracy: {accuracy_nn:.4f}")
print(f"Precision: {precision_nn:.4f}")
print(f"Recall: {recall_nn:.4f}")
print(f"F1 Score: {f1_score_nn:.4f}")

# Evaluate the neural network model (already trained)
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)
y_pred_nn = (y_pred_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
precision_nn = precision_score(y_test, y_pred_nn)
recall_nn = recall_score(y_test, y_pred_nn)
f1_score_nn = f1_score(y_test, y_pred_nn)

# Print evaluation metrics for the neural network model
print("\nNeural Network Model Performance on Test Set:")
print(f"Accuracy: {accuracy_nn:.4f}")
print(f"Precision: {precision_nn:.4f}")
print(f"Recall: {recall_nn:.4f}")
print(f"F1 Score: {f1_score_nn:.4f}")

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Neural Network Model Performance on Train Set:
Accuracy: 0.9840
Precision: 0.9958
Recall: 0.9333
F1 Score: 0.9636
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Neural Network Model Performance on Test Set:
Accuracy: 0.9483
Precision: 0.9158
Recall: 0.8365
F1 Score: 0.8744


In [None]:
# We reduced overfitting and obtained better performances on test set. 
# Even though I tried to re.train the 60-network with early stop when val set loss function increases and overfitting reduces too

In [29]:
# Di seguito mi assicuro che il rule based model venga battuto anche nel caso in cui sia personalizzato

# Variables to store lengths of consecutive ones and distances between series of ones
consecutive_lengths = []
distances_between_series = []

# Initialize counters
current_length = 0
previous_end_index = None

# Iterate through the array
for i, value in enumerate(y):
    if value == 1:
        current_length += 1
        if previous_end_index is not None and current_length == 1:
            distances_between_series.append(i - previous_end_index - 1)
    else:
        if current_length > 0:
            consecutive_lengths.append(current_length)
            previous_end_index = i - 1
            current_length = 0

# Capture the last series if it ends at the end of the array
if current_length > 0:
    consecutive_lengths.append(current_length)

# Calculate averages
average_length_of_ones = sum(consecutive_lengths) / len(consecutive_lengths) if consecutive_lengths else 0
average_distance_between_series = sum(distances_between_series) / len(distances_between_series) if distances_between_series else 0

print("Average length of consecutive ones:", average_length_of_ones)
print("Average distance between series of ones:", average_distance_between_series)


Average length of consecutive ones: 6.159663865546219
Average distance between series of ones: 20.822033898305083


In [7]:
data = pd.read_csv('cycle_data_refine.csv')
y = data.iloc[:, -1].values
# Create an empty array to store the predicted values
y_pred_rule_based = np.zeros_like(y)
i = 0
# Apply the rule to each row in the original dataset
while i < len(y):
    if y[i-1]==1 and y[i-2]==0:
        y_pred_rule_based[i] = 1
        if y[i]==1:
            y_pred_rule_based[i+1] = 1
        else:
            i+=2
        if y[i+1]==1:
            y_pred_rule_based[i+2] = 1
        else:
            i+=3
        if y[i+2]==1:
            y_pred_rule_based[i+3] = 1
        else:
            i+=4
        if y[i+3]==1:
            y_pred_rule_based[i+4] = 1
        else:
            i+=5
        if y[i+4]==1:
            y_pred_rule_based[i+5] = 1
        else:
            i+=6
 

    i+=1 
# Combine the arrays into a DataFrame
df = pd.DataFrame({
    'y': y,
    'y_pred_rule_based': y_pred_rule_based
})

df.to_csv('combined_output.csv', index=False)

In [31]:
y_pred_rule_based_train, y_pred_rule_based_temp = train_test_split(y_pred_rule_based, test_size=0.3, random_state=42)
y_pred_rule_based_cv, y_pred_rule_based_test = train_test_split(y_pred_rule_based_temp, test_size=0.5, random_state=42)

In [32]:
y_pred_rule_based = y_pred_rule_based_test

# Evaluate the rule-based model
accuracy_rule_based = accuracy_score(y_test, y_pred_rule_based)
precision_rule_based = precision_score(y_test, y_pred_rule_based)
recall_rule_based = recall_score(y_test, y_pred_rule_based)
f1_score_rule_based = f1_score(y_test, y_pred_rule_based)

# Print evaluation metrics for the rule-based model
print("Rule-Based Model Performance:")
print(f"Accuracy: {accuracy_rule_based:.4f}")
print(f"Precision: {precision_rule_based:.4f}")
print(f"Recall: {recall_rule_based:.4f}")
print(f"F1 Score: {f1_score_rule_based:.4f}")

Rule-Based Model Performance:
Accuracy: 0.8843
Precision: 0.6875
Recall: 0.8462
F1 Score: 0.7586


In [None]:
#prova a inserire tutto in una funzione e prova a impostare una lista di hyperparameters: grandezza della rete e numero di feature. 
#Addestra tutti i modelli e trova gli hyperparameter per cui la distanza di score fra train e test è minima