In [7]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [1]:
import numpy as np
import os


# parameters
cycleLength = 24  # cycleLength is D -- we consider features in first D days
sexRisks = {
    "unprotected_sex": 0.5,
    "protected_sex": 0.2,
    "withdrawl_sex": 0.3,
}

emotionImpacts = {
    "emotion_happy": 1.2,
    "emotion_neutral": 1.0,
    "emotion_sad": 0.8,
}

# Convert dict_keys objects to lists and concatenate them
symptomNames = list(sexRisks.keys()) + list(emotionImpacts.keys())

# Create a dictionary to map symptom names to indices
symptomIndices = {name: idx for idx, name in enumerate(symptomNames)}


# TODO: use some better way to simulate this instead of fixed fertilities
def genFertilities(cycleLength):
    fertilities = np.array([
        1, 1, 1, 1, 1,
        1, 2, 3, 10, 12,
        13, 11, 10, 8, 10,
        9, 5, 3, 2, 1,
        1, 1, 1, 1, 1,
        2, 1, 1, 2, 1,
        1, 1, 0, 0, 0,
    ], dtype=float)
    maxFertility = 0.8
    assert cycleLength <= fertilities.shape[0]
    return fertilities[:cycleLength] / np.max(fertilities) * maxFertility


def flipWithProb(prob):
    return np.random.random() <= prob


# Basic assumptions for single cycle data generation
#   - fertility: agrees with fertility window. Some factors like age, emotion
#     can affect fertility on the cycle or on a specific day.
#   - sex: different sex types have different risks, and it should be
#     considered together with fertility.
# Return:
#   (features, label), where features = [(day, symptom)...]
def genSingleCycleData():
    features = []
    sexProb = 0.15  # prob for sex symptom
    emotionProb = 0.3  # prob for emotion symptom
    epsilon = 0.05  # random noise scale
    probs = []  # probs for each sex
    fertility = genFertilities(cycleLength)
    for d in range(cycleLength):
        fertilityFactor = 1.
        if flipWithProb(emotionProb):  # emotion
            emotionIdx = np.random.randint(0, 3)
            emotionName = list(emotionImpacts.keys())[emotionIdx]
            fertilityFactor *= emotionImpacts[emotionName]
            features.append([d, emotionName])
        if flipWithProb(sexProb):  # have sex
            sexIdx = np.random.randint(0, 3)
            sexName = symptomNames[sexIdx]
            f = fertility[d] * fertilityFactor
            probs.append(f * sexRisks[sexName])
            features.append([d, sexName])
    finalProb = 1 - np.prod(1 - np.array(probs)) + np.random.randn() * epsilon
    finalProb = max(0, finalProb)
    # print("# finalProb: {}".format(finalProb))
    label = 1 if finalProb > 0.5 else 0
    return features, label


def genCycleData(num=0):
    data = [genSingleCycleData() for i in range(num)]
    return data

def convertToNdarray(data):
    numSymptom = len(symptomNames)
    dataNpy = np.zeros((len(data), 1 + numSymptom * cycleLength))
    for i, entry in enumerate(data):
        dataNpy[i][0] = entry[1]
        for symptom in entry[0]:
            day = symptom[0]
            symptomName = symptom[1]
            symptomIndex = symptomIndices[symptomName]
            dataNpy[i][1 + numSymptom * day + symptomIndex] = 1
    return dataNpy

def splitAndSave(dataNpy, trainPercent, devPercent, testPercent, dataDir):
    n = dataNpy.shape[0]

    np.random.shuffle(dataNpy)

    trainCount = int(n * trainPercent)
    devCount = int(n * devPercent)

    dataSplits = {
        "train": dataNpy[:trainCount, :],
        "dev": dataNpy[trainCount:trainCount+devCount, :],
        "test": dataNpy[trainCount+devCount:, :]
    }

    for k, v in dataSplits.items():
        splitDir = os.path.join(dataDir, k)
        os.mkdir(splitDir)
        splitFile = os.path.join(splitDir, k+".npy")
        np.save(splitFile, v)

if __name__ == "__main__":
    data = genCycleData(1)
    print("sample = {}".format(data))
    print(len(data[0][0]))
    data = genCycleData(1000)


sample = [([[3, 'protected_sex'], [8, 'emotion_neutral'], [10, 'withdrawl_sex'], [12, 'withdrawl_sex'], [13, 'emotion_sad'], [14, 'emotion_neutral'], [14, 'protected_sex'], [17, 'emotion_sad'], [19, 'emotion_happy'], [21, 'emotion_sad'], [23, 'emotion_sad']], 0)]
11


In [2]:
all_features = set()
for sample, _ in data:
    features = [feature for _, feature in sample]
    all_features.update(features)

feature_to_index = {feature: idx+1 for idx, feature in enumerate(sorted(all_features))}

In [3]:
feature_to_index

{'emotion_happy': 1,
 'emotion_neutral': 2,
 'emotion_sad': 3,
 'protected_sex': 4,
 'unprotected_sex': 5,
 'withdrawl_sex': 6}

In [4]:
data[:5]

[([[0, 'withdrawl_sex'],
   [1, 'emotion_sad'],
   [2, 'emotion_sad'],
   [3, 'emotion_happy'],
   [6, 'emotion_neutral'],
   [8, 'unprotected_sex'],
   [14, 'withdrawl_sex'],
   [16, 'emotion_sad'],
   [19, 'emotion_happy'],
   [20, 'emotion_neutral']],
  0),
 ([[0, 'unprotected_sex'],
   [1, 'withdrawl_sex'],
   [3, 'unprotected_sex'],
   [6, 'emotion_happy'],
   [10, 'protected_sex'],
   [14, 'emotion_neutral'],
   [15, 'emotion_neutral'],
   [20, 'emotion_sad'],
   [21, 'withdrawl_sex'],
   [22, 'protected_sex'],
   [23, 'emotion_happy']],
  0),
 ([[1, 'emotion_neutral'],
   [2, 'emotion_happy'],
   [3, 'unprotected_sex'],
   [5, 'withdrawl_sex'],
   [11, 'emotion_sad'],
   [11, 'unprotected_sex'],
   [12, 'emotion_sad'],
   [13, 'emotion_happy'],
   [14, 'emotion_sad'],
   [16, 'emotion_happy'],
   [17, 'emotion_neutral'],
   [17, 'withdrawl_sex'],
   [20, 'unprotected_sex'],
   [22, 'emotion_sad']],
  0),
 ([[2, 'emotion_happy'],
   [4, 'emotion_neutral'],
   [7, 'emotion_sad'],


In [5]:
# Prepare sequences and labels
sequences = []
labels = []
for sample, label in data:
    sequence = [[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0],[0,0]]
    for day,feature in sample:
        if feature_to_index[feature] > 3:
            sequence[day][1] = feature_to_index[feature]
        else: 
            sequence[day][0] = feature_to_index[feature]
    sequences.append(sequence)
    labels.append(label)

# Pad sequences
max_length = 24
X = np.array(sequences)
y = np.array(labels)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Build RNN model
model = Sequential([
    SimpleRNN(50, input_shape=(X.shape[1], X.shape[2]), return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=2, validation_split=0.1)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Epoch 1/10
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8159 - loss: 0.4623 - val_accuracy: 0.9125 - val_loss: 0.2945
Epoch 2/10
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8362 - loss: 0.4352 - val_accuracy: 0.8625 - val_loss: 0.2849
Epoch 3/10
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8780 - loss: 0.3523 - val_accuracy: 0.9125 - val_loss: 0.2798
Epoch 4/10
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8576 - loss: 0.3935 - val_accuracy: 0.9125 - val_loss: 0.2386
Epoch 5/10
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8777 - loss: 0.3195 - val_accuracy: 0.8875 - val_loss: 0.2541
Epoch 6/10
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9011 - loss: 0.2476 - val_accuracy: 0.9000 - val_loss: 0.2263
Epoch 7/10
[1m360/360[0m 

In [16]:
values = [[1, 'protected_sex'],[2, 'protected_sex'],[3, 'withdrawl_sex'],[6, 'protected_sex'],[14,'withdrawl_sex'],[17,'withdrawl_sex'],[18,'emotion_sad']]
values = [[0,0],
        [0,4],
        [0,4],
        [0,6],
        [0,0],
        [0,0],
        [0,4],
        [0,0],
        [0,0],
        [0,0],
        [0,0],
        [0,0],
        [0,0],
        [0,0],
        [0,5],
        [0,0],
        [0,0],
        [0,6],
        [3,0],
        [0,0],
        [0,0],
        [0,0],
        [0,0],
        [0,0]]
values = np.array(values)
values = values.reshape(1, len(values), 2)
result = model.predict(values)
result

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


array([[0.03194568]], dtype=float32)