# Dependencies

In [1]:
# --- Dependencies ---
import json
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Split

### Load

In [2]:
# Read the review-history CSV (with time features, per the file name) into a DataFrame.
df = pd.read_csv("review_history_with_time_features.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,card_id,was_remembered,answer_score,review_type,time_since_first_review,time_since_prev_review_minmax,time_since_prev_review_standard,time_to_answer
0,1538605201072,1,2,Learn,-0.590727,0.000000,-0.554948,0.903710
1,1538605201072,1,3,Learn,2.573911,0.142023,6.511450,-0.222998
2,1538605201072,1,3,Learn,2.575053,0.000051,-0.552399,-0.432999
3,1538605201072,1,3,Review,3.010807,0.019556,0.418059,-0.357999
4,1538605201077,1,3,Learn,-0.590727,0.000000,-0.554948,-0.108180
...,...,...,...,...,...,...,...,...
70253,1682644975046,1,3,Learn,-0.589338,0.000012,-0.554336,-0.493681
70254,1682644975046,1,3,Review,-0.432017,0.007060,-0.203664,-0.459045
70255,1682645064444,1,3,Learn,-0.590727,0.000000,-0.554948,-0.775818
70256,1682645064444,1,3,Learn,-0.589623,0.000050,-0.552482,-0.620363


In [3]:
def load_embeddings(filename):
  """Read a JSON file and return its parsed contents.

  Args:
    filename: Path to the JSON file holding the embeddings.

  Returns:
    Whatever `json.load` produces for the file. Other cells index the result
    with values from the 'front' / 'back' columns (e.g. `embds[front_value]`),
    so a top-level JSON object is expected.
  """
  # JSON text is UTF-8 by specification; do not depend on the platform's
  # default encoding when reading it.
  with open(filename, 'r', encoding='utf-8') as f:
    return json.load(f)

# Embeddings are optional. To enable them, replace the assignment below with:
#   embds = load_embeddings("embeddings.json")
# If the embeddings are zipped, first create a cell and run: !unzip "embeddings.json.zip"
# While `embds` is None, the cells below skip their embedding-specific branches.
embds = None

### Split

Rather than splitting by review, we split by card.

(i.e. even though `test_size=0.1`, the test set may contain more or fewer than 10% of the reviews, because cards differ in how many reviews they have)

**With the default 80/10/10 split + seeds**

Total: 70,258 reviews

Train: 56,417 (80%)

Validation: 6,939 (9.9%)

Test: 6,902 (9.8%)

In [4]:
def split(df, *, val_size = 0.10, test_size = 0.10, val_seed = 12, test_seed = 23):
  """Split a review DataFrame into train / validation / test frames by card.

  The split is made over the unique values of `card_id`, so all reviews of a
  card land in the same frame. The sizes therefore apply to the number of
  cards, not to the number of review rows.

  Args:
    df: DataFrame with a `card_id` column.
    val_size: Fraction of all cards assigned to the validation frame.
    test_size: Fraction of all cards assigned to the test frame.
    val_seed: `random_state` for the train/validation split.
    test_seed: `random_state` for the train/test split.

  Returns:
    Tuple `(df_train, df_val, df_test)` of row subsets of `df`.
  """
  unique_card_ids = df['card_id'].unique()

  # Carve the validation cards out of the full set of cards.
  train_ids, val_ids = train_test_split(unique_card_ids, test_size=val_size, random_state=val_seed)

  # The test cards are drawn from the remaining (1 - val_size) share, so the
  # ratio is rescaled to keep `test_size` relative to the full set of cards.
  test_ratio = test_size / (1 - val_size)
  train_ids, test_ids = train_test_split(train_ids, test_size=test_ratio, random_state=test_seed)

  # IDs -> dataframes
  df_train = df[df['card_id'].isin(train_ids)]
  df_val = df[df['card_id'].isin(val_ids)]
  df_test = df[df['card_id'].isin(test_ids)]

  return df_train, df_val, df_test

In [5]:
df_train, df_val, df_test = split(df)

total = len(df)
print(f"Total: {total}")
# `:.1%` scales the fraction by 100 and appends the percent sign itself, so the
# printed number is a real percentage (previously the raw fraction was followed by "%").
print(f"Train: {len(df_train)} ({len(df_train) / total:.1%})")
print(f"Validation: {len(df_val)} ({len(df_val) / total:.1%})")
print(f"Test: {len(df_test)} ({len(df_test) / total:.1%})")

Total: 70258
Train: 56417 (0.8029975234137038%)
Validation: 6939 (0.09876455350280396%)
Test: 6902 (0.09823792308349227%)


# Sequences

For a flashcard with reviews A, B, C, D, E we want to create sequences:

**Input**: A, B         **Target**: C

**Input**: A, B, C      **Target**: D

**Input**: A, B, C, D   **Target**: E

**Distributions**

Total: 33,200 sequences

Train: 26,577 (80%) -- Negative: 3535 | Positive: 23042 | % Pos: 0.8669902547315348


Validation: 3,328 (10%) -- Negative: 422 | Positive: 2906 | % Pos: 0.8731971153846154


Test: 3,295 (9.9%) -- Negative: 406 | Positive: 2889 | % Pos: 0.876783004552352

In [6]:
# Number of input features
non_features = ['card_id', 'time_since_prev_review_minmax', 'review_type']

num_review_features = len(df.drop(columns=non_features).columns)
emb_size = len(embds[df['front'].iloc[0]]) if embds is not None else 0
non_features = non_features if emb_size == 0 else non_features + ['front', 'back']

INPUT_SIZE = num_review_features + 2 * emb_size
print("# of Features: ", INPUT_SIZE)

# of Features:  5


In [7]:
def fmt_sequences(df):
  """Build (history, target) pairs from a review DataFrame.

  Rows are grouped by `card_id` and re-indexed from 0 within each card, keeping
  their order in `df`. Every row whose `review_type` is 'Review' becomes a
  target; its input is the feature rows of all reviews before it on that card.
  No filter on whether the preceding review was remembered is applied.

  Uses the notebook-level `non_features` (columns removed from the inputs) and
  `embds` (lookup used for the 'front' / 'back' values, or None).

  Returns:
    List of `(inputs, (time_since_prev_review_minmax, was_remembered))` tuples,
    where `inputs` is a 2-D array with one row per earlier review.
  """
  sequences = []

  for _, card_rows in df.groupby('card_id'):
    reviews = card_rows.copy().reset_index(drop=True)
    history = reviews.drop(columns=non_features).values

    if embds is not None:
      # The card's embeddings are looked up from its first row and the
      # concatenated (front, back) vector is appended to every review row.
      card_vector = np.concatenate([
        embds[reviews['front'].iloc[0]],
        embds[reviews['back'].iloc[0]],
      ])
      history = np.hstack((history, np.tile(card_vector, (len(reviews), 1))))

    # Positions (0-based within the card) of the rows usable as targets.
    is_review = reviews['review_type'] == 'Review'
    target_positions = reviews.index[is_review].tolist()

    for pos in target_positions:
      target_row = reviews.loc[pos]
      target = (target_row['time_since_prev_review_minmax'], target_row['was_remembered'])
      # Everything strictly before the target row is the model input.
      sequences.append((history[:pos], target))

  return sequences

In [8]:
# Build the (history, target) sequences for each split.
train_sequences = fmt_sequences(df_train)
val_sequences = fmt_sequences(df_val)
test_sequences = fmt_sequences(df_test)

num_train = len(train_sequences)
num_val = len(val_sequences)
num_test = len(test_sequences)

total = num_train + num_val + num_test
print(f"Total: {total}")
# `:.1%` scales the fraction by 100 and appends the percent sign itself, so the
# printed number is a real percentage (previously the raw fraction was followed by "%").
print(f"Train: {num_train} ({num_train / total:.1%})")
print(f"Validation: {num_val} ({num_val / total:.1%})")
print(f"Test: {num_test} ({num_test / total:.1%})")

Total: 33200
Train: 26577 (0.8005120481927711%)
Validation: 3328 (0.10024096385542168%)
Test: 3295 (0.09924698795180723%)


In [9]:
def print_pos_neg_ratios(seqs):
    """Print how many sequences have a negative (0) vs. positive (1) label.

    Args:
        seqs: Iterable of `(inputs, target)` pairs; `target[1]` is compared
            against 1 (positive) and 0 (negative). Items with any other label
            are not counted.

    The share of positives is printed as a fraction of the counted items. When
    nothing was counted it is printed as `nan` instead of raising
    ZeroDivisionError.
    """
    num_pos = 0
    num_neg = 0
    # Single pass, so one-shot iterables (e.g. generators) are counted correctly.
    for seq in seqs:
        label = seq[1][1]
        if label == 1:
            num_pos += 1
        elif label == 0:
            num_neg += 1

    counted = num_neg + num_pos
    pos_share = num_pos / counted if counted else float('nan')
    print(f"Negative: {num_neg} | Positive: {num_pos} | % Pos: {pos_share}")

# Report the label balance of each split; the leading "\n" puts a blank line
# between consecutive reports.
for heading, split_sequences in (
    ("Train", train_sequences),
    ("\nVal", val_sequences),
    ("\nTest", test_sequences),
):
    print(heading)
    print_pos_neg_ratios(split_sequences)

Train
Negative: 3535 | Positive: 23042 | % Pos: 0.8669902547315348

Val
Negative: 422 | Positive: 2906 | % Pos: 0.8731971153846154

Test
Negative: 406 | Positive: 2889 | % Pos: 0.876783004552352


In [10]:
def upsample_negative_examples(seqs):
    """Balance the labels by resampling the negative sequences.

    Sequences whose label (`seq[1][1]`) is 0 are drawn with replacement, using a
    fixed seed, until there are as many of them as sequences labelled 1.

    Returns:
        A new list: all positive sequences followed by the resampled negatives.
    """
    positives, negatives = [], []
    for seq in seqs:
        label = seq[1][1]
        if label == 1:
            positives.append(seq)
        elif label == 0:
            negatives.append(seq)

    # Draw with replacement so the negative count matches the positive count.
    resampled_negatives = resample(
        negatives,
        replace=True,
        n_samples=len(positives),
        random_state=123,
    )
    return positives + resampled_negatives

# Balance only the training sequences, then print the resulting label counts.
train_sequences_up = upsample_negative_examples(train_sequences)
print_pos_neg_ratios(train_sequences_up)

Negative: 23042 | Positive: 23042 | % Pos: 0.5


In [11]:
# Write each split to disk as a pickled list of (inputs, target) pairs.
# NOTE(review): the original `train_sequences` is what gets saved here;
# `train_sequences_up` is not written anywhere in this notebook — confirm that is intended.
for path, sequences in (
    ('train.pkl', train_sequences),
    ('validation.pkl', val_sequences),
    ('test.pkl', test_sequences),
):
    with open(path, 'wb') as f:
        pickle.dump(sequences, f)