In [19]:
# Create a dataset  of the format
{
  "current_article_text": "Full text of Article A...",
  "possible_next_articles": ["Article B", "Article C", "Article D"],
  "target_article": "Article D",
  "final_target_article": "Article Z"
}


{'current_article_text': 'Full text of Article A...',
 'possible_next_articles': ['Article B', 'Article C', 'Article D'],
 'target_article': 'Article D',
 'final_target_article': 'Article Z'}

In [20]:
import pandas as pd

In [21]:
article_data = pd.read_feather('Data/dataframes/article_dataframe.feather')

In [47]:
article_data['linkTarget']

# Ge max number of links
med_links = article_data['linkTarget'].apply(len).describe()

In [48]:
med_links

count    4604.000000
mean       26.038662
std        24.201491
min         0.000000
25%        11.000000
50%        19.000000
75%        33.000000
max       294.000000
Name: linkTarget, dtype: float64

In [23]:
# Load the paths
paths_df = pd.read_feather('Data/dataframes/paths.feather')
# only include successful paths
paths_df = paths_df[paths_df['finished']]

In [24]:
paths_df.columns

Index(['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating',
       'finished', 'failure_reason', 'start_article', 'target_article'],
      dtype='object')

In [25]:
import pandas as pd
from multiprocessing import Pool, cpu_count

# Define the function again if necessary
def replace_back_steps(path_str):
    articles = path_str.split(';')
    stack = []
    processed = []
    
    for article in articles:
        if article == '<':
            if stack:
                stack.pop()
                if stack:
                    last_article = stack[-1]
                    processed.append(last_article)
            # Else, skip appending anything
        else:
            stack.append(article)
            processed.append(article)
    
    return ';'.join(processed)

# Function to apply in parallel
def parallel_process(paths):
    with Pool(cpu_count()) as pool:
        processed_paths = pool.map(replace_back_steps, paths)
    return processed_paths

# Apply parallel processing
paths_df['processed_path'] = parallel_process(paths_df['path'].tolist())

# Verify the result
print(paths_df[['path', 'processed_path']])


                                                    path  \
0      14th_century;15th_century;16th_century;Pacific...   
1      14th_century;Europe;Africa;Atlantic_slave_trad...   
2      14th_century;Niger;Nigeria;British_Empire;Slav...   
3         14th_century;Renaissance;Ancient_Greece;Greece   
4      14th_century;Italy;Roman_Catholic_Church;HIV;R...   
...                                                  ...   
51313                   Yagan;Ancient_Egypt;Civilization   
51314  Yagan;Folklore;Brothers_Grimm;<;19th_century;C...   
51315  Yagan;Australia;England;France;United_States;T...   
51316  Yarralumla,_Australian_Capital_Territory;Austr...   
51317                            Ziad_Jarrah;Germany;Jew   

                                          processed_path  
0      14th_century;15th_century;16th_century;Pacific...  
1      14th_century;Europe;Africa;Atlantic_slave_trad...  
2      14th_century;Niger;Nigeria;British_Empire;Slav...  
3         14th_century;Renaissance;Ancient_

In [26]:

# Assuming `paths_df` and `article_data` are your existing DataFrames

# Step 1: Precompute Lookup Dictionaries
# Ensure 'linkTarget' is a list. If it's a string separated by a delimiter (e.g., ';'), split it accordingly.
# article_data['linkTarget'] = article_data['linkTarget'].apply(lambda x: x.split(';') if isinstance(x, str) else [])
article_text_dict = article_data.set_index('article')['plain_text'].to_dict()
article_links_dict = article_data.set_index('article')['linkTarget'].to_dict()

# Step 2: Collect Data in Lists
dataset_list = []

for idx, row in paths_df.iterrows():
    path_str = row['processed_path']
    final_target_article = row['target_article']
    
    # Split the path into individual articles
    current_path = path_str.split(';')
    path_length = len(current_path)
    
    # Iterate over each article in the current path
    for i, article_name in enumerate(current_path):
        # Retrieve the current article's text
        current_article_text = article_text_dict.get(article_name, "")
        
        # Retrieve the list of possible next articles
        possible_next_articles = article_links_dict.get(article_name, [])
        
        # Determine the target article
        if i + 1 < path_length:
            target_article = current_path[i + 1]
        else:
            target_article = final_target_article
        

        # Append the data point to the list
        dataset_list.append({
            'current_article_text': current_article_text,
            'possible_next_articles': possible_next_articles,
            'target_article': target_article,
            'final_target_article': final_target_article
        })

# Step 3: Bulk DataFrame Creation
dataset = pd.DataFrame(dataset_list)

# Optional: Free up memory by deleting the list
del dataset_list

# Optional: Reset index if needed
dataset.reset_index(drop=True, inplace=True)


In [27]:
dataset

Unnamed: 0,current_article_text,possible_next_articles,target_article,final_target_article
0,#copyright\n\n14th century\n\n2007 Schools ...,"[13th_century, 15th_century, Abacus, Aztec, Bl...",15th_century,African_slave_trade
1,#copyright\n\n15th century\n\n2007 Schools ...,"[10th_century, 11th_century, 12th_century, 13t...",16th_century,African_slave_trade
2,#copyright\n\n16th century\n\n2007 Schools ...,"[10th_century, 11th_century, 12th_century, 13t...",Pacific_Ocean,African_slave_trade
3,#copyright\n\nPacific Ocean\n\n2007 Schools...,"[16th_century, 17th_century, 18th_century, 19t...",Atlantic_Ocean,African_slave_trade
4,#copyright\n\nAtlantic Ocean\n\n2007 School...,"[Aberdeen, Abidjan, Accra, Africa, Airship, Al...",Accra,African_slave_trade
...,...,...,...,...
346773,#copyright\n\nUnited States\n\n2007 Schools...,"[Abraham_Lincoln, Advertising, Agriculture, Am...",Abraham_Lincoln,Abraham_Lincoln
346774,#copyright\n\nAbraham Lincoln\n\n2007 Schoo...,"[Aircraft_carrier, American_Civil_War, Andrew_...",Abraham_Lincoln,Abraham_Lincoln
346775,#copyright\n\nZiad Jarrah\n\n2007 Schools W...,"[Afghanistan, Aircraft, Arabic_language, Atlan...",Germany,Jew
346776,#copyright\n\nGermany\n\n2007 Schools Wikip...,"[Adolf_Hitler, Afghanistan, Agnosticism, Alban...",Jew,Jew


In [28]:
# Assign unique IDs to articles

all_articles = article_data['article'].tolist()

article_to_id = {article: idx for idx, article in enumerate(sorted(all_articles))}
id_to_article = {idx: article for article, idx in article_to_id.items()}

In [29]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text, max_length=512):
    return tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf'
    )


In [30]:
from tqdm import tqdm

# Initialize tqdm with pandas
tqdm.pandas()

In [31]:
from transformers import BertTokenizerFast

# Initialize the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


In [32]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tqdm import tqdm

def batch_preprocess_and_save_dataset(
    dataset, 
    article_to_id, 
    tokenizer, 
    output_dir, 
    max_length=512, 
    max_possible_next=10, 
    batch_size=1000
):
    """
    Preprocess the dataset in batches and save to disk incrementally.

    Parameters:
    - dataset (pd.DataFrame): DataFrame containing the data points.
    - article_to_id (dict): Dictionary mapping article names to IDs.
    - tokenizer (transformers.PreTrainedTokenizerFast): Fast tokenizer instance.
    - output_dir (str): Directory to save processed batches.
    - max_length (int): Maximum length for tokenization.
    - max_possible_next (int): Maximum number of possible next articles.
    - batch_size (int): Number of samples to process in each batch.
    """
    num_samples = len(dataset)
    batch_num = 0

    # Iterate over the dataset in batches with a progress bar
    for start in tqdm(range(0, num_samples, batch_size), desc='Processing Batches'):
        end = min(start + batch_size, num_samples)
        batch = dataset.iloc[start:end]

        # 1. Tokenize 'current_article_text'
        texts = batch['current_article_text'].tolist()
        encoded = tokenizer(
            texts,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='np'  # Return as NumPy arrays
        )
        input_ids_batch = encoded['input_ids']
        attention_mask_batch = encoded['attention_mask']

        # 2. Map 'possible_next_articles' to IDs with padding/truncation
        possible_next_articles = batch['possible_next_articles'].tolist()
        possible_next_ids_batch = np.array([
            [article_to_id.get(article, 0) for article in articles[:max_possible_next]] + [0]*(max_possible_next - len(articles)) 
            if len(articles) < max_possible_next else [article_to_id.get(article, 0) for article in articles[:max_possible_next]]
            for articles in possible_next_articles
        ], dtype=np.int32)

        # 3. Map 'target_article' and 'final_target_article' to IDs
        target_ids_batch = batch['target_article'].map(lambda x: article_to_id.get(x, 0)).values
        final_target_ids_batch = batch['final_target_article'].map(lambda x: article_to_id.get(x, 0)).values

        # Combine into a dictionary
        processed_batch = {
            'input_ids': input_ids_batch,
            'attention_mask': attention_mask_batch,
            'possible_next_ids': possible_next_ids_batch,
            'target_id': np.array(target_ids_batch, dtype=np.int32),
            'final_target_id': np.array(final_target_ids_batch, dtype=np.int32),
        }

        # Save batch to disk using pickle
        batch_path = f"{output_dir}/batch_{batch_num}.pkl"
        with open(batch_path, 'wb') as f:
            pickle.dump(processed_batch, f)

        batch_num += 1

    print(f"All batches saved to {output_dir}")


In [33]:
batch_preprocess_and_save_dataset(
    dataset=dataset,
    article_to_id=article_to_id,
    tokenizer=tokenizer,
    output_dir='./processed_batches',
    max_length=1024,
    max_possible_next=32,
    batch_size=1000
)


Processing Batches: 100%|██████████| 347/347 [18:40<00:00,  3.23s/it]


All batches saved to ./processed_batches


In [86]:
import tensorflow as tf
import pickle
import os
import random
import numpy as np

def load_sampled_batches_to_tf_dataset(batch_dir, sample_size=5):
    """
    Load a random sample of batches from disk and create a TensorFlow dataset.

    Parameters:
    - batch_dir (str): Directory containing the saved batches.
    - sample_size (int): Number of batch files to sample.

    Returns:
    - tf.data.Dataset: TensorFlow dataset containing sampled data.
    """
    # List all batch files in the directory
    all_files = [f for f in sorted(os.listdir(batch_dir)) if f.endswith('.pkl')]

    # Sample a subset of the files
    sampled_files = random.sample(all_files, min(sample_size, len(all_files)))

    # Placeholder lists for TensorFlow dataset creation
    input_ids_list = []
    attention_mask_list = []
    possible_next_ids_list = []
    final_target_ids_list = []
    target_ids_list = []

    # Load sampled batches
    for filename in sampled_files:
        with open(os.path.join(batch_dir, filename), 'rb') as f:
            batch = pickle.load(f)
            
            # Append to lists
            input_ids_list.append(batch['input_ids'])
            attention_mask_list.append(batch['attention_mask'])
            possible_next_ids_list.append(batch['possible_next_ids'])
            final_target_ids_list.append(batch['final_target_id'])
            target_ids_list.append(batch['target_id'])

    # Concatenate sampled batches into NumPy arrays
    input_ids = np.concatenate(input_ids_list, axis=0)
    attention_mask = np.concatenate(attention_mask_list, axis=0)
    possible_next_ids = np.concatenate(possible_next_ids_list, axis=0)
    final_target_ids = np.concatenate(final_target_ids_list, axis=0)
    target_ids = np.concatenate(target_ids_list, axis=0)

    # Reshape final_target_ids to (batch_size, 1)
    final_target_ids = final_target_ids.reshape(-1, 1)
    target_ids = target_ids.reshape(-1, 1)

    # Create a TensorFlow dataset
    tf_dataset = tf.data.Dataset.from_tensor_slices(({
        'input_ids': tf.constant(input_ids, dtype=tf.int32),
        'attention_mask': tf.constant(attention_mask, dtype=tf.int32),
        'possible_next_ids': tf.constant(possible_next_ids, dtype=tf.int32),
        'final_target_id': tf.constant(final_target_ids, dtype=tf.int32)
    }, tf.constant(target_ids, dtype=tf.int32)))

    return tf_dataset

# Load a sampled dataset and apply transformations
batch_dir = './processed_batches'
sample_size = 20  # Adjust based on your memory capacity

tf_dataset = load_sampled_batches_to_tf_dataset(batch_dir, sample_size=sample_size)

# Shuffle, batch, and prefetch
BATCH_SIZE = 32
BUFFER_SIZE = 10000

tf_dataset = tf_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


In [87]:
tf_dataset.element_spec

({'input_ids': TensorSpec(shape=(None, 1024), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(None, 1024), dtype=tf.int32, name=None),
  'possible_next_ids': TensorSpec(shape=(None, 32), dtype=tf.int32, name=None),
  'final_target_id': TensorSpec(shape=(None, 1), dtype=tf.int32, name=None)},
 TensorSpec(shape=(None, 1), dtype=tf.int32, name=None))

In [80]:
for batch in tf_dataset.take(1):
    inputs, targets = batch
    print({k: v.shape for k, v in inputs.items()})
    print(f"Targets shape: {targets.shape}")


{'input_ids': TensorShape([32, 1024]), 'attention_mask': TensorShape([32, 1024]), 'possible_next_ids': TensorShape([32, 32]), 'final_target_id': TensorShape([32, 1])}
Targets shape: (32,)


In [52]:
for batch in tf_dataset.take(1):

    print(batch[0])
    print(batch[1])    

{'input_ids': <tf.Tensor: shape=(32, 1024), dtype=int32, numpy=
array([[  101,  1001,  9385, ...,  6719,  2965,   102],
       [  101,  1001,  9385, ...,  1999,  1996,   102],
       [  101,  1001,  9385, ...,  1012, 14627,   102],
       ...,
       [  101,  1001,  9385, ...,  2576,  3574,   102],
       [  101,  1001,  9385, ...,  2043,  2109,   102],
       [  101,  1001,  9385, ..., 15699, 12849,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(32, 1024), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>, 'possible_next_ids': <tf.Tensor: shape=(32, 32), dtype=int32, numpy=
array([[ 151,  191,  385, ..., 3882, 3994, 4206],
       [  26,  122,  234, ..., 2312, 2323, 2346],
       [ 330,  389,  431, ...,    0,    0,    0],
       ...,
       [  87,   96,  214, ..., 1688, 1745, 1787],
   

2024-12-10 21:02:46.588930: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [60]:
# Describe the dataset
tf_dataset.element_spec

({'input_ids': TensorSpec(shape=(None, 1024), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(None, 1024), dtype=tf.int32, name=None),
  'possible_next_ids': TensorSpec(shape=(None, 32), dtype=tf.int32, name=None),
  'final_target_id': TensorSpec(shape=(None,), dtype=tf.int32, name=None)},
 TensorSpec(shape=(None,), dtype=tf.int32, name=None))

In [70]:
import tensorflow as tf

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"),
             tf.keras.layers.Dense(embed_dim)]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [None]:
def build_custom_transformer_model(vocab_size, embedding_dim, num_possible_next):
    # Inputs
    input_ids = tf.keras.Input(shape=(1024,), dtype=tf.int32, name='input_ids')
    possible_next_ids = tf.keras.Input(shape=(32,), dtype=tf.int32, name='possible_next_ids')
    final_target_id = tf.keras.Input(shape=(1,), dtype=tf.int32, name='final_target_id')

    # Embedding for inputs
    embedding_layer = tf.keras.layers.Embedding(vocab_size + 1, embedding_dim, mask_zero=True)
    x = embedding_layer(input_ids)

    # Transformer Encoder
    transformer_block = TransformerBlock(embed_dim=embedding_dim, num_heads=4, ff_dim=128)
    x = transformer_block(x, training=True)  # Pass `training=True`

    # CLS Token
    cls_token = tf.keras.layers.GlobalAveragePooling1D()(x)

    # Embedding for possible next articles
    possible_next_embedding = embedding_layer(possible_next_ids)
    possible_next_pooled = tf.keras.layers.GlobalAveragePooling1D()(possible_next_embedding)

    # Embedding for final target article
    final_target_embedding = embedding_layer(final_target_id)
    final_target_pooled = tf.keras.layers.Flatten()(final_target_embedding)

    concatenated = tf.keras.layers.Concatenate()([cls_token, possible_next_pooled, final_target_pooled])
    print(f"Concatenated shape: {concatenated.shape}")

    # Feed concatenated tensor into dense layers
    dense = tf.keras.layers.Dense(256, activation='relu')(concatenated)
    dense = tf.keras.layers.Dropout(0.3)(dense)
    dense = tf.keras.layers.Dense(128, activation='relu')(dense)
    dense = tf.keras.layers.Dropout(0.3)(dense)

    # Output layer
    output = tf.keras.layers.Dense(num_possible_next, activation='softmax')(dense)

    # Define the model
    model = tf.keras.Model(inputs=[input_ids, possible_next_ids, final_target_id], outputs=output)
    print(f"cls_token shape: {cls_token.shape}")
    print(f"possible_next_pooled shape: {possible_next_pooled.shape}")
    print(f"final_target_pooled shape: {final_target_pooled.shape}")

    return model

# Build and summarize the model
model = build_custom_transformer_model(vocab_size=len(article_to_id), embedding_dim=128, num_possible_next=32)
model.summary()


cls_token shape: (None, 128)
possible_next_pooled shape: (None, 128)
final_target_pooled shape: (None, 128)




In [75]:
# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',  # Assumes target_ids are sparse integers
    metrics=['accuracy']
)


In [77]:
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath='model_checkpoint.keras',
        save_best_only=True,
        monitor='val_loss',
        verbose=1
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=1,
        restore_best_weights=True
    ),
    tf.keras.callbacks.TensorBoard(
        log_dir='./logs',
        update_freq='batch'
    )
]


In [85]:
for batch in tf_dataset.take(1):
    inputs, targets = batch
    for key, value in inputs.items():
        print(f"{key}: {value.shape}")
    print(f"Targets: {targets.shape}")


input_ids: (32, 1024)
attention_mask: (32, 1024)
possible_next_ids: (32, 32)
final_target_id: (32, 1)
Targets: (32,)


In [88]:
# Split the dataset
dataset_size = sum(1 for _ in tf_dataset)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size

train_dataset = tf_dataset.take(train_size)
val_dataset = tf_dataset.skip(train_size)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,  # Adjust based on your requirements
    callbacks=callbacks
)



Epoch 1/20




ValueError: Exception encountered when calling Functional.call().

[1mInput 0 of layer "dense_4" is incompatible with the layer: expected axis -1 of input shape to have value 384, but received input with shape (None, 131328)[0m

Arguments received by Functional.call():
  • inputs={'input_ids': 'tf.Tensor(shape=(None, 1024), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, 1024), dtype=int32)', 'possible_next_ids': 'tf.Tensor(shape=(None, 32), dtype=int32)', 'final_target_id': 'tf.Tensor(shape=(None, 1), dtype=int32)'}
  • training=True
  • mask={'input_ids': 'None', 'attention_mask': 'None', 'possible_next_ids': 'None', 'final_target_id': 'None'}