## Preprocessing Training Dataset

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Specify where you saved these documents.
# `link` is prepended as-is to every file name in the loading cell, so it must
# end with the path separator (e.g. 'data/train/'). The empty string means the
# files sit in the current working directory.
link = ''

In [None]:
# Load every csv file of the training dataset; each path is `link` + file name.
def _read_training_csv(file_name):
    """Read the training csv whose path is ``link`` followed by ``file_name``."""
    return pd.read_csv(link + file_name)

barclays = _read_training_csv('Barclays_Wealth_Management.csv')
hsbc = _read_training_csv('HSBC_Loans.csv')
santander = _read_training_csv('Santander_Conditions.csv')
natwest = _read_training_csv('NatWest_Reward_Terms.csv')
lloyds = _read_training_csv('Lloyds_Benefits.csv')
first_direct = _read_training_csv('First_Direct_Facts.csv')
santander_c = _read_training_csv('Santander_Cashback.csv')
natwest_p = _read_training_csv('NatWest_Privacy_Policy.csv')
barclays_m = _read_training_csv('Barclays_Mortgages.csv')

In [None]:
# change data type
def change_datatype(df):
  """Cast the 'tag' and 'content' columns of ``df`` to strings.

  The conversion is written back into ``df`` itself, and that same
  DataFrame is returned.
  """
  text_columns = ['tag', 'content']
  as_text = df[text_columns].astype('str')
  df[text_columns] = as_text
  return df

In [None]:
## define first dataframe creation

# header and body pairs
def extract_header_body_pairs(df):
    """Pair each header row with the text of the rows that follow it.

    Rows are read in order. A row whose 'tag' is 'header' opens a new section;
    the 'content' of every following row, up to the next header, is joined
    with single spaces to form that section's body. Rows that come before the
    first header are discarded, and a header with no following rows gets an
    empty-string body.

    Returns a new DataFrame with the columns 'header' and 'body'.
    """
    sections = []  # (header text, list of body fragments), in document order

    for _, record in df.iterrows():
        tag, text = record['tag'], record['content']
        if tag == 'header':
            sections.append((text, []))
        elif sections:
            # only collect text once a header has been seen
            sections[-1][1].append(text)

    return pd.DataFrame({
        'header': [title for title, _ in sections],
        'body': [' '.join(fragments) for _, fragments in sections],
    })

In [None]:
## second dataframe creation

# only headers
def extract_all_headers(df):
    """Return only the header rows of ``df`` as a new DataFrame.

    Keeps the rows whose 'tag' is 'header', removes the 'tag' column, renames
    'content' to 'header' and renumbers the rows from 0. Any other columns are
    carried over unchanged; ``df`` itself is not modified.
    """
    is_header = df['tag'] == 'header'
    return (
        df[is_header]
        .drop(columns='tag')
        .rename(columns={'content': 'header'})
        .reset_index(drop=True)
    )

In [None]:
# Run the preprocessing for every training document.
bank_dfs = {
    'barclays': barclays,
    'hsbc': hsbc,
    'santander': santander,
    'natwest': natwest,
    'lloyds': lloyds,
    'first_direct': first_direct,
    'santander_c': santander_c,
    'natwest_p': natwest_p,
    'barclays_m': barclays_m
}

for bank_name, bank_df in bank_dfs.items():

    # Cast 'tag' and 'content' of this document to strings. The call takes the
    # single DataFrame (not the whole dict) and is a plain function call -
    # exec() only accepts source code, not a DataFrame.
    bank_df = change_datatype(bank_df)

    # Dynamic variable names for the two derived frames,
    # e.g. 'barclays_headings' and 'barclays_header_body'.
    headings_var = f"{bank_name}_headings"
    header_body_var = f"{bank_name}_header_body"

    # Bind the results as notebook-level variables under those names;
    # later cells refer to them directly (barclays_header_body, ...).
    globals()[header_body_var] = extract_header_body_pairs(bank_df)
    globals()[headings_var] = extract_all_headers(bank_df)

In [None]:
# Preview the first rows of the Barclays header/body pairs.
barclays_header_body.head()

Unnamed: 0,header,body
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...
2,2. Keeping each other informed,In addition to the various ways you can contac...
3,3. Carrying out your instructions,If we receive an instruction that contains inc...
4,4. Making payments out of and into your account,


In [None]:
# Preview the first rows of the Barclays headings table.
barclays_headings.head()

Unnamed: 0,header,level
0,Barclays Wealth Management Additional Banking ...,0
1,1. How the Customer Agreement applies to non-p...,0
2,2. Keeping each other informed,0
3,3. Carrying out your instructions,0
4,4. Making payments out of and into your account,0


In [None]:
# add a column to identify the document by bank name
def add_column(df, bank_name):
  """Label every row of ``df`` with the given bank name.

  Writes ``bank_name`` into a 'bank' column of ``df`` (the DataFrame passed
  in is modified) and returns that same DataFrame.
  """
  df['bank'] = bank_name
  return df

notebook_vars = globals()  # the per-bank frames are looked up here by name

for bank_name, bank_df in bank_dfs.items():
    headings_var = f"{bank_name}_headings"
    header_body_var = f"{bank_name}_header_body"

    # Tag both derived frames of this bank and rebind them under the same names.
    for frame_name in (headings_var, header_body_var):
        notebook_vars[frame_name] = add_column(notebook_vars[frame_name], bank_name)

In [None]:
# combine datasets
# ignore_index=True gives each combined frame a fresh 0..n-1 index. Without it
# every per-bank frame keeps its own 0-based labels, so labels repeat in the
# result and label-based selection (.loc) matches rows from several banks.
header_body = pd.concat(
    [
        barclays_header_body,
        hsbc_header_body,
        lloyds_header_body,
        natwest_header_body,
        santander_header_body,
        first_direct_header_body,
        santander_c_header_body,
        natwest_p_header_body,
        barclays_m_header_body,
    ],
    ignore_index=True,
)
headings = pd.concat(
    [
        barclays_headings,
        hsbc_headings,
        lloyds_headings,
        natwest_headings,
        santander_headings,
        first_direct_headings,
        santander_c_headings,
        natwest_p_headings,
        barclays_m_headings,
    ],
    ignore_index=True,
)

In [None]:
# make corrections to symbols
# ‚Äì means -
# ‚Ä¢ means bullet point
# ‚Äú and ‚Äù means "
# ‚Äô and ‚Äò means '

def correct_text(df, column_name):
  """Replace mis-encoded punctuation in one text column of ``df``.

  The garbled sequences for dash, bullet point and opening/closing double and
  single quotes are replaced by plain ASCII characters ('-', '*', '"', "'").
  The column is rewritten in ``df`` itself and the same DataFrame is returned.

  Args:
    df: DataFrame holding the text column.
    column_name: Name of the string column to clean.

  Returns:
    ``df``, with ``column_name`` cleaned.
  """
  replacements = {
      '‚Äì': '-',
      '‚Ä¢': '*',  # replacement for bullet points
      '‚Äú': '"',  # opening double quote
      '‚Äù': '"',  # closing double quote (was never replaced before)
      '‚Äô': "'",
      '‚Äò': "'",
  }

  cleaned = df[column_name]
  for garbled, plain in replacements.items():
    # regex=False: the sequences are literal text, not patterns
    cleaned = cleaned.str.replace(garbled, plain, regex=False)

  df[column_name] = cleaned
  return df

In [None]:
# Fix the mis-encoded symbols in the 'body' column. correct_text rewrites the
# column inside header_body; the frame it returns is what this cell displays.
correct_text(header_body, 'body')

In [None]:
# create subset header-body pairs of 60 random rows
# NOTE(review): the bodies built by extract_header_body_pairs are joined
# strings ('' when a header has no body rows), so dropna() presumably removes
# nothing here - confirm whether empty bodies were meant to be excluded.
header_body_small = (
    header_body
    .dropna()
    .drop(columns='bank')
    # fixed seed so the same 60 rows are drawn on every Restart & Run All
    .sample(n=60, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Write the three training tables to csv files, without the row index.
for export_frame, export_name in (
    (header_body, 'header_body_train.csv'),
    (headings, 'headings_train.csv'),
    (header_body_small, 'header_body_small.csv'),
):
    export_frame.to_csv(export_name, index=False)

## Making Errors

After the shuffled datasets were generated here, they were exported as csv files, in which I made additional errors for other metrics at "random".

In [None]:
# randomise 12 header-content pairs
import random

random.seed(42)  # fixed seed so the same rows are picked on every run

header_body_small['is_shuffled'] = 0

num_rows_to_shuffle = 12
# random.sample returns the chosen row labels in random order
rows_to_shuffle = random.sample(range(len(header_body_small)), num_rows_to_shuffle)

# Rotate the bodies by one position along that random order, so every selected
# row receives the body of a different selected row. A plain random permutation
# can leave some rows in place, and those rows would be flagged as shuffled
# while still holding their own body.
original_bodies = header_body_small.loc[rows_to_shuffle, 'body'].to_numpy()
shuffled_values = np.roll(original_bodies, 1)
header_body_small.loc[rows_to_shuffle, 'body'] = shuffled_values
header_body_small.loc[rows_to_shuffle, 'is_shuffled'] = 1

In [None]:
# Save the sampled header-body pairs, including the 'is_shuffled' flag column,
# as a csv file (row index omitted).
header_body_small.to_csv('header_body_shuffled.csv', index=False)

In [None]:
# flag 10 random sub-heading rows - i ended up randomising more manually
def shuffle_subheadings(data, n_rows=10):
    """Flag a random selection of sub-heading rows in a copy of ``data``.

    Despite its name, this function does not reorder any rows: it only adds an
    'is_shuffled' column that is 1 for the selected rows and 0 everywhere else.

    Args:
        data: DataFrame with a 'level' column. Rows whose level is not 0 are
            the candidates for selection.
        n_rows: Maximum number of rows to flag (default 10). Fewer rows are
            flagged when there are fewer candidates.

    Returns:
        A copy of ``data`` with the additional 'is_shuffled' column; ``data``
        itself is not modified.
    """
    modified_data = data.copy()
    modified_data['is_shuffled'] = 0

    # Select by row position rather than by index label: if labels repeat
    # (e.g. frames concatenated without re-indexing), label-based .loc would
    # flag every row sharing a chosen label, including level-0 rows.
    candidate_positions = np.flatnonzero((data['level'] != 0).to_numpy()).tolist()
    chosen_positions = random.sample(
        candidate_positions, min(n_rows, len(candidate_positions))
    )

    flag_column = modified_data.columns.get_loc('is_shuffled')
    modified_data.iloc[chosen_positions, flag_column] = 1

    return modified_data

In [None]:
# Flag random sub-headings on a copy, so that `headings` itself stays unchanged.
headings_shuffled = shuffle_subheadings(headings.copy())

In [None]:
# Save the headings with their 'is_shuffled' flag as a csv file (row index omitted).
headings_shuffled.to_csv('headings_shuffled.csv', index=False)