In [1]:
import pandas as pd
import numpy as np
import sklearn
import random

RAND_SEED = 49
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)

## Importing Files

In [6]:
# import a sample of files
# Paths are relative to the notebook's working directory.
# The Barclays preview in the next cell shows the columns tag, content and level;
# the other files are presumably laid out the same way (TODO confirm).
barclays = pd.read_csv('Barclays_Wealth_Management.csv')
hsbc = pd.read_csv('HSBC_Loans.csv')
santander = pd.read_csv('Santander_Conditions.csv')
natwest = pd.read_csv('NatWest_Reward_Terms.csv')
lloyds = pd.read_csv('Lloyds_Benefits.csv')

In [7]:
barclays.head()

Unnamed: 0,tag,content,level
0,header,Barclays Wealth Management Additional Banking ...,0
1,para,These additional terms and conditions apply to...,0
2,header,1. How the Customer Agreement applies to non-p...,0
3,para,Section 1 of the Customer Agreement says that ...,0
4,header,2. Keeping each other informed,0


In [12]:
def change_datatype(df):
    """Cast the 'tag' and 'content' columns of ``df`` to strings, in place.

    Missing values (NaN/None) become empty strings. A plain ``astype('str')``
    would turn them into the literal text 'nan'/'None', which would then be
    treated as real heading or paragraph text further down the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'tag' and 'content'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with both columns converted.
    """
    text_columns = ['tag', 'content']
    # Remember where real values were before the cast hides the missing ones.
    has_value = df[text_columns].notna()
    df[text_columns] = df[text_columns].astype('str').where(has_value, '')
    return df

In [None]:
# change_datatype converts the columns on the frame it is given and returns that
# same frame, so the return values do not need to be stored.
change_datatype(barclays)
change_datatype(hsbc)
change_datatype(santander)
change_datatype(natwest)
change_datatype(lloyds)

In [14]:
## define first dataframe creation

# header and body pairs
def extract_header_body_pairs(df):
    """Pair every header row with the text of the rows that follow it.

    Rows are read in order. A row whose 'tag' is 'header' opens a new section;
    the 'content' of each following non-header row is collected until the next
    header and joined with single spaces. Rows that appear before the first
    header belong to no section and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'tag' and 'content'.

    Returns
    -------
    pandas.DataFrame
        One row per header with the columns 'header' and 'body'. The body is an
        empty string when a header is directly followed by another header.
    """
    sections = []  # one (header text, list of body parts) entry per header

    for tag, content in zip(df['tag'], df['content']):
        if tag == 'header':
            sections.append((content, []))
        elif sections:
            # Only collect body text once a header has been seen.
            sections[-1][1].append(content)

    return pd.DataFrame({
        'header': [title for title, _ in sections],
        'body': [' '.join(parts) for _, parts in sections],
    })

In [15]:
# headings only df
def extract_all_headers(df):
    """Return only the header rows of ``df`` as a new frame.

    Keeps the rows whose 'tag' equals 'header', removes the 'tag' column,
    renames 'content' to 'header' and renumbers the index from 0. All other
    columns are carried over unchanged and the input frame is not modified.
    """
    is_header = df['tag'] == 'header'
    return (
        df.loc[is_header]
        .drop(columns='tag')
        .rename(columns={'content': 'header'})
        .reset_index(drop=True)
    )

In [21]:
# execute all
bank_dfs = {
    'barclays': barclays,
    'hsbc': hsbc,
    'santander': santander,
    'natwest': natwest,
    'lloyds': lloyds
}

# Later cells refer to <bank>_header_body and <bank>_headings by name, so bind those
# names in the notebook namespace. Assigning through globals() creates the same
# variables as before without building and executing code strings.
for bank_name, bank_df in bank_dfs.items():
    globals()[f"{bank_name}_header_body"] = extract_header_body_pairs(bank_df)
    globals()[f"{bank_name}_headings"] = extract_all_headers(bank_df)

In [17]:
barclays_header_body.head()

Unnamed: 0,header,body
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...
2,2. Keeping each other informed,In addition to the various ways you can contac...
3,3. Carrying out your instructions,If we receive an instruction that contains inc...
4,4. Making payments out of and into your account,


In [18]:
barclays_headings.head()

Unnamed: 0,header,level
0,Barclays Wealth Management Additional Banking ...,0
1,1. How the Customer Agreement applies to non-p...,0
2,2. Keeping each other informed,0
3,3. Carrying out your instructions,0
4,4. Making payments out of and into your account,0


## Content
Apply YAKE to extract keywords from the entire document's body (after stopword removal and lemmatisation). Search for the keywords in the headings.

Also tested: RAKE, TF-IDF and KeyBERT.

https://ieeexplore.ieee.org/abstract/document/8663040

In [23]:
#pip install nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [24]:
!pip install yake
import yake
import re

Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/60.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Installing collected packages: segtok, yake
Successfully installed segtok-1.5.11 yake-0.4.8


In [None]:
# example to test on
# Row 1 supplies a single heading/body pair; all_body is every body of the document
# joined with spaces.
heading, body = barclays_header_body.loc[1, ['header', 'body']]
all_body = ' '.join(barclays_header_body['body'].tolist())

In [None]:
## function that measures keywords- tested on excerpts

def extract_keywords(heading, all_body):
    """Return the document keywords that occur in a single heading.

    ``all_body`` is lower-cased, tokenised, stripped of non-alphanumeric tokens
    and English stopwords, and lemmatised. YAKE then extracts up to 50
    single-word keywords from the cleaned text. Keywords on the exclusion list
    (bank names and generic banking terms) are dropped, and the rest are kept
    when they occur in the lower-cased heading.

    NOTE(review): the heading check is a plain substring test, so a keyword can
    match inside a longer word. extract_keywords_df matches whole words instead;
    confirm which behaviour is wanted here.

    Returns
    -------
    list of str, or the string "No keywords" when nothing matched.
    """
    tokens = word_tokenize(all_body.lower())
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Keep alphanumeric, non-stopword tokens and reduce each to its lemma.
    cleaned_tokens = []
    for token in tokens:
        if token.isalnum() and token not in english_stopwords:
            cleaned_tokens.append(lemmatizer.lemmatize(token))
    document = ' '.join(cleaned_tokens)

    # YAKE settings: English, single-word keywords, window of 3, top 50.
    kw_extractor = yake.KeywordExtractor(lan="en", n=1, windowsSize=3, top=50)
    ranked_keywords = kw_extractor.extract_keywords(document)

    # Keywords to be excluded
    excluded_keywords = ['barclays', 'hsbc', 'santander', 'natwest', 'lloyds', 'customer', 'banking', 'personal', 'bank', 'account', 'money']

    heading_lower = heading.lower()
    matched_keywords = []
    for keyword, _score in ranked_keywords:
        if keyword.lower() in excluded_keywords:
            continue
        if keyword.lower() in heading_lower:
            matched_keywords.append(keyword)

    return matched_keywords if matched_keywords else "No keywords"

extract_keywords(heading, all_body)

['agreement', 'wealth', 'management']

In [25]:
## function to extract keywords from df

def extract_keywords_df(df):
    """Tag every heading in ``df`` with the document keywords it contains.

    All 'body' texts are joined into one document, which is lower-cased,
    tokenised, stripped of non-alphanumeric tokens and English stopwords, and
    lemmatised. YAKE extracts up to 100 single-word keywords from it. After the
    exclusion list is applied, each lower-cased 'header' is searched for the
    remaining keywords as whole words.

    Two columns are added to ``df`` in place:
      * 'keywords'    - matched keywords joined with ', ' ('' when none match)
      * 'c1_keywords' - 1 when at least one keyword matched, otherwise 0

    NOTE(review): body keywords are lemmatised but headings are matched as
    written, so a plural heading word cannot match its singular keyword.
    Confirm whether that is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the string columns 'header' and 'body'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two columns added.
    """
    # Combine all body texts into one document
    all_body = ' '.join(df['body'])

    # Tokenization and preprocessing for combined body text
    words = word_tokenize(all_body.lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words]
    document = ' '.join(filtered_words)

    # YAKE keyword extraction parameters
    language = "en"
    max_ngram_size = 1
    windowSize = 1
    numOfKeywords = 100

    # Initialize YAKE keyword extractor
    kw_extractor = yake.KeywordExtractor(lan=language,
                                         n=max_ngram_size,
                                         windowsSize=windowSize,
                                         top=numOfKeywords)

    # Extract keywords from the combined body text (the scores are not needed)
    extracted_keywords = kw_extractor.extract_keywords(document)
    keywords = [keyword for keyword, _score in extracted_keywords]

    # Keywords to be excluded
    excluded_keywords = ['barclays', 'hsbc', 'santander', 'natwest', 'lloyds', 'customer', 'banking', 'personal', 'bank', 'account']

    # Filter out excluded keywords
    keywords_modified = [keyword for keyword in keywords if keyword.lower() not in excluded_keywords]

    # Build the whole-word pattern once; it is the same for every heading.
    # With no keywords the alternation would be empty and match the empty string
    # at every word boundary, flagging every heading, so skip matching instead.
    if keywords_modified:
        keyword_pattern = re.compile(
            r'\b(?:{})\b'.format('|'.join(map(re.escape, keywords_modified)))
        )
    else:
        keyword_pattern = None

    # One entry per heading
    matched_keywords_list = []
    matched_keywords_count = []

    for heading in df['header']:
        if keyword_pattern is None:
            matched_keywords = []
        else:
            matched_keywords = keyword_pattern.findall(heading.lower())
        matched_keywords_list.append(', '.join(matched_keywords) if matched_keywords else '')
        matched_keywords_count.append(1 if matched_keywords else 0)

    # Add matched keywords as new columns in the DataFrame
    df['keywords'] = matched_keywords_list
    df['c1_keywords'] = matched_keywords_count

    return df

In [27]:
# execute on all dfs
bank_header_body_dfs = {
    'barclays': barclays_header_body,
    'hsbc': hsbc_header_body,
    'santander': santander_header_body,
    'natwest': natwest_header_body,
    'lloyds': lloyds_header_body
}

# extract_keywords_df adds its columns to the frame it is given and returns that same
# frame. Store the result in this mapping: writing it into bank_dfs would replace the
# raw tag/content frames held there with header/body frames.
for bank_name, bank_df in bank_header_body_dfs.items():
    bank_header_body_dfs[bank_name] = extract_keywords_df(bank_df)

In [28]:
barclays_header_body.head()

Unnamed: 0,header,body,keywords,c1_keywords
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...,"wealth, management, additional",1
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...,"agreement, wealth, management",1
2,2. Keeping each other informed,In addition to the various ways you can contac...,,0
3,3. Carrying out your instructions,If we receive an instruction that contains inc...,,0
4,4. Making payments out of and into your account,,,0


## Language
L1: Count the number of words. Headings should be between 3 and 14 words long (inclusive).

In [29]:
def header_count(heading):
    """Report a heading's word count and whether it is within 3 to 14 words.

    Words are the whitespace-separated pieces of ``heading``. The returned text
    ends in 'Y' when the count is between 3 and 14 inclusive and in 'N'
    otherwise.
    """
    num_words = len(heading.split())
    # Guard clause: anything outside the accepted range fails the check.
    if num_words < 3 or num_words > 14:
        return f"Word count: {num_words}, N"
    return f"Word count: {num_words}, Y"

In [None]:
header_count(heading)

'Word count: 13, Y'

In [30]:
## apply to entire df
def header_count_df(df):
    """Add word-count columns for every heading in ``df``.

    Two columns are added in place:
      * 'length'    - number of whitespace-separated words in 'header'
      * 'l1_length' - 1 when that number is between 3 and 14 inclusive, else 0

    Returns the same DataFrame object that was passed in.
    """
    word_counts = [len(text.split()) for text in df['header']]

    df['length'] = word_counts
    df['l1_length'] = [0 if (count < 3 or count > 14) else 1 for count in word_counts]

    return df

In [31]:
# execute on all dfs
# header_count_df returns the frame it was given with the new columns added. Keep the
# result in bank_header_body_dfs: writing it into bank_dfs would replace the raw
# frames stored there.
for bank_name, bank_df in bank_header_body_dfs.items():
    bank_header_body_dfs[bank_name] = header_count_df(bank_df)

In [32]:
barclays_header_body.head()

Unnamed: 0,header,body,keywords,c1_keywords,length,l1_length
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...,"wealth, management, additional",1,6,1
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...,"agreement, wealth, management",1,13,1
2,2. Keeping each other informed,In addition to the various ways you can contac...,,0,5,1
3,3. Carrying out your instructions,If we receive an instruction that contains inc...,,0,5,1
4,4. Making payments out of and into your account,,,0,9,1


L2: A grammar API is used to verify the grammar checked by the LLM. Bing Spell Check is preferred, but it can only process up to 1,000 requests a month, so I used LanguageTool instead.

Also tried: TextRazor.

https://aclanthology.org/2020.lrec-1.228.pdf

https://pypi.org/project/language-tool-python/

In [33]:
pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.8-py3-none-any.whl (35 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.8


In [41]:
def check_grammar(heading):
    """Run LanguageTool on one piece of text and print every issue it reports.

    For each issue the rule id, message, suggested replacements and surrounding
    context are printed. When there are none, a single confirmation line is
    printed. Nothing is returned.

    NOTE(review): LanguageTool lists British English as 'en-GB'; confirm that
    'en-UK' selects the intended variant.
    """
    # Imported here because the notebook-level import sits in a later cell, so a
    # fresh "Run All" would otherwise reach this function before the name exists.
    import language_tool_python

    tool = language_tool_python.LanguageTool('en-UK')
    try:
        matches = tool.check(heading)
    finally:
        # Each LanguageTool instance starts a background server; shut it down
        # even if the check fails.
        tool.close()

    if len(matches) > 0:
        print(f"Grammar check found {len(matches)} issue(s):")
        for match in matches:
            print(f"Suggestion: {match.ruleId} - {match.message}")
            print(f" Correction: {match.replacements}")
            print(f" Context: {match.context}")
            print()
    else:
        print("No grammar issues found.")

In [None]:
check_grammar(all_body)

Grammar check found 3 issue(s):
Suggestion: COMMA_COMPOUND_SENTENCE - Use a comma before ‘and’ if it connects two independent clauses (unless they are closely connected and short).
 Correction: [', and']
 Context: ... owe us money under an agreement with us and you haven’t paid it back when you shoul...

Suggestion: EN_UNPAIRED_QUOTES - Unpaired symbol: ‘’’ seems to be missing
 Correction: []
 Context: ...e a Wealth customer. When we talk about ‘the Tariff ’ in your agreement with us, ...

Suggestion: COMMA_PARENTHESIS_WHITESPACE - Don’t put a space on both sides of a quote symbol.
 Correction: ['’ ', ' ’']
 Context: ...customer. When we talk about ‘the Tariff ’ in your agreement with us, we mean the W...



In [37]:
## function for df
import language_tool_python

def check_grammar_df(df):
    """Run LanguageTool on every heading in ``df`` and record the findings.

    Two columns are added in place:
      * 'grammar_errors' - list of LanguageTool rule ids reported for the heading
      * 'l2_grammar'     - 1 when at least one issue was reported, otherwise 0

    NOTE(review): here 1 means "issues found", whereas 'c1_keywords' and
    'l1_length' use 1 for a passed check. Confirm the intended polarity.
    NOTE(review): LanguageTool lists British English as 'en-GB'; confirm that
    'en-UK' selects the intended variant.

    Returns the same DataFrame object that was passed in.
    """
    tool = language_tool_python.LanguageTool('en-UK')

    grammar_errors = []
    grammar_check = []

    try:
        for heading in df['header']:
            error_ids = [match.ruleId for match in tool.check(heading)]
            grammar_errors.append(error_ids)
            grammar_check.append(1 if error_ids else 0)
    finally:
        # Each LanguageTool instance starts a background server; shut it down so
        # one call per bank does not leave several servers running.
        tool.close()

    df['grammar_errors'] = grammar_errors
    df['l2_grammar'] = grammar_check

    return df

In [38]:
# execute on all dfs
# check_grammar_df returns the frame it was given with the new columns added. Keep the
# result in bank_header_body_dfs: writing it into bank_dfs would replace the raw
# frames stored there.
for bank_name, bank_df in bank_header_body_dfs.items():
    bank_header_body_dfs[bank_name] = check_grammar_df(bank_df)

Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:07<00:00, 33.6MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpytch9_v0.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /root/.cache/language_tool_python.


In [39]:
barclays_header_body.head()

Unnamed: 0,header,body,keywords,c1_keywords,length,l1_length,grammar_errors,l2_grammar
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...,"wealth, management, additional",1,6,1,[],0
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...,"agreement, wealth, management",1,13,1,[],0
2,2. Keeping each other informed,In addition to the various ways you can contac...,,0,5,1,[],0
3,3. Carrying out your instructions,If we receive an instruction that contains inc...,,0,5,1,[],0
4,4. Making payments out of and into your account,,,0,9,1,[],0


L3: The PassivePy package is used to detect any instance of passive voice. *Amplifi also has some code*

https://myscp.onlinelibrary.wiley.com/doi/full/10.1002/jcpy.1377

https://mitramir55.github.io/PassivePyManualWebsite/

In [40]:
!pip install -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt
!pip install PassivePy==0.2.2

Collecting en_core_web_lg (from -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt (line 5))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0.tar.gz (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m977.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy==3.4.1 (from -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt (line 1))
  Downloading spacy-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-legacy==3.0.10 (from -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt (line 2))
  Do

Collecting PassivePy==0.2.2
  Downloading PassivePy-0.2.2-py3-none-any.whl (13 kB)
Installing collected packages: PassivePy
Successfully installed PassivePy-0.2.2


In [42]:
from PassivePySrc import PassivePy
spacy_model = "en_core_web_lg"
passivepy = PassivePy.PassivePyAnalyzer(spacy_model)

# passive detection function
def passive_detection_df(df):
    """Run PassivePy over the 'header' column and store its findings on ``df``.

    The module-level ``passivepy`` analyzer is called at corpus level. From its
    result the 'binary' column is copied into 'l2_passive' and the
    'all_passives' column into 'passive_voice', both in place.

    NOTE(review): the notebook text labels passive voice as check L3, while the
    column is named 'l2_passive'. Confirm the intended name before relying on it.
    'binary' is presumably a 0/1 flag per heading (TODO confirm against PassivePy).

    Returns the same DataFrame object that was passed in.
    """
    detection_options = dict(
        column_name='header',
        n_process=1,
        batch_size=32,
        add_other_columns=True,
        truncated_passive=False,
        full_passive=False,
    )
    detected = passivepy.match_corpus_level(df, **detection_options)

    df['l2_passive'] = detected['binary']
    df['passive_voice'] = detected['all_passives']
    return df

In [43]:
# passive_detection_df returns the frame it was given with the new columns added. Keep
# the result in bank_header_body_dfs: writing it into bank_dfs would replace the raw
# frames stored there.
for bank_name, bank_df in bank_header_body_dfs.items():
    bank_header_body_dfs[bank_name] = passive_detection_df(bank_df)

Detecting Sentences...


  0%|          | 0/18 [00:00<?, ?it/s]

Total number of sentences = 22
[32mStarting to find passives...[0m


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Detecting Sentences...


  0%|          | 0/8 [00:00<?, ?it/s]

Total number of sentences = 8
[32mStarting to find passives...[0m


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

Detecting Sentences...


  0%|          | 0/43 [00:00<?, ?it/s]

Total number of sentences = 43
[32mStarting to find passives...[0m


  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

Detecting Sentences...


  0%|          | 0/20 [00:00<?, ?it/s]

Total number of sentences = 20
[32mStarting to find passives...[0m


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Detecting Sentences...


  0%|          | 0/8 [00:00<?, ?it/s]

Total number of sentences = 10
[32mStarting to find passives...[0m


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [44]:
barclays_header_body.head()

Unnamed: 0,header,body,keywords,c1_keywords,length,l1_length,grammar_errors,l2_grammar,l2_passive,passive_voice
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...,"wealth, management, additional",1,6,1,[],0,0,[]
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...,"agreement, wealth, management",1,13,1,[],0,0,[]
2,2. Keeping each other informed,In addition to the various ways you can contac...,,0,5,1,[],0,0,[]
3,3. Carrying out your instructions,If we receive an instruction that contains inc...,,0,5,1,[],0,0,[]
4,4. Making payments out of and into your account,,,0,9,1,[],0,0,[]


## Structure
Count and compare the number of keywords in main headings and subheadings, looking for overlaps and a greater number generated. Additionally, the average word count of subheadings should be longer than the word count of the main heading.

In [46]:
# joining relevant columns
bank_headings_dfs = {
    'barclays': barclays_headings,
    'hsbc': hsbc_headings,
    'santander': santander_headings,
    'natwest': natwest_headings,
    'lloyds': lloyds_headings
}

# Copy the keyword string and word count of every heading from the header/body frame
# onto the matching headings frame. bank_header_body_dfs holds the same frame objects
# as the <bank>_header_body variables, so no code strings need to be executed.
for bank_name, headings_df in bank_headings_dfs.items():
    header_body_df = bank_header_body_dfs[bank_name]
    headings_df['keywords'] = header_body_df['keywords']
    headings_df['length'] = header_body_df['length']

In [47]:
barclays_headings.head()

Unnamed: 0,header,level,keywords,length
0,Barclays Wealth Management Additional Banking ...,0,"wealth, management, additional",6
1,1. How the Customer Agreement applies to non-p...,0,"agreement, wealth, management",13
2,2. Keeping each other informed,0,,5
3,3. Carrying out your instructions,0,,5
4,4. Making payments out of and into your account,0,,9


In [None]:
# Every heading starts at a difference of zero; only headings with a parent are updated
barclays_headings['word_count_diff'] = 0

heading_levels = barclays_headings['level'].tolist()
heading_lengths = barclays_headings['length'].tolist()

# The parent of a heading is the closest earlier heading with a strictly smaller level
for i in range(1, len(barclays_headings)):
    parent_position = next(
        (j for j in range(i - 1, -1, -1) if heading_levels[j] < heading_levels[i]),
        None,
    )
    if parent_position is None:
        continue
    barclays_headings.at[i, 'word_count_diff'] = heading_lengths[i] - heading_lengths[parent_position]

# Display the updated DataFrame
barclays_headings

Unnamed: 0,header,level,keywords,length,word_count_diff
0,Barclays Wealth Management Additional Banking ...,0,"wealth, management, additional",6,0
1,1. How the Customer Agreement applies to non-p...,0,"agreement, wealth, management",13,0
2,2. Keeping each other informed,0,,5,0
3,3. Carrying out your instructions,0,,5,0
4,4. Making payments out of and into your account,0,,9,0
5,Making payments out of your account,2,,6,-3
6,Payments into your account,2,,4,-5
7,Wealth Management,2,"wealth, management",2,-7
8,International payments,3,international,2,0
9,5. Borrowing on a joint account,0,,6,0


In [None]:
#checking for keywords

# Initialize a new column for overlapping keywords
barclays_headings['overlapping_keywords'] = ""

# Iterate and find overlapping keywords
for i in range(1, len(barclays_headings)):
    current_row = barclays_headings.iloc[i]
    # The parent is the closest earlier heading with a strictly smaller level
    previous_row_index = None
    for j in range(i - 1, -1, -1):
        if barclays_headings.iloc[j]['level'] < current_row['level']:
            previous_row_index = j
            break
    if previous_row_index is not None:
        previous_row = barclays_headings.iloc[previous_row_index]
        current_keywords = current_row['keywords'].split(', ') if current_row['keywords'] else []
        previous_keywords = previous_row['keywords'].split(', ') if previous_row['keywords'] else []
        # Keep the shared keywords in the order they appear in the current heading.
        # Joining a raw set gives an order that can change from one run to the next.
        parent_keyword_set = set(previous_keywords)
        overlap = list(dict.fromkeys(k for k in current_keywords if k in parent_keyword_set))
        barclays_headings.at[i, 'overlapping_keywords'] = ', '.join(overlap)

# Display the updated DataFrame
barclays_headings

Unnamed: 0,header,level,keywords,length,word_count_diff,overlapping_keywords
0,Barclays Wealth Management Additional Banking ...,0,"wealth, management, additional",6,0,
1,1. How the Customer Agreement applies to non-p...,0,"agreement, wealth, management",13,0,
2,2. Keeping each other informed,0,,5,0,
3,3. Carrying out your instructions,0,,5,0,
4,4. Making payments out of and into your account,0,,9,0,
5,Making payments out of your account,2,,6,-3,
6,Payments into your account,2,,4,-5,
7,Wealth Management,2,"wealth, management",2,-7,
8,International payments,3,international,2,0,
9,5. Borrowing on a joint account,0,,6,0,


In [55]:
def _nearest_shallower_positions(levels):
    """For each position, find the closest earlier position with a strictly smaller level (None if there is none)."""
    parents = []
    open_positions = []  # candidate parents; their levels strictly increase towards the end
    for position, level in enumerate(levels):
        # A heading at the same or a deeper level can never be the parent of this
        # or any later heading, so discard it.
        while open_positions and levels[open_positions[-1]] >= level:
            open_positions.pop()
        parents.append(open_positions[-1] if open_positions else None)
        open_positions.append(position)
    return parents


def _split_keywords(value):
    """Turn a ', '-joined keyword string into a list; empty or non-string values give []."""
    return value.split(', ') if isinstance(value, str) and value else []


def create_subheadings_df(df):
    """Compare every heading with its parent heading and score the result.

    The parent of a heading is the closest earlier row whose 'level' is strictly
    smaller. Three columns are written to ``df`` in place:
      * 'overlapping_keywords' - keywords shared with the parent, joined with
        ', ' in the order they appear in the heading itself ('' when there is
        no parent or nothing is shared)
      * 'word_count_diff'      - heading 'length' minus the parent's 'length'
        (0 when there is no parent)
      * 's1_subheadings'       - 1 when 'level' is not 0, the difference is not
        negative and at least one keyword is shared; otherwise 0

    Rows are handled by position, so the frame's index labels do not matter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'level', 'keywords' and 'length'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the three columns set.
    """
    levels = df['level'].tolist()
    lengths = df['length'].tolist()
    keyword_lists = [_split_keywords(value) for value in df['keywords']]
    parents = _nearest_shallower_positions(levels)

    overlaps = []
    diffs = []
    flags = []
    for position, parent in enumerate(parents):
        if parent is None:
            overlap_text = ""
            diff = 0
        else:
            parent_keywords = set(keyword_lists[parent])
            # dict.fromkeys removes repeats while keeping first-seen order, so the
            # joined text is the same on every run (a raw set has no fixed order).
            shared = dict.fromkeys(
                keyword for keyword in keyword_lists[position] if keyword in parent_keywords
            )
            overlap_text = ', '.join(shared)
            diff = lengths[position] - lengths[parent]

        overlaps.append(overlap_text)
        diffs.append(diff)
        flags.append(1 if (levels[position] != 0 and diff >= 0 and overlap_text != "") else 0)

    df['overlapping_keywords'] = overlaps
    df['word_count_diff'] = diffs
    df['s1_subheadings'] = flags

    return df

In [56]:
create_subheadings_df(barclays_headings)

Unnamed: 0,header,level,keywords,length,overlapping_keywords,word_count_diff,s1_subheadings
0,Barclays Wealth Management Additional Banking ...,0,"wealth, management, additional",6,,0,0
1,1. How the Customer Agreement applies to non-p...,0,"agreement, wealth, management",13,,0,0
2,2. Keeping each other informed,0,,5,,0,0
3,3. Carrying out your instructions,0,,5,,0,0
4,4. Making payments out of and into your account,0,,9,,0,0
5,Making payments out of your account,2,,6,,-3,0
6,Payments into your account,2,,4,,-5,0
7,Wealth Management,2,"wealth, management",2,,-7,0
8,International payments,3,international,2,,0,0
9,5. Borrowing on a joint account,0,,6,,0,0


In [57]:
# Barclays is left out here because create_subheadings_df was already run on
# barclays_headings in an earlier cell.
bank_headings_dfs = {
    'hsbc': hsbc_headings,
    'santander': santander_headings,
    'natwest': natwest_headings,
    'lloyds': lloyds_headings
}

# create_subheadings_df returns the frame it was given with the new columns set. Keep
# the result in bank_headings_dfs: writing it into bank_dfs would replace the raw
# frames stored there with headings frames.
for bank_name, bank_df in bank_headings_dfs.items():
    bank_headings_dfs[bank_name] = create_subheadings_df(bank_df)

In [58]:
hsbc_headings.head()

Unnamed: 0,header,level,keywords,length,overlapping_keywords,word_count_diff,s1_subheadings
0,How and when will we give you the loan?,0,"give, loan",9,,0,0
1,What is the cost of the loan?,0,"cost, loan",7,,0,0
2,How must you repay the loan?,0,"repay, loan",6,,0,0
3,What should you do if your details change?,0,change,8,,0,0
4,Personal Loans,0,,2,,0,0


In [64]:
# Barclays is written separately because the bank_headings_dfs mapping defined for the
# subheading step does not include it. index=False leaves the row index out of the file.
barclays_headings.to_csv('barclays_headings_processed.csv', index= False)

In [62]:
# Write one "<bank>_headings_processed.csv" per entry of bank_headings_dfs into the
# working directory, without the row index, and report each file as it is saved.
for bank_name, bank_df in bank_headings_dfs.items():
    file_name = f"{bank_name}_headings_processed.csv"
    bank_df.to_csv(file_name, index=False)
    print(f"Saved {bank_name} DataFrame to {file_name}")

Saved hsbc DataFrame to hsbc_headings_processed.csv
Saved santander DataFrame to santander_headings_processed.csv
Saved natwest DataFrame to natwest_headings_processed.csv
Saved lloyds DataFrame to lloyds_headings_processed.csv


In [63]:
# Write one "<bank>_header_body_processed.csv" per entry of bank_header_body_dfs into
# the working directory, without the row index, and report each file as it is saved.
for bank_name, bank_df in bank_header_body_dfs.items():
    file_name = f"{bank_name}_header_body_processed.csv"
    bank_df.to_csv(file_name, index=False)
    print(f"Saved {bank_name} DataFrame to {file_name}")

Saved barclays DataFrame to barclays_header_body_processed.csv
Saved hsbc DataFrame to hsbc_header_body_processed.csv
Saved santander DataFrame to santander_header_body_processed.csv
Saved natwest DataFrame to natwest_header_body_processed.csv
Saved lloyds DataFrame to lloyds_header_body_processed.csv
