In [1]:
import pandas as pd
import numpy as np
import sklearn
import random

# Seed both Python's `random` module and NumPy's global RNG with the same
# constant so that any randomised step gives the same result on a re-run.
RAND_SEED = 49
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)

## Importing Files

In [122]:
# Import a sample of three document CSVs, one per bank product.
# Paths are relative to the notebook's working directory.
# NOTE(review): the preview of `barclays` shows `tag`, `content` and `level`
# columns; the other two files are assumed to share that layout — confirm.
barclays = pd.read_csv('./documents/Barclays_Wealth_Management.csv')
hsbc = pd.read_csv('./documents/HSBC_Loans.csv')
santander = pd.read_csv('./documents/Santander_Cashback.csv')

In [123]:
# Preview the first rows to check the parsed columns (tag, content, level).
barclays.head()

Unnamed: 0,tag,content,level
0,header,Barclays Wealth Management Additional Banking ...,0
1,para,These additional terms and conditions apply to...,0
2,header,1. How the Customer Agreement applies to non-p...,0
3,para,Section 1 of the Customer Agreement says that ...,0
4,header,2. Keeping each other informed,0


In [124]:
# Cast the text columns to str so they can be compared and joined downstream.
# Missing values are blanked first: a bare astype('str') turns NaN into the
# literal text 'nan', which would then leak into section bodies and into the
# keyword extraction.
for frame in (barclays, hsbc, santander):
    frame[['tag', 'content']] = frame[['tag', 'content']].fillna('').astype('str')

In [125]:
## define first dataframe creation

# header and body pairs
def extract_header_body_pairs(df):
    """Pair every header row with the text that follows it.

    Rows are read in order. A row whose ``tag`` equals ``'header'`` opens a new
    section; every other row contributes its ``content`` to the section that is
    currently open. Rows that appear before the first header are discarded.

    Returns a new DataFrame with one row per header and two columns:
    ``header`` (the header text) and ``body`` (the section's contents joined
    with single spaces, or ``''`` when no rows follow the header).
    """
    sections = []  # [header_text, [body fragments]] in document order

    for tag, text in zip(df['tag'], df['content']):
        if tag == 'header':
            sections.append([text, []])
        elif sections:
            sections[-1][1].append(text)

    # A header whose content is None does not count as an open section,
    # so it and the rows under it are left out.
    kept = [(title, parts) for title, parts in sections if title is not None]

    return pd.DataFrame({
        'header': [title for title, _ in kept],
        'body': [' '.join(parts) for _, parts in kept],
    })

In [126]:
# headings only df
def extract_all_headers(df):
    """Return only the header rows of ``df``, with ``content`` renamed to ``header``.

    The ``tag`` column is dropped; every other column and the original row
    index are kept. ``df`` itself is not modified.
    """
    is_header = df['tag'] == 'header'
    return (
        df.loc[is_header]
        .drop(columns='tag')
        .rename(columns={'content': 'header'})
    )

In [127]:
# extract for csv files
# Only the Barclays sample is processed here; `hsbc` and `santander` are
# loaded above but are not passed through these helpers in this cell.
barclays_header_body = extract_header_body_pairs(barclays)
barclays_headings = extract_all_headers(barclays)

In [50]:
#drop nulls from header_body pairs because they don't have a pair to compare to
#barclays_header_body.replace('', np.nan, inplace=True)
#barclays_header_body.dropna(inplace= True)

In [190]:
# NOTE(review): the saved output of this cell already shows keyword columns
# (matched_keywords, keywords, c1_keywords). `keywords` and `c1_keywords` are
# only added further down by extract_keywords_df, so this cell was last run out
# of order. `matched_keywords` is not created by any code visible in this
# notebook — confirm it is stale kernel state.
barclays_header_body.head()

Unnamed: 0,header,body,matched_keywords,keywords,c1_keywords
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...,"wealth, management, additional","wealth, management, additional",1
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...,"agreement, wealth, management","agreement, wealth, management",1
2,2. Keeping each other informed,In addition to the various ways you can contac...,,,0
3,3. Carrying out your instructions,If we receive an instruction that contains inc...,,,0
4,4. Making payments out of and into your account,,,,0


## Content
Apply YAKE to extract keywords from the entire document's body (after stopword removal and lemmatisation). Search for the keywords in the headings.

Also tested: RAKE, TF-IDF and KeyBERT.

https://ieeexplore.ieee.org/abstract/document/8663040

In [10]:
#pip install nltk
# NLTK pieces used by the keyword functions below: a word tokenizer, the
# English stop-word list and the WordNet lemmatizer. Each download call
# fetches the data package named in its argument.
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/kimbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kimbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kimbo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [160]:
import yake
import re

In [60]:
# example to test on
# Row 1 of the Barclays header/body pairs supplies a sample heading and body;
# `all_body` is every section body joined into one whole-document string.
heading = barclays_header_body['header'][1]
body = barclays_header_body['body'][1]
all_body = ' '.join(barclays_header_body['body'])

In [119]:
## function that measures keywords- tested on excerpts

def extract_keywords(heading, all_body):
    """Return the document keywords that occur as whole words in ``heading``.

    ``all_body`` is lower-cased, tokenised, stripped of English stop words and
    non-alphanumeric tokens, and lemmatised. YAKE is then asked for up to 50
    keywords with an n-gram size of 1 from that text. Bank names and a few
    generic banking terms are removed before matching.

    Parameters
    ----------
    heading : str
        Heading text to search.
    all_body : str
        Full document body text from which keywords are extracted.

    Returns
    -------
    list of str or str
        The matched keywords in the order YAKE returned them, or the string
        ``"No keywords"`` when nothing matches.
    """
    # Tokenization and preprocessing for body
    words = word_tokenize(all_body.lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words]
    document = ' '.join(filtered_words)

    # YAKE keyword extraction
    language = "en"
    max_ngram_size = 1
    windowSize = 3
    numOfKeywords = 50

    kw_extractor = yake.KeywordExtractor(lan=language,
                                         n=max_ngram_size,
                                         windowsSize=windowSize,
                                         top=numOfKeywords)

    extracted_keywords = kw_extractor.extract_keywords(document)
    keywords = [keyword for keyword, score in extracted_keywords]  # Extract the keywords from tuples

    # Keywords to be excluded
    excluded_keywords = ['barclays', 'hsbc', 'santander', 'natwest', 'lloyds', 'customer', 'banking', 'personal', 'bank', 'account', 'money']

    # Filter out excluded keywords
    keywords_modified = [keyword for keyword in keywords if keyword.lower() not in excluded_keywords]

    # Find matched keywords in the heading. Matching is done on word
    # boundaries, as in extract_keywords_df: a plain substring test reports
    # false hits such as 'age' inside 'management'. As there, inflected forms
    # in the heading (e.g. plurals) do not match a lemmatised keyword.
    heading_lower = heading.lower()
    matched_keywords = [
        keyword for keyword in keywords_modified
        if re.search(r'\b{}\b'.format(re.escape(keyword.lower())), heading_lower)
    ]

    if matched_keywords:
        return matched_keywords
    else:
        return "No keywords"

extract_keywords(heading, all_body)

In [186]:
## function to extract keywords from df

def extract_keywords_df(df):
    """Annotate each heading in ``df`` with the document keywords it contains.

    All ``body`` values are joined into one document, which is lower-cased,
    tokenised, stripped of English stop words and non-alphanumeric tokens, and
    lemmatised. YAKE is asked for up to 100 keywords with an n-gram size of 1;
    bank names and a few generic banking terms are then removed. Each
    lower-cased ``header`` is searched for the remaining keywords on word
    boundaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``header`` and ``body``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with two columns:
        ``keywords`` (matches joined by ``', '`` in order of appearance in the
        heading, ``''`` when there are none) and ``c1_keywords`` (1 when at
        least one keyword matched, otherwise 0).
    """
    # Combine all body texts into one document
    all_body = ' '.join(df['body'])

    # Tokenization and preprocessing for combined body text
    words = word_tokenize(all_body.lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words]
    document = ' '.join(filtered_words)

    # YAKE keyword extraction parameters
    language = "en"
    max_ngram_size = 1
    windowSize = 1
    numOfKeywords = 100

    # Initialize YAKE keyword extractor
    kw_extractor = yake.KeywordExtractor(lan=language,
                                         n=max_ngram_size,
                                         windowsSize=windowSize,
                                         top=numOfKeywords)

    # Extract keywords from the combined body text
    extracted_keywords = kw_extractor.extract_keywords(document)
    keywords = [keyword for keyword, score in extracted_keywords]  # Extract the keywords from tuples

    # Keywords to be excluded
    excluded_keywords = ['barclays', 'hsbc', 'santander', 'natwest', 'lloyds', 'customer', 'banking', 'personal', 'bank', 'account']

    # Filter out excluded keywords
    keywords_modified = [keyword for keyword in keywords if keyword.lower() not in excluded_keywords]

    # Compile the alternation once; it does not depend on the heading.
    # With no keywords left the pattern would degenerate to r'\b(?:)\b', which
    # matches the empty string at every word boundary and would mark every
    # heading as a hit, so in that case no pattern is built at all.
    pattern = None
    if keywords_modified:
        pattern = re.compile(r'\b(?:{})\b'.format('|'.join(map(re.escape, keywords_modified))))

    # List to store matched keywords for each row
    matched_keywords_list = []
    matched_keywords_count = []

    # Iterate over each heading to find matched keywords
    # NOTE(review): keywords are lemmatised but headings are matched as
    # written, so a heading containing e.g. 'payments' will not match the
    # keyword 'payment' — confirm whether headings should be lemmatised too.
    for heading in df['header']:
        matched_keywords = pattern.findall(heading.lower()) if pattern is not None else []
        matched_keywords_list.append(', '.join(matched_keywords) if matched_keywords else '')
        matched_keywords_count.append(1 if matched_keywords else 0)

    # Add matched keywords as a new column in the DataFrame
    df['keywords'] = matched_keywords_list
    df['c1_keywords'] = matched_keywords_count

    return df

In [187]:
# Adds the `keywords` / `c1_keywords` columns to barclays_header_body in place;
# the returned frame is displayed as the cell output.
extract_keywords_df(barclays_header_body)

Unnamed: 0,header,body,matched_keywords,keywords,c1_keywords
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...,"wealth, management, additional","wealth, management, additional",1
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...,"agreement, wealth, management","agreement, wealth, management",1
2,2. Keeping each other informed,In addition to the various ways you can contac...,,,0
3,3. Carrying out your instructions,If we receive an instruction that contains inc...,,,0
4,4. Making payments out of and into your account,,,,0
5,Making payments out of your account,4.1 The SEPA Business to Business Direct Debit...,,,0
6,Payments into your account,4.2 If your account is in an EEA currency othe...,,,0
7,Wealth Management,4.4 If one of the following types of payment i...,"wealth, management","wealth, management",1
8,International payments,4.6 To find out more about how we calculate fo...,international,international,1
9,5. Borrowing on a joint account,5.1 Individuals who are authorised to give ins...,,,0


## Language
L1: Count the number of words. Headings should be between 3 and 14 words long (inclusive).

In [17]:
def header_count(heading, min_words=3, max_words=14):
    """Report a heading's word count and whether it falls inside the allowed range.

    Words are the whitespace-separated pieces of ``heading``.

    Parameters
    ----------
    heading : str
        Heading text to measure.
    min_words, max_words : int, optional
        Inclusive bounds on the accepted word count (defaults: 3 and 14).

    Returns
    -------
    str
        ``"Word count: <n>, Y"`` when ``min_words <= n <= max_words``,
        otherwise ``"Word count: <n>, N"``.
    """
    num_words = len(heading.split())
    verdict = "Y" if min_words <= num_words <= max_words else "N"
    return f"Word count: {num_words}, {verdict}"

In [18]:
# Check the sample heading selected earlier (row 1 of the Barclays pairs).
header_count(heading)

'Word count: 13, Y'

In [191]:
## apply to entire df
def header_count_df(df, min_words=3, max_words=14):
    """Add heading word counts and a length-check flag to ``df``.

    Words are the whitespace-separated pieces of each ``header`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``header``.
    min_words, max_words : int, optional
        Inclusive bounds on the accepted word count (defaults: 3 and 14).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with two columns:
        ``length`` (word count of the heading) and ``l1_length`` (1 when
        ``min_words <= length <= max_words``, otherwise 0).
    """
    word_counts = [len(heading.split()) for heading in df['header']]

    df['length'] = word_counts
    df['l1_length'] = [int(min_words <= count <= max_words) for count in word_counts]

    return df

In [192]:
# Adds the `length` / `l1_length` columns to barclays_header_body in place;
# the returned frame is displayed as the cell output.
header_count_df(barclays_header_body)

Unnamed: 0,header,body,matched_keywords,keywords,c1_keywords,length,l1_length
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...,"wealth, management, additional","wealth, management, additional",1,6,1
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...,"agreement, wealth, management","agreement, wealth, management",1,13,1
2,2. Keeping each other informed,In addition to the various ways you can contac...,,,0,5,1
3,3. Carrying out your instructions,If we receive an instruction that contains inc...,,,0,5,1
4,4. Making payments out of and into your account,,,,0,9,1
5,Making payments out of your account,4.1 The SEPA Business to Business Direct Debit...,,,0,6,1
6,Payments into your account,4.2 If your account is in an EEA currency othe...,,,0,4,1
7,Wealth Management,4.4 If one of the following types of payment i...,"wealth, management","wealth, management",1,2,0
8,International payments,4.6 To find out more about how we calculate fo...,international,international,1,2,0
9,5. Borrowing on a joint account,5.1 Individuals who are authorised to give ins...,,,0,6,1


L2: A grammar API will be used to verify the grammar checked by the LLM. Bing Spell Check is preferred, but its free version can only process up to 1,000 requests a month.

I used LanguageTool instead. Also tried: TextRazor.

https://aclanthology.org/2020.lrec-1.228.pdf

https://languagetool.org/http-api/#/default

https://www.geeksforgeeks.org/grammar-checker-in-python-using-language-check/

In [200]:
import os
import textrazor

# Read the TextRazor API key from the environment so that no credential is
# stored in the notebook. Set TEXTRAZOR_API_KEY before running this cell; a
# missing variable raises KeyError here rather than failing later on a request.
# NOTE(review): any key that was ever committed in this notebook should be
# treated as exposed and rotated.
textrazor.api_key = os.environ['TEXTRAZOR_API_KEY']

In [217]:
# Function to analyze a sentence
def analyze_sentence(sentence):
    """Print TextRazor spelling suggestions scoring above 0.5 for ``sentence``.

    The text is sent to TextRazor with the 'words' and 'spelling' extractors.
    For a successful response, every word that carries at least one suggestion
    above the threshold is printed together with those suggestions and their
    scores. For an unsuccessful response the failure message is printed
    instead. Nothing is returned.
    """
    client = textrazor.TextRazor(extractors=["words", "spelling"])
    response = client.analyze(sentence)

    # Bail out early when the request did not succeed.
    if not response.ok:
        print(f"TextRazor request failed: {response.message}")
        return

    print("TextRazor analysis complete.")

    for parsed_sentence in response.json['response']['sentences']:
        for word in parsed_sentence['words']:
            if 'spellingSuggestions' not in word:
                continue

            token = word['token']
            # Keep only suggestions with a score above 0.5.
            strong = [
                (candidate['suggestion'], candidate['score'])
                for candidate in word['spellingSuggestions']
                if candidate['score'] > 0.5
            ]
            if not strong:
                continue

            print(f"Word '{token}' has spelling suggestions:")
            for suggestion, score in strong:
                print(f"- {suggestion} (Score: {score})")

In [214]:
# Run the spelling check over the whole-document text. In the saved output the
# only flagged tokens are the acronyms 'SEPA' and 'EEA'.
analyze_sentence(all_body)

TextRazor analysis complete.
Word 'SEPA' has spelling suggestions:
- sea (Score: 0.7331)
- spa (Score: 0.5816)
Word 'EEA' has spelling suggestions:
- sea (Score: 0.657)
- era (Score: 0.6335)


In [222]:
pip install language-tool-python

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting language-tool-python
  Downloading language_tool_python-2.8-py3-none-any.whl.metadata (12 kB)
Downloading language_tool_python-2.8-py3-none-any.whl (35 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.8
Note: you may need to restart the kernel to use updated packages.


In [224]:
import language_tool_python
# NOTE(review): the saved output of this cell is a CalledProcessError from
# `/usr/bin/java -version`, i.e. the cell failed while invoking Java — confirm
# a working Java runtime is installed before relying on this check.
tool = language_tool_python.LanguageTool('en-US')

# Deliberately misspelled sentence used as a quick check of the tool.
text = "Your the best but their are allso  good !"
matches = tool.check(text)
len(matches)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


CalledProcessError: Command '['/usr/bin/java', '-version']' returned non-zero exit status 1.

L3: PassivePy package will be used to detect any instance of passive voice. *Amplifi also has some code*

https://myscp.onlinelibrary.wiley.com/doi/full/10.1002/jcpy.1377

In [22]:
!pip install -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt
!pip install PassivePy==0.2.2

Defaulting to user installation because normal site-packages is not writeable
Collecting en_core_web_lg (from -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt (line 5))
  Using cached en_core_web_lg-3.4.0-py3-none-any.whl
Collecting spacy==3.4.1 (from -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt (line 1))
  Using cached spacy-3.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (24 kB)
Collecting spacy-legacy==3.0.10 (from -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt (line 2))
  Using cached spacy_legacy-3.0.10-py2.py3-none-any.whl.metadata (2.8 kB)
Using cached spacy-3.4.1-cp39-cp39-macosx_11_0_arm64.whl (6.4 MB)
Using cached spacy_legacy-3.0.10-py2.py3-none-any.whl (21 kB)
Installing collected packages: spacy-legacy, spacy
  Attempting uninstall: spacy-legacy
    Found existing installation: spacy-lega

In [225]:
# NOTE(review): this cell's saved output is "ImportError: cannot import name
# util", so the PassivePy check below has not run successfully in this notebook.
from PassivePySrc import PassivePy

spacy_model = "en_core_web_lg"
passivepy = PassivePy.PassivePyAnalyzer(spacy_model)
# Analyse the sample heading, looking for both full and truncated passives.
result = passivepy.match_text(heading, full_passive=True, truncated_passive=True)
# Draft of the pass/fail rule. It uses `return`, so it has to be moved inside
# a function before it can be enabled.
#passive_count = result['passive_count']

#if passive_count >= 1:
    #return f"Instances of passive voice detected: {passive_count}, N"
#else:
    #return f"Instances of passive voice detected: {passive_count}, Y"

ImportError: cannot import name util

In [None]:
## apply to entire df

## Structure
Count and compare the number of keywords in main headings and subheadings, looking for overlaps and a greater number generated. Additionally, the average word count of subheadings should be longer than the word count of the main heading. 

In [None]:
# split the dataframe and append keywords and word count columns

In [None]:
# function that looks for overlapping keywords, keyword count and the average word count