In [2]:
import pandas as pd
import numpy as np
import sklearn
import random

# One shared seed, applied to both Python's `random` module and NumPy's
# global random state, so that any random draws repeat between runs.
RAND_SEED = 49
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)

## Importing Files

In [3]:
# Import a sample of the document files, one CSV per bank product.
# The paths are relative to the notebook's working directory.
barclays = pd.read_csv('./documents/Barclays_Wealth_Management.csv')
hsbc = pd.read_csv('./documents/HSBC_Loans.csv')
santander = pd.read_csv('./documents/Santander_Cashback.csv')

In [4]:
barclays.head()

Unnamed: 0,tag,content,level
0,header,Barclays Wealth Management Additional Banking ...,0
1,para,These additional terms and conditions apply to...,0
2,header,1. How the Customer Agreement applies to non-p...,0
3,para,Section 1 of the Customer Agreement says that ...,0
4,header,2. Keeping each other informed,0


In [5]:
# Coerce the text columns to str so they can be compared and joined later.
# Missing values are replaced with '' first: astype('str') on its own turns
# NaN into the literal string 'nan', which would then appear inside the
# joined section bodies.
barclays[['tag', 'content']] = barclays[['tag', 'content']].fillna('').astype('str')

In [6]:
## define first dataframe creation

# header and body pairs
def extract_header_body_pairs(df):
    """Pair every header row with the text of the rows that follow it.

    Rows are read in order. A row whose ``tag`` equals ``'header'`` opens a
    new section; each following non-header row contributes its ``content``
    to that section's body until the next header appears. Body fragments are
    joined with single spaces. Content that appears before the first header
    is not attached to any section and is therefore not returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``tag`` and ``content`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per header, with columns ``header`` and ``body``. A header
        with no following text gets an empty-string body.
    """
    sections = []        # finished (header, body) tuples, in document order
    open_header = None   # header of the section currently being collected
    fragments = []       # body text gathered for the open section

    for tag, content in zip(df['tag'], df['content']):
        if tag != 'header':
            fragments.append(content)
            continue

        # A new header closes the section that was open (if there was one).
        if open_header is not None:
            sections.append((open_header, ' '.join(fragments)))
        open_header = content
        fragments = []

    # The last section is closed by the end of the document, not by a header.
    if open_header is not None:
        sections.append((open_header, ' '.join(fragments)))

    return pd.DataFrame({
        'header': [header for header, _ in sections],
        'body': [text for _, text in sections],
    })

In [7]:
# headings only df
def extract_all_headers(df):
    """Return the header rows of ``df`` as a new frame.

    Keeps only the rows whose ``tag`` equals ``'header'``, drops the ``tag``
    column and renames ``content`` to ``header``. Every other column, and the
    original row index, is carried over unchanged. ``df`` is not modified.
    """
    is_header = df['tag'] == 'header'
    return (
        df.loc[is_header]
        .drop(columns='tag')
        .rename(columns={'content': 'header'})
    )

In [8]:
# Build both views of the Barclays document:
#   - barclays_header_body: one row per header with its joined body text
#   - barclays_headings:    the header rows only
barclays_header_body = extract_header_body_pairs(barclays)
barclays_headings = extract_all_headers(barclays)

In [9]:
barclays_header_body.head()

Unnamed: 0,header,body
0,Barclays Wealth Management Additional Banking ...,These additional terms and conditions apply to...
1,1. How the Customer Agreement applies to non-p...,Section 1 of the Customer Agreement says that ...
2,2. Keeping each other informed,In addition to the various ways you can contac...
3,3. Carrying out your instructions,If we receive an instruction that contains inc...
4,4. Making payments out of and into your account,


## Content
Apply TF-IDF to extract keywords from the body text (after stopword removal and lemmatisation). Search for the keywords in the headings.

https://ieeexplore.ieee.org/abstract/document/8663040

In [11]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
import nltk
# Tokenizer data — presumably what word_tokenize needs; confirm for your NLTK version.
# (The log below shows the download is skipped when the package is already present.)
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
# Stop-word lists; the cells below load the 'english' list.
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
# WordNet data — presumably what WordNetLemmatizer reads; confirm for your NLTK version.
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/kimbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kimbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kimbo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Take the section at index label 1 as the worked example for the checks below.
heading = barclays_header_body['header'][1]
body = barclays_header_body['body'][1]

In [14]:
# Concatenate every section body into one space-separated string.
all_body = ' '.join(barclays_header_body['body'])

In [15]:
# Worked example on one section: rank the terms of `body` by TF-IDF weight.

# Normalise the text: lower-case and tokenise it, discard stop words and
# tokens that are not purely alphanumeric, then lemmatise what is left.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
words = word_tokenize(body.lower())

filtered_words = [
    lemmatizer.lemmatize(word)
    for word in words
    if not (word in stop_words or not word.isalnum())
]

# Vectorise the cleaned text as a one-document corpus.
document = ' '.join(filtered_words)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([document])

# Pair each vocabulary term with its weight and list the heaviest first.
tfidf_scores = zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0])
sorted_keywords = sorted(tfidf_scores, key=lambda pair: pair[1], reverse=True)

for item in sorted_keywords:
    print(item)

('banking', 0.4423258684646914)
('customer', 0.4423258684646914)
('agreement', 0.29488391230979427)
('say', 0.29488391230979427)
('section', 0.29488391230979427)
('service', 0.29488391230979427)
('apply', 0.14744195615489714)
('barclays', 0.14744195615489714)
('business', 0.14744195615489714)
('charity', 0.14744195615489714)
('club', 0.14744195615489714)
('cover', 0.14744195615489714)
('management', 0.14744195615489714)
('notwithstanding', 0.14744195615489714)
('professional', 0.14744195615489714)
('provide', 0.14744195615489714)
('trustee', 0.14744195615489714)
('wealth', 0.14744195615489714)


In [16]:
## function that measures keywords
def keyword_relevance(heading, body, top_n=5):
    """Check whether a heading mentions any of its body's top keywords.

    The body is lower-cased, tokenised, stripped of stop words and of tokens
    that are not purely alphanumeric, and lemmatised. The remaining terms are
    weighted with a ``TfidfVectorizer`` fitted on that single document, and
    the ``top_n`` highest-weighted terms are taken as the body's keywords.
    Ties keep the vectorizer's feature order because the sort is stable.

    Parameters
    ----------
    heading : str
        Heading text to search.
    body : str
        Body text the keywords are extracted from. May be empty.
    top_n : int, default 5
        Number of highest-weighted terms to use as keywords.

    Returns
    -------
    str
        ``'Y'`` if at least one keyword occurs in the heading, else ``'N'``.
        A body that yields no usable terms returns ``'N'``.

    Notes
    -----
    Matched keywords are printed as a side effect.

    The vectorizer is fitted on one document only, so the weights rank terms
    within this body alone; no other sections are taken into account.

    Matching is a case-insensitive *substring* test, so a keyword also
    matches inside a longer heading word (e.g. a plural form).
    NOTE(review): this can also produce false positives for short keywords;
    confirm whether whole-word matching is wanted.
    """
    # Tokenization and preprocessing for body
    words = word_tokenize(body.lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    filtered_words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word.isalnum() and word not in stop_words
    ]
    document = ' '.join(filtered_words)

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer()

    # fit_transform raises ValueError ("empty vocabulary") when the document
    # contains nothing the vectorizer's own analyzer would keep, e.g. an
    # empty body. Such a body has no keywords, so nothing can match.
    if not vectorizer.build_analyzer()(document):
        return 'N'

    tfidf_matrix = vectorizer.fit_transform([document])

    # Extract TF-IDF scores and keywords
    tfidf_scores = zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0])
    sorted_keywords = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)

    # Extract keywords only (top keywords based on TF-IDF score)
    keywords = [keyword for keyword, _score in sorted_keywords[:top_n]]

    # Check if any keyword is in the heading and print the matched keyword(s)
    heading_lower = heading.lower()
    matched_keywords = [keyword for keyword in keywords if keyword.lower() in heading_lower]

    if matched_keywords:
        print(f"Matched keyword(s): {', '.join(matched_keywords)}")
        return 'Y'
    else:
        return 'N'

In [17]:
keyword_relevance(heading,body)

Matched keyword(s): banking, customer, agreement


'Y'

In [18]:
## apply to entire df

## Language
Count the number of words in each heading. Headings should be between 3 and 14 words long (inclusive).

In [25]:
def header_count(heading):
    """Report a heading's word count and whether it is an acceptable length.

    Words are the whitespace-separated pieces of ``heading``. The returned
    string ends in ``Y`` when the count is between 3 and 14 inclusive, and in
    ``N`` otherwise.
    """
    num_words = len(heading.split())

    # Outside the accepted 3-14 range: too short or too long.
    if num_words < 3 or num_words > 14:
        return f"Word count: {num_words}, N"

    return f"Word count: {num_words}, Y"

In [26]:
header_count(heading)

'Word count: 13, Y'

In [21]:
## apply to entire df

An external grammar-checking API will be used to verify the grammar of each heading.

https://aclanthology.org/2020.lrec-1.228.pdf

https://docs.grammarbot.io/request/ (I will try Bing Spell Check later)

In [None]:
import json
import os

import requests

# GrammarBot (via RapidAPI) endpoint that is asked for a corrected heading.
url = "https://grammarbot-neural.p.rapidapi.com/v1/check"

# Read the key from the environment so it is never saved inside the notebook.
# The placeholder remains the fallback, so the cell still runs (and reports
# the API's error) when RAPIDAPI_KEY has not been set.
key = os.environ.get("RAPIDAPI_KEY", "<API KEY HERE>")

payload = {
    "text": heading,
    "lang": "en"
}

headers = {
    "content-type": "application/json",
    "X-RapidAPI-Key": key,
    "X-RapidAPI-Host": "grammarbot-neural.p.rapidapi.com"
}

# Without a timeout a stalled connection would block the kernel indefinitely.
response = requests.post(url, json=payload, headers=headers, timeout=30)

try:
    rjson = response.json()
except ValueError:
    # The body was not JSON (for example an HTML error page); keep what we
    # know so the error branch below can report it.
    rjson = {"status": response.status_code, "error": response.text}

# .get() avoids a KeyError when an error payload carries no "status" field.
if isinstance(rjson, dict) and rjson.get("status") == 200:
    correction = rjson['correction']
    print(correction)
else:
    print("ERROR: " + str(rjson))

Apply the PassivePy package to detect any instances of passive voice. *Amplifi also has some code*

https://myscp.onlinelibrary.wiley.com/doi/full/10.1002/jcpy.1377

In [28]:
!pip install -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt
!pip install PassivePy==0.2.23

Collecting en_core_web_lg (from -r https://raw.githubusercontent.com/mitramir55/PassivePy/main/PassivePyCode/PassivePySrc/requirements_lg.txt (line 5))
  Using cached en_core_web_lg-3.4.0-py3-none-any.whl


In [29]:
from PassivePySrc import PassivePy

# Shared analyzer instance, created on first use by _get_passive_analyzer().
_PASSIVEPY_ANALYZER = None


def _get_passive_analyzer():
    """Return the shared PassivePy analyzer, building it on the first call."""
    global _PASSIVEPY_ANALYZER
    if _PASSIVEPY_ANALYZER is None:
        _PASSIVEPY_ANALYZER = PassivePy.PassivePyAnalyzer(spacy_model = "en_core_web_lg")
    return _PASSIVEPY_ANALYZER


def passive_count(heading):
    """Report how many passive-voice constructions a heading contains.

    Both full and truncated passives are counted. The returned string ends
    in ``N`` when at least one passive is found and in ``Y`` when none is.

    The analyzer is built once and reused, so applying this function to many
    headings does not construct a new analyzer for every call.
    """
    result = _get_passive_analyzer().match_text(heading, full_passive=True, truncated_passive=True)

    count = result['passive_count']
    # NOTE(review): PassivePy's match_text appears to return a one-row table,
    # which makes this value a column rather than a number — confirm for the
    # pinned version. Take the single row when it is a column; a plain
    # number passes through untouched.
    if hasattr(count, 'iloc'):
        count = count.iloc[0]
    count = int(count)

    if count >= 1:
        return f"Instances of passive voice detected: {count}, N"
    return f"Instances of passive voice detected: {count}, Y"

ModuleNotFoundError: No module named 'PassivePySrc'

In [None]:
## apply to entire df

## Structure
Count and compare the number of keywords in main headings and subheadings, looking for overlapping keywords and for which of the two generates the greater number. Additionally, the average word count of the subheadings should be higher than the word count of the main heading.

In [None]:
# split the dataframe and append keywords and word count columns

In [None]:
# function that looks for overlapping keywords, keyword count and the average word count