In [1]:
import pandas as pd
import nltk
nltk.download('punkt_tab')
import re
import unicodedata
from nltk.corpus import stopwords
import spacy
import networkx as nx
from pyvis.network import Network
from networkx.algorithms.community import greedy_modularity_communities

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [3]:
# Load DataFrame
df = pd.read_csv("../data/pdf_metadata_and_summaries_trf.csv")  # Replace with your actual filename

In [4]:
df.columns

Index(['File Name', 'Metadata', 'Summary'], dtype='object')

In [5]:
ADDITIONAL_STOPWORDS = ['bank', 'fdic', 'institution']

# Function to clean a sentence and return a list of cleaned words
def clean(sentence):
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords_list = stopwords.words('english') + ADDITIONAL_STOPWORDS
    sentence = (unicodedata.normalize('NFKD', sentence)
                .encode('ascii', 'ignore')
                .decode('utf-8', 'ignore')
                .lower())

    # Remove numbers, punctuations, and any word with a single letter
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # Keep only alphabetic characters and spaces
    words = sentence.split()

    # Lemmatize and filter out stopwords and single-letter words
    word_list = [wnl.lemmatize(word) for word in words if len(word) > 1 and word not in stopwords_list]

    return word_list

In [6]:
# Function to extract triplets (subject, predicate, object) from sentences in a dataframe column
def extract_triplets(df, column):
    """
    Takes a dataframe and a column and returns a dataframe of triplets (subject, predicate, object) extracted from sentences.

            Parameters:
                    df (dataframe): A pandas dataframe
                    column (str): A column name in the dataframe

            Returns:
                    triplets_df (dataframe): A dataframe with columns ['Subject', 'Predicate', 'Object', 'Filename'] containing the extracted triplets
    """
    nlp = spacy.load('en_core_web_trf')
    triplets = []

    for _, row in df.iterrows():
        text = row[column]
        filename = row.get('File Name', None)
        cleaned_sentences = [" ".join(clean(sentence)) for sentence in nltk.sent_tokenize(str(text))]
        cleaned_text = " ".join(cleaned_sentences)
        doc = nlp(cleaned_text)
        for sentence in doc.sents:
            subject = None
            predicate = None
            obj = None

            for token in sentence:
                if token.dep_ in ('nsubj', 'nsubjpass'):
                    subject = token.text
                elif token.dep_ == 'ROOT':
                    predicate = token.text
                elif token.dep_ in ('dobj', 'pobj'):
                    obj = token.text

            if subject and predicate and obj:
                triplets.append((subject, predicate, obj, filename))

    triplets_df = pd.DataFrame(triplets, columns=['Subject', 'Predicate', 'Object', 'Filename'])
    return triplets_df

In [7]:
dfx = extract_triplets(df, "Summary")
dfx.shape
dfx.head()

  model.load_state_dict(torch.load(filelike, map_location=device))


(1736, 4)

Unnamed: 0,Subject,Predicate,Object,Filename
0,issue,provided,failure,09-003.pdf
1,deficiency,agreed,report,09-003.pdf
2,asset,conducted,quality,09-003.pdf
3,institution,included,operationthe,09-003.pdf
4,examination,exceeded,regulation,09-003.pdf


In [8]:
dfx.to_csv("../data/triplets_trf.csv", index=False)

In [9]:
# Function to visualize the knowledge graph using NetworkX and PyVis
def visualize_knowledge_graph(triplets_df):
    """
    Visualizes the knowledge graph from the triplets dataframe.

            Parameters:
                    triplets_df (dataframe): A dataframe with columns ['Subject', 'Predicate', 'Object']

            Returns:
                    None: Displays the interactive graph
    """
    graph = nx.DiGraph()

    # Add edges from triplets
    for _, row in triplets_df.iterrows():
        graph.add_edge(row['Subject'], row['Object'], label=row['Predicate'])

    # Visualize using PyVis
    net = Network(notebook=True, directed=True)
    net.from_nx(graph)

    # Add edge labels
    for edge in net.edges:
        edge['title'] = edge['label']

    net.show('../data/knowledge_graph_trf.html')

In [10]:
visualize_knowledge_graph(dfx)

../data/knowledge_graph_trf.html


In [11]:
dfx.shape

(1736, 4)