In [1]:
import asyncio
import os
import re
import json
from copy import copy, deepcopy
from pathlib import Path
from pprint import pprint, PrettyPrinter
from time import time, sleep
from typing import List, Dict
from uuid import uuid4
from collections import defaultdict

import evaluate
import openai
import requests
import tiktoken
from bs4 import BeautifulSoup, Comment
from doctran import Doctran, ExtractProperty
from dotenv import load_dotenv, find_dotenv
from evaluate import load
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceEmbeddings,
)
from langchain.llms import OpenAI
from langchain.text_splitter import (
    MarkdownTextSplitter,
    MarkdownHeaderTextSplitter,
    LineType,
    RecursiveCharacterTextSplitter,
)
# Load model directly
from transformers import AutoProcessor, AutoModelForTokenClassification
from loguru import logger
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from pdfminer.high_level import extract_text

from main import (
    divide_sections_if_too_large,
    extract_plan_and_content_wikipedia,
    compare_documents_plans,
    compare_documents_sections,
    extract_plan_and_content_patent,
    extract_plan_and_content_arxiv,
    load_arxiv_paper,
)

import pytesseract

# If you don't have tesseract executable in your PATH, include the following:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract"

_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY')

llm_default = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=True)
llm_16k = ChatOpenAI(model_name="gpt-3.5-turbo-16k", streaming=True)

def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Returns the number of tokens in a text string."""
    try:
        encoding = tiktoken.get_encoding(encoding_name)
    except ValueError:
        encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def convert_to_markdown(article_dict):
    md_text = ""

    for heading, content in article_dict.items():
        # heading is of form: 'h3 Example'
        # Define the markdown equivalent for the heading level
        heading_level = "#" * int(heading[1])
        heading = heading[3:]
        # Append the heading and the content to the markdown text
        md_text += f"{heading_level} {heading}\n\n{content}\n\n"

    return md_text


def truncated_pprint(obj, N=5):
    """Pretty print an object, truncating lists and strings to N items/characters
    for easier viewing of plan_json objects"""
    def truncate(item, N):
        if isinstance(item, list) and N is not None:
            return item[:N] + (['...'] if len(item) > N else [])
        if isinstance(item, str) and N is not None:
            N = 125
            return item[:N] + ('...' if len(item) > N else '')
        return item

    def trunc_recursive(item, N):
        if isinstance(item, list):
            return [trunc_recursive(i, N) for i in truncate(item, N)]
        elif isinstance(item, dict):
            return {k: trunc_recursive(v, N) for k, v in item.items()}
        else:
            return truncate(item, N)

    truncated_obj = trunc_recursive(obj, N)
    pprint(truncated_obj, sort_dicts=False)

# Test
data = {
    'long_list': list(range(100)),
    'long_string': 'a' * 100,
    'nested': {
        'nested_list': list(range(50)),
        'nested_string': 'b' * 50
    }
}

# truncated_pprint(data, 5)


processor = AutoProcessor.from_pretrained("nielsr/layoutlmv3-finetuned-funsd")
model = AutoModelForTokenClassification.from_pretrained("nielsr/layoutlmv3-finetuned-funsd")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'LayoutLMv3TokenizerFast'.


In [8]:
list(Path('data/patents').glob('*'))

[WindowsPath('data/patents/A PROCESS FOR REGENERATING SPENT FLUIDIZED CATALYTIC CRACKING CATALYST.txt'),
 WindowsPath('data/patents/APPARATUS FOR THE MEASUREMENT OF ATRIAL PRESSURE.txt'),
 WindowsPath('data/patents/ATMSPOS BASED ELECTRONIC MAIL SYSTEM.txt'),
 WindowsPath('data/patents/BANDAGE.txt'),
 WindowsPath('data/patents/COMPOSITE BLOCK AND PROCESS FOR MANUFACTURING.txt'),
 WindowsPath('data/patents/CONDENSED BENZOXA RING COMPOUND, PRODUCTION THEREOF, AND PHARMACEUTICAL COMPOSITION CONTAINING THE SAME.txt'),
 WindowsPath('data/patents/CONDOM FOR ORAL-GENITAL USE.txt'),
 WindowsPath('data/patents/DETERMINATION OF TRICYCLIC ANTIDEPRESSANT DRUGS IN THE PRESENCE OF INTERFERING SUBSTANCES.txt'),
 WindowsPath('data/patents/DEVICE FOR FORMATION OF A FILM ON THE WALLS OF HOLES IN PRINTED CIRCUIT BOARDS.txt'),
 WindowsPath('data/patents/Device for the examination of flow motion inside cylindrical components.txt'),
 WindowsPath('data/patents/DISPENSING APPARATUS UTILIZING A PRESSURE GENERAT

In [2]:
path = 'data/arxiv/A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf'
plan_json = await extract_plan_and_content_arxiv(path)

[32m2023-08-10 16:11:51.484[0m | [1mINFO    [0m | [36mmain[0m:[36mextract_plan_and_content_arxiv[0m:[36m731[0m - [1m
	Extracting plan and content for arxiv paper: data/arxiv/A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf[0m
[32m2023-08-10 16:11:53.722[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m288[0m - [1mDividing sections if too large in plan and section content.[0m
[32m2023-08-10 16:11:55.207[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m343[0m - [1mAdded 'Smart Grids and Network Security' split original heading 'Content'[0m
[32m2023-08-10 16:11:56.354[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m343[0m - [1mAdded 'Software-Defined Networking (SDN) and Software-Defined Smart Grid (SD-SG)' split original heading 'Content'[0m
[32m2023-08-10 16:11:56.956[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_s

Truncated plan_json:
{'id': '84fd297a-f948-4966-b7ce-01a82aa78141',
 'title': 'A Survey of Software-Defined Smart Grid Networks Security Threats '
          'and  Defense Techniques',
 'abstract': '—smart grids are replacing conventional power grids\n'
             'due to rising electricity use, failing infrastructure, and '
             'reliability\n'
             'pr...',
 'title_embedding_1': [0.0049530500546097755,
                       -0.027262356132268906,
                       0.016478152945637703,
                       -0.010516656562685966,
                       0.015215879306197166,
                       '...'],
 'title_embedding_2': [-0.04646815359592438,
                       -0.06297782808542252,
                       -0.03468792140483856,
                       0.01633869670331478,
                       0.02604159340262413,
                       '...'],
 'abstract_embedding_1': [0.0012893922394141555,
                          -0.001830162014812231,
          

[32m2023-08-10 16:32:03.305[0m | [1mINFO    [0m | [36mmain[0m:[36mextract_plan_and_content_arxiv[0m:[36m748[0m - [1m
	Successfully extracted plan and content for data/arxiv/A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf
	Written to file: C:\Users\User\Documents\1 Projects\document_extraction\A Survey of Software-Defined Smart Grid Networks Security Threats and  Defense Techniques.json
	Time taken: 20mins 11.8s[0m


In [4]:
truncated_pprint(plan_json)

{'id': '84fd297a-f948-4966-b7ce-01a82aa78141',
 'title': 'A Survey of Software-Defined Smart Grid Networks Security Threats '
          'and  Defense Techniques',
 'abstract': '—smart grids are replacing conventional power grids\n'
             'due to rising electricity use, failing infrastructure, and '
             'reliability\n'
             'pr...',
 'title_embedding_1': [0.0049530500546097755,
                       -0.027262356132268906,
                       0.016478152945637703,
                       -0.010516656562685966,
                       0.015215879306197166,
                       '...'],
 'title_embedding_2': [-0.04646815359592438,
                       -0.06297782808542252,
                       -0.03468792140483856,
                       0.01633869670331478,
                       0.02604159340262413,
                       '...'],
 'abstract_embedding_1': [0.0012893922394141555,
                          -0.001830162014812231,
                          0.011

In [None]:
path = 'data/arxiv/A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf'
article_dict = load_arxiv_paper(path)
article_dict = await divide_sections_if_too_large(article_dict, doc_type="arxiv")

In [4]:
article_dict.keys()

dict_keys(['Title', 'Abstract', 'Smart Grids and Network Security', 'Software-Defined Networking (SDN)', 'Related Works', 'Related Literature and Surveys on SD-SG Network Security', 'SD-SG Security', 'Akkaya et al.', 'DDoS Attacks and Defense Techniques', 'Evolution of Software-Defined Smart Grid Infrastructure', 'SDN Layers and Intercommunication', 'Software Defined Networking', 'SDN Architecture', 'DDoS/DoS/Physical-DoS (PDoS) Spoofing, Sniffing, and Message Relay MITM,Eavesdropping, and Homograph Meter Manipulation and Theft FDI Impersonation, Session Key Exposure, and TSA TCP-SYN Flooding Jamming RAM Exhaustation/CPU Overload Brute Force Message Replay, Covert Sybil Multi-Attack DDoS/DoS Controller Intrusion Multi-Attack', 'SDN Terms', 'SDN Redesign', 'SDN Controllers', 'Distributed Control Plane Topologies', 'Southbound API', 'SD-SG Cyberattack Threats', 'Network Security Challenges of Software-Defined Smart Grids', 'Security Solutions for SDN Networks', 'Blockchain', 'SDN-based S

In [2]:
arxiv_papers = [
    "https://arxiv.org/pdf/2307.04438.pdf",
    "https://arxiv.org/pdf/2306.14697.pdf",
    "https://arxiv.org/pdf/2302.09051.pdf",
    "https://arxiv.org/pdf/2305.10091.pdf",
    "https://arxiv.org/pdf/2305.17474.pdf",
    "https://arxiv.org/pdf/2306.16960.pdf",
    "https://arxiv.org/pdf/2305.20069.pdf",
    "https://arxiv.org/pdf/2306.08451.pdf",
    "https://arxiv.org/pdf/2306.17003.pdf",
    "https://arxiv.org/pdf/2307.07573.pdf",
]

In [8]:
import requests
import re
import os

def sanitize_filename(filename):
    """Convert string to a valid filename."""
    s = str(filename).strip().replace(' ', '_')
    # Remove any character that is not a word character
    # (alphanumeric + underscore), not a hyphen, or not a period.
    return re.sub(r'(?u)[^-\w.]', '', s)

def get_arxiv_metadata(arxiv_id):
    """Fetch metadata for the given arXiv ID."""
    url = f'https://export.arxiv.org/api/query?id_list={arxiv_id}'
    response = requests.get(url)
    response.raise_for_status()
    data = response.text

    # Use regex to extract the title. There are better ways (like parsing XML),
    # but this is simple and should work for our purpose.
    match = re.search(r'<title>([^<]+)</title>', data)
    title = match.group(1) if match else None
    return {'title': title}

def download_arxiv_pdf(arxiv_url):
    """Given a arxiv_url, download the PDF to the data/arxiv directory."""
    arxiv_id = arxiv_url.split('/')[-1].replace('.pdf', '')
    metadata = get_arxiv_metadata(arxiv_id)

    if metadata.get('title'):
        filename = sanitize_filename(metadata['title']) + '.pdf'
    else:
        filename = f"{arxiv_id}.pdf"

    arxiv_dir = Path('data/arxiv')
    os.makedirs(arxiv_dir, exist_ok=True)

    response = requests.get(arxiv_url)
    output_file = arxiv_dir / filename
    with open(output_file, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded to {output_file}")

# Example usage:
arxiv_url = 'https://arxiv.org/pdf/2306.14697.pdf'
# download_arxiv_pdf(arxiv_url)


In [9]:
for paper in arxiv_papers:
    download_arxiv_pdf(paper)

Downloaded to data\arxiv\Reconfigurable_Intelligent_Surface_Assisted_Railway_Communications_A__survey.pdf
Downloaded to data\arxiv\A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf
Downloaded to data\arxiv\Complex_QA_and_language_models_hybrid_architectures_Survey.pdf
Downloaded to data\arxiv\Multi-Agent_Reinforcement_Learning_Methods_Applications_Visionary__Prospects_and_Challenges.pdf
Downloaded to data\arxiv\Macroeconomic_Effects_of_Inflation_Targeting_A_Survey_of_the_Empirical__Literature.pdf
Downloaded to data\arxiv\Sketching_a_Model_on_Fisheries_Enforcement_and_Compliance_--_A_Survey.pdf
Downloaded to data\arxiv\A_survey_on_the_complexity_of_learning_quantum_states.pdf
Downloaded to data\arxiv\A_Survey_on_Blood_Pressure_Measurement_Technologies_Addressing__Potential_Sources_of_Bias.pdf
Downloaded to data\arxiv\A_survey_on_algebraic_dilatations.pdf
Downloaded to data\arxiv\Literature_Survey_on_the_Container_Stowage_Planning_Problem.pdf


In [2]:
doc = Path('data/arxiv/A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf')
text = extract_text(doc)

doc_2 = Path('data/arxiv/A_Survey_on_Blood_Pressure_Measurement_Technologies_Addressing__Potential_Sources_of_Bias.pdf')
text_2 = extract_text(doc_2)

doc_3 = Path('data/arxiv/A_survey_on_algebraic_dilatations.pdf')
text_3 = extract_text(doc_3)

doc_4 = Path('data/arxiv/A_survey_on_the_complexity_of_learning_quantum_states.pdf')
text_4 = extract_text(doc_4)

In [3]:
title = doc.stem
title = title.replace('_', ' ')
title

'A Survey of Software-Defined Smart Grid Networks Security Threats and  Defense Techniques'

In [4]:
# The pattern searches for "abstract" followed by any content.
# Then, it looks for one of the potential following sections:
# "I. Introduction", "1. Introduction", or "Contents".
# pattern = r'abstract(.*?)(i\. introduction|1\. introduction|contents)'

# The pattern searches for "abstract" followed by any content.
# Then, it looks for one of the potential following sections:
# "I. Introduction", "1. Introduction", or "Contents".
# We use a positive lookahead (?=...) to assert that the introduction or contents
# pattern exists, but we don't include it in the main match.
pattern = r'abstract(.*?)(?=(i\. introduction|1\. introduction|contents))'


# The re.DOTALL flag allows the . in the pattern to match newline characters,
match = re.search(pattern, text.lower(), re.DOTALL)

if match:
    abstract_start = match.start()
    abstract_end = match.end()
    abstract = match.group(1).strip()  # Extracted abstract content
    print(abstract)

—smart grids are replacing conventional power grids
due to rising electricity use, failing infrastructure, and reliability
problems. two-way communication, demand-side administra-
tion, and real-time pricing make smart grids (sgs) dependent
on its communication system. manual network administration
slows down sg communication. sg networks additionally utilize
hardware and software from several vendors, allowing devices
to communicate. software-defined sgs (sd-sg) use software-
defined networking (sdn) to monitor and regulate sg global
communication networks to address these concerns. sdn sepa-
rates the data plane (routers and switches) from the control plane
(routing logic) and centralizes control into the sdn controller.
this helps network operators manage visibility, control, and
security. these benefits have made sdn popular in sg architec-
tural and security studies. but because sd-sgs are vulnerable
to cyberattacks, there are concerns about the security of these
sd-sg networks. c

In [5]:
pattern = r'references\n'
# regions = []
matches = [match for match in re.finditer(pattern, text.lower())]

references = ''
if matches:
    final_match = matches[-1]
    reference_start = final_match.start()
    reference_end = final_match.end()
    references = text[reference_start:]
print(references[:250])

REFERENCES

[1] M. H. Rehmani, A. Davy, B. Jennings, and C. Assi, “Software defined
networks-based smart grid communication: A comprehensive survey,”
IEEE Communications Surveys & Tutorials, vol. 21, no. 3, pp. 2637–
2670, 2019.

[2] O. M. Butt, M. Z


In [6]:
content = text[abstract_end:reference_start]
content

'I. INTRODUCTION\n\nSmart grids (SGs) are improved versions of traditional\npower grids and rely on two-way communication technology\nto improve efficiency, reliability, and sustainability. Unlike\nthe outdated and inefficient traditional power grids, which\nsuffers from frequent power outages, SGs uses a variety of\ntechnologies, such as sensors, analytic, and control systems,\nto more effectively monitor and manage energy consumption,\ngeneration, and distribution [1]. SG can monitor energy de-\nmand and supply in real-time, making them more adaptable\nto changing energy needs.\n\nSG provides greater transparency and accessibility to energy\nproviders, allowing them to effectively monitor and manage\ntheir energy consumption in real-time [2]. They also allow\nutility companies to communicate with customers more effi-\nciently, providing real-time feedback on energy consumption\nand assisting customers in making informed decisions about\n\nenergy usage [3]. These features are due to a

In [12]:
article_dict = {
    'Title': title,
    'Abstract': abstract,
    'Content': content,
    'References': references,
}
truncated_pprint(article_dict)

{'Title': 'A Survey of Software-Defined Smart Grid Networks Security Threats '
          'and  Defense Techniques',
 'Abstract': '—smart grids are replacing conventional power grids\n'
             'due to rising electricity use, failing infrastructure, and '
             'reliability\n'
             'pr...',
 'Content': 'I. INTRODUCTION\n'
            '\n'
            'Smart grids (SGs) are improved versions of traditional\n'
            'power grids and rely on two-way communication technol...',
 'References': 'REFERENCES\n'
               '\n'
               '[1] M. H. Rehmani, A. Davy, B. Jennings, and C. Assi, '
               '“Software defined\n'
               'networks-based smart grid communication: ...'}


In [13]:
split_dict = await divide_sections_if_too_large(article_dict, doc_type='arxiv')

[32m2023-08-10 15:44:57.496[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m240[0m - [1mDividing sections if too large in plan and section content.[0m
[32m2023-08-10 15:44:58.620[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m295[0m - [1mAdded 'Smart Grids and Network Security' split original heading 'Content'[0m
[32m2023-08-10 15:44:59.425[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m295[0m - [1mAdded 'Software-Defined Networking (SDN)' split original heading 'Content'[0m
[32m2023-08-10 15:45:00.054[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m295[0m - [1mAdded 'Related Works' split original heading 'Content'[0m
[32m2023-08-10 15:45:01.187[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m295[0m - [1mAdded 'Related Literature and Surveys on SD-SG Network Security' split original heading 'Conten

In [14]:
pprint(list(split_dict.keys()))

['Title',
 'Abstract',
 'Smart Grids and Network Security',
 'Software-Defined Networking (SDN)',
 'Related Works',
 'Related Literature and Surveys on SD-SG Network Security',
 'Security of SD-SG Systems',
 'Akkaya et al.',
 'DDoS Attacks and Defense Techniques',
 'Evolution of Software-Defined Smart Grid Infrastructure',
 'SDN Layers and Intercommunication',
 'Software Defined Networking',
 'SDN Architecture',
 'DDoS/DoS/Physical-DoS (PDoS) Spoofing, Sniffing, and Message Relay '
 'MITM,Eavesdropping, and Homograph Meter Manipulation and Theft FDI '
 'Impersonation, Session Key Exposure, and TSA TCP-SYN Flooding Jamming RAM '
 'Exhaustation/CPU Overload Brute Force Message Replay, Covert Sybil '
 'Multi-Attack DDoS/DoS Controller Intrusion Multi-Attack',
 'SDN Terms',
 'SDN Redesign',
 'SDN Controllers',
 'Distributed Control Plane Topologies',
 'Southbound API',
 'SD-SG Cyberattack Threats',
 'Network Security Challenges of Software-Defined Smart Grids',
 'Security Solutions for SDN

In [32]:
text[reference_start: reference_start+250]

'REFERENCES\n\n[1] M. H. Rehmani, A. Davy, B. Jennings, and C. Assi, “Software defined\nnetworks-based smart grid communication: A comprehensive survey,”\nIEEE Communications Surveys & Tutorials, vol. 21, no. 3, pp. 2637–\n2670, 2019.\n\n[2] O. M. Butt, M. Z'

In [23]:
content = text[abstract_end:reference_start]
content[:250]

'\n\nSmart grids (SGs) are improved versions of traditional\npower grids and rely on two-way communication technology\nto improve efficiency, reliability, and sustainability. Unlike\nthe outdated and inefficient traditional power grids, which\nsuffers from '

In [31]:
texts = [text, text_2, text_3, text_4]
for t in texts:
    print(repr(t[:500]))
    print()

'A Survey of Software-Defined Smart Grid\nNetworks: Security Threats and Defense Techniques\n\nDennis Agnew Sharon Boamah Janise McNair\nDepartment of Electrical and Computer Engineering, University of Florida, Gainesville, FL\n{dennisagnew, sharonboamah}@ufl.edu, mcnair@ece.ufl.edu\n\n3\n2\n0\n2\n\nn\nu\nJ\n\n6\n2\n\n]\n\nY\nS\n.\ns\ns\ne\ne\n[\n\n1\nv\n7\n9\n6\n4\n1\n.\n6\n0\n3\n2\n:\nv\ni\nX\nr\na\n\nAbstract—Smart grids are replacing conventional power grids\ndue to rising electricity use, failing infrastructure, and reliability\nproblems. Tw'

'SOURCES OF BIAS IN BLOOD PRESSURE MEASUREMENT TECHNOLOGIES\n\n1\n\nA Survey on Blood Pressure Measurement\nTechnologies: Addressing Potential Sources of Bias\n\nSeyedeh Somayyeh Mousavi and Reza Sameni*\n\n3\n2\n0\n2\n\nn\nu\nJ\n\n4\n1\n\n]\nh\np\n-\nd\ne\nm\n\n.\ns\nc\ni\ns\ny\nh\np\n[\n\n1\nv\n1\n5\n4\n8\n0\n.\n6\n0\n3\n2\n:\nv\ni\nX\nr\na\n\nAbstract—Blood pressure is a vital sign that offers important\ninsights into overall health, part

In [32]:
# Pattern details:
# 1. `^` asserts start of a line.
# 2. `(.*?)` captures everything lazily.
# 3. The lookahead `(?=...)` asserts that what directly follows is:
#   a. an email-like pattern, OR
#   b. words like "University", "Department", "Research Center", OR
#   c. a date pattern (e.g., "July 12, 2023").
pattern = r'^.*?(?=\S+@\S+|\bUniversity\b|\bDepartment\b|\bResearch Center\b|\b[\w\s]{3,20}\b \d{1,2}, \d{4})'

for text in texts:
    match = re.search(pattern, text, re.DOTALL | re.MULTILINE)
    if match:
        title = match.group(0).strip()
        print(title)
        print('-' * 50)

A Survey of Software-Defined Smart Grid
Networks: Security Threats and Defense Techniques

Dennis Agnew Sharon Boamah Janise McNair
--------------------------------------------------
SOURCES OF BIAS IN BLOOD PRESSURE MEASUREMENT TECHNOLOGIES

1

A Survey on Blood Pressure Measurement
Technologies: Addressing Potential Sources of Bias

Seyedeh Somayyeh Mousavi and Reza Sameni*

3
2
0
2

n
u
J

4
1

]
h
p
-
d
e
m

.
s
c
i
s
y
h
p
[

1
v
1
5
4
8
0
.
6
0
3
2
:
v
i
X
r
a

Abstract—Blood pressure is a vital sign that offers important
insights into overall health, particularly cardiovascular well-
being. It plays a critical role in medical settings and homes
for disease prevention, diagnosis, treatment, and management.
Physicians heavily rely on blood pressure values for making
crucial decisions. Most commercial devices utilize cuffs for
blood pressure measurement, and automatic devices have gained
popularity due to the high prevalence of hypertension. Self-
measurement and home monitoring of

In [6]:
print(text)

A Survey of Software-Defined Smart Grid
Networks: Security Threats and Defense Techniques

Dennis Agnew Sharon Boamah Janise McNair
Department of Electrical and Computer Engineering, University of Florida, Gainesville, FL
{dennisagnew, sharonboamah}@ufl.edu, mcnair@ece.ufl.edu

3
2
0
2

n
u
J

6
2

]

Y
S
.
s
s
e
e
[

1
v
7
9
6
4
1
.
6
0
3
2
:
v
i
X
r
a

Abstract—Smart grids are replacing conventional power grids
due to rising electricity use, failing infrastructure, and reliability
problems. Two-way communication, demand-side administra-
tion, and real-time pricing make smart grids (SGs) dependent
on its communication system. Manual network administration
slows down SG communication. SG networks additionally utilize
hardware and software from several vendors, allowing devices
to communicate. Software-defined SGs (SD-SG) use software-
defined networking (SDN) to monitor and regulate SG global
communication networks to address these concerns. SDN sepa-
rates the data plane (routers and 

In [7]:
'references\n' in text.lower()

True

In [8]:
re.findall(r'references\n', text.lower())

['references\n', 'references\n', 'references\n']

In [24]:
pattern = r'abstract'
regions = []
for match in re.finditer(pattern, text.lower()):
    # regions.append(text[match.start():])
    start_index = max(0, match.start() - 250)
    end_index = min(len(text), match.end() + 250)
    regions.append(text[start_index:end_index])

# Print the regions
for i, region in enumerate(regions, 1):
    print(f"Match {i}:\n{region}\n{'-'*50}")

Match 1:
ron Boamah Janise McNair
Department of Electrical and Computer Engineering, University of Florida, Gainesville, FL
{dennisagnew, sharonboamah}@ufl.edu, mcnair@ece.ufl.edu

3
2
0
2

n
u
J

6
2

]

Y
S
.
s
s
e
e
[

1
v
7
9
6
4
1
.
6
0
3
2
:
v
i
X
r
a

Abstract—Smart grids are replacing conventional power grids
due to rising electricity use, failing infrastructure, and reliability
problems. Two-way communication, demand-side administra-
tion, and real-time pricing make smart grids (SGs) dependent
on its co
--------------------------------------------------
Match 2:
des. The
controllers gather information from the equipment on the
health of the network. It allows for ease of network manage-
ment by providing a central point of control for the network.
The controller allows the network management with high-level
abstraction and APIs to developers. The controller can provide
information such as network topology, flow statistics, and
device discovery. While using an SDN controller, a

In [23]:
re.findall(r'abstract', text.lower())

['abstract', 'abstract']

In [27]:
text[:1000]

'A Survey of Software-Defined Smart Grid\nNetworks: Security Threats and Defense Techniques\n\nDennis Agnew Sharon Boamah Janise McNair\nDepartment of Electrical and Computer Engineering, University of Florida, Gainesville, FL\n{dennisagnew, sharonboamah}@ufl.edu, mcnair@ece.ufl.edu\n\n3\n2\n0\n2\n\nn\nu\nJ\n\n6\n2\n\n]\n\nY\nS\n.\ns\ns\ne\ne\n[\n\n1\nv\n7\n9\n6\n4\n1\n.\n6\n0\n3\n2\n:\nv\ni\nX\nr\na\n\nAbstract—Smart grids are replacing conventional power grids\ndue to rising electricity use, failing infrastructure, and reliability\nproblems. Two-way communication, demand-side administra-\ntion, and real-time pricing make smart grids (SGs) dependent\non its communication system. Manual network administration\nslows down SG communication. SG networks additionally utilize\nhardware and software from several vendors, allowing devices\nto communicate. Software-defined SGs (SD-SG) use software-\ndefined networking (SDN) to monitor and regulate SG global\ncommunication networks to address t

In [4]:
article_dict = {'article': text}
split_dict = await divide_sections_if_too_large(article_dict, doc_type='arxiv')

[32m2023-08-10 13:37:44.005[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m240[0m - [1mDividing sections if too large in plan and section content.[0m
[32m2023-08-10 13:37:46.328[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m295[0m - [1mAdded 'A Survey of Software-Defined Smart Grid Networks: Security Threats and Defense Techniques' split original heading 'article'[0m
[32m2023-08-10 13:37:47.159[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m295[0m - [1mAdded 'Smart Grids' split original heading 'article'[0m
[32m2023-08-10 13:37:48.292[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m295[0m - [1mAdded 'Software-Defined Networking (SDN) and Software-Defined Smart Grid (SD-SG)' split original heading 'article'[0m
[32m2023-08-10 13:37:48.916[0m | [1mINFO    [0m | [36mmain[0m:[36mdivide_sections_if_too_large[0m:[36m295[0m - [1m

In [10]:
pprint(list(split_dict.keys()), width=125)

['A Survey of Software-Defined Smart Grid Networks: Security Threats and Defense Techniques',
 'Smart Grids',
 'Software-Defined Networking (SDN) and Software-Defined Smart Grid (SD-SG)',
 'Related Works',
 'Related Literature and Surveys on SD-SG Network Security',
 'Security of SD-SG Systems',
 'Akkaya et al.',
 'DDoS Attacks and Defense Techniques',
 'Evolution of Software-Defined Smart Grid Infrastructure',
 'SDN Layers and Intercommunication',
 'Software Defined Networking',
 'SDN Architecture',
 'DDoS/DoS/Physical-DoS (PDoS) Spoofing, Sniffing, and Message Relay MITM,Eavesdropping, and Homograph Meter Manipulation '
 'and Theft FDI Impersonation, Session Key Exposure, and TSA TCP-SYN Flooding Jamming RAM Exhaustation/CPU Overload Brute '
 'Force Message Replay, Covert Sybil Multi-Attack DDoS/DoS Controller Intrusion Multi-Attack',
 'SDN Terms',
 'SDN Redesign',
 'SDN Controllers',
 'Distributed Control Plane Topologies',
 'Southbound API',
 'SD-SG Cyberattack Threats',
 'Network 

In [17]:
num_tokens_from_string(text.lower().split('references\n')[-1])

12984

In [13]:
[num_tokens_from_string(x) for x in split_dict.values()]

[504,
 491,
 481,
 364,
 354,
 299,
 488,
 482,
 430,
 468,
 402,
 434,
 502,
 391,
 484,
 491,
 494,
 442,
 456,
 455,
 278,
 412,
 503,
 444,
 364,
 474,
 450,
 459,
 491,
 417,
 344,
 474,
 445,
 396,
 455,
 499,
 454,
 488,
 108,
 448,
 225,
 426,
 496,
 150,
 419,
 493,
 486,
 503,
 495,
 467,
 457,
 483,
 495,
 470,
 469,
 457,
 416,
 456,
 460,
 454,
 475,
 455,
 499,
 445,
 430,
 411,
 467,
 414,
 454,
 485,
 481,
 470,
 486,
 503,
 159]

In [12]:
print(split_dict['A Survey of Software-Defined Smart Grid Networks: Security Threats and Defense Techniques'])

A Survey of Software-Defined Smart Grid
Networks: Security Threats and Defense Techniques

Dennis Agnew Sharon Boamah Janise McNair
Department of Electrical and Computer Engineering, University of Florida, Gainesville, FL
{dennisagnew, sharonboamah}@ufl.edu, mcnair@ece.ufl.edu

3
2
0
2

n
u
J

6
2

]

Y
S
.
s
s
e
e
[

1
v
7
9
6
4
1
.
6
0
3
2
:
v
i
X
r
a

Abstract—Smart grids are replacing conventional power grids
due to rising electricity use, failing infrastructure, and reliability
problems. Two-way communication, demand-side administra-
tion, and real-time pricing make smart grids (SGs) dependent
on its communication system. Manual network administration
slows down SG communication. SG networks additionally utilize
hardware and software from several vendors, allowing devices
to communicate. Software-defined SGs (SD-SG) use software-
defined networking (SDN) to monitor and regulate SG global
communication networks to address these concerns. SDN sepa-
rates the data plane (routers and 

In [31]:
for x in sorted(split_dict.keys(), key=len, reverse=True):
    print(f"Length: {len(x)}, num tokens: {num_tokens_from_string(x)}")
    print(x)
    print()

Length: 330, num tokens: 92
DDoS/DoS/Physical-DoS (PDoS) Spoofing, Sniffing, and Message Relay MITM,Eavesdropping, and Homograph Meter Manipulation and Theft FDI Impersonation, Session Key Exposure, and TSA TCP-SYN Flooding Jamming RAM Exhaustation/CPU Overload Brute Force Message Replay, Covert Sybil Multi-Attack DDoS/DoS Controller Intrusion Multi-Attack

Length: 129, num tokens: 25
MystifY: A Proactive Moving-Target Defense Approach for Software Defined Control Plane in Software Defined Cyber-Physical Systems

Length: 108, num tokens: 18
Network Optimization for Improved Performance and Speed for SDN and Security Analysis of SDN Vulnerabilities

Length: 106, num tokens: 21
Scalability, consistency, reliability and security in sdn controllers: a survey of diverse sdn controllers

Length: 106, num tokens: 16
An intrusion detection system against black hole attacks on the communication network of self-driving cars

Length: 99, num tokens: 15
Blockchain and homomorphic encryption-based 

In [7]:
num_tokens_from_string(text)

33580

In [9]:
print(text)

A Survey of Software-Defined Smart Grid
Networks: Security Threats and Defense Techniques

Dennis Agnew Sharon Boamah Janise McNair
Department of Electrical and Computer Engineering, University of Florida, Gainesville, FL
{dennisagnew, sharonboamah}@ufl.edu, mcnair@ece.ufl.edu

3
2
0
2

n
u
J

6
2

]

Y
S
.
s
s
e
e
[

1
v
7
9
6
4
1
.
6
0
3
2
:
v
i
X
r
a

Abstract—Smart grids are replacing conventional power grids
due to rising electricity use, failing infrastructure, and reliability
problems. Two-way communication, demand-side administra-
tion, and real-time pricing make smart grids (SGs) dependent
on its communication system. Manual network administration
slows down SG communication. SG networks additionally utilize
hardware and software from several vendors, allowing devices
to communicate. Software-defined SGs (SD-SG) use software-
defined networking (SDN) to monitor and regulate SG global
communication networks to address these concerns. SDN sepa-
rates the data plane (routers and 

In [11]:
from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
output_string = StringIO()
with open(doc, 'rb') as fin:
    extract_text_to_fp(fin, output_string, laparams=LAParams(),
                       output_type='html', codec=None)

In [14]:
soup = BeautifulSoup(output_string.getvalue(), "html.parser")

In [18]:
soup

<html><head>
<meta content="text/html" http-equiv="Content-Type"/>
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:48px; top:102px; width:514px; height:51px;"><span style="font-family: NimbusRomNo9L-Regu; font-size:23px">A Survey of Software-Defined Smart Grid
<br/>Networks: Security Threats and Defense Techniques
<br/></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:115px; top:171px; width:377px; height:35px;"><span style="font-family: NimbusRomNo9L-Regu; font-size:10px">Dennis Agnew Sharon Boamah Janise McNair
<br/></span><span style="font-family: NimbusRomNo9L-ReguItal; font-size:9px">Department of Electrical and Computer Engineering</span><span style="font-family: NimbusRomNo9L-Regu; font-size:9px">, </span>