In [None]:
import sys
!{sys.executable} -m pip install -r '../requirements.txt'

# Set Up Notebook

Always run this before any other section

In [2]:
import PyPDF2
import pandas as pd
import spacy
import nltk
import itertools
from collections import Counter
from string import punctuation
from tqdm.notebook import tqdm as tqdm_notebook
from heapq import nlargest

In [3]:
nlp = spacy.load("en_core_web_sm")
stopwords = spacy.lang.en.stop_words.STOP_WORDS

# Extract Text from PDFs into CSV

In [None]:
def get_page_text(pdf_page):
    text = pdf_page.extract_text() 
    new_string = ' '.join(text.split('\n'))   
    return new_string

def pdf_to_text(pdf_path): 
    
    article = ""
    
    with open(pdf_path, "rb") as pdfFileObj:
        pdfReader = PyPDF2.PdfReader(pdfFileObj)
        pageCount = len(pdfReader.pages)  
        # Extract text from each page
        for p in range(pageCount):
            page = pdfReader.pages[p]
            article += ' ' + get_page_text(page)
            
    return article

In [None]:
articles = [""] * 222
for i in tqdm_notebook(range(222)):
    pdf_path = f'../gdit_articles/{i}.pdf' # Change to path where GDIT Articles are stored
    article = pdf_to_text(pdf_path)
    articles[i] = article

In [None]:
# set encoding='utf-8' if codec error occurs
with open('../gdit_articles/titles.txt', 'r', encoding='cp1252') as file:
    lines = file.readlines()
titles = [x.strip() for x in lines]

In [None]:
df = pd.DataFrame(list(zip(titles, articles)), columns = ['Title', 'Article'])
df.head(20)

## Data Cleaning and Wrangling

- Remove redundant footers from text
- Create feature for article's introduction(i.e. first page)
- Create feature for article tags and related sections

In [None]:
footer1 = '© 2023 General Dynamics Information Technology, Inc., a General Dynamics  Company. Capabilities Industries Perspectives Careers About GDIT Contact Us General Dynamics Information Technology is an Equal Opportunity/Affirmative Action employer. All qualified applicants will receive consideration for employment without regard to race, color, religion, sex, sexual orientation, gender identity, national origin, disability, or veteran status, or any other protected class. Privacy Policy Legal Terms CA Consumer Privacy EU-U.S. Privacy Shield Privacy Statement EU General Data Protection Regulation Site Map'
footer2 = 'You may opt-out at any time by   contacting us contacting us   or using the link in our emails. For more details, review our   privacy policy privacy policy .'
footer3 = 'MONTHLY NEWSLETTER Subscribe to our newsletter. Get thought leadership delivered once a month. Email  By submitting your contact information you agree that GDIT and its partners may send you marketing information. You may opt-out at any time by contacting us contacting us   or using the link in our emails. For more details, review our   privacy policy privacy policy .'
footer4 = 'SHARE SHARE'
footer5 = 'MONTHLY NEWSLETTER Subscribe to our newsletter. Get thought leadership delivered once a month. Email  By submitting your contact information you agree that GDIT and its partners may send you marketing information.'
footers_to_delete = [footer1, footer2, footer3, footer4]
for footer in footers_to_delete: 
    df['Article'] = df.Article.str.replace(footer, '')
# Save changes to CSV
# outfile = '../csv/gdit_articles.csv'
df.to_csv(outfile, sep=',', encoding='utf-8', index=False)

### Create Features: First Page, Tags, Related

Retrieve the introduction(i.e. first page) from each article

In [None]:
# Make FirstPage/Introduction data column
def get_intro_text(pdf_path):
    with open(pdf_path, "rb") as pdfFileObj:
        pdfReader = PyPDF2.PdfReader(pdfFileObj)
        return get_page_text(pdfReader.pages[0])

df['Page1'] = ""
for i in tqdm_notebook(range(222)):
    pdf_path = f'../gdit_articles/{i}.pdf'
    df.at[i, 'Page1'] = get_intro_text(pdf_path)

Extract Tags and Related data

In [None]:
# Searching for TAG(S) abd Related in articles
# Article TAGS may contain important keyword information
dfTag = df[df['Article'].str.contains("TAGS", case=True)]
dfTag[['Title', 'Article']]

# Find articles with tags, fill NA's from articles w/o tags
df['Tags'] = df['Article'].str.extract(r"TAGS (.*) Related").fillna("") 
df['Related'] = df['Article'].str.extract(r"TAGS.*Related (.*)").fillna("")

# Clean Tags and Related features
df['Tags'] = df['Tags'].str.replace("TAGS", "").str.strip()
df['Related'] = df['Related'].str.replace(r"Related", "").str.strip()

# Extracting Keywords and Phrases 

import spacy to extract keywords from articles (don't forget to pip install in the terminal)

In [11]:
# Run if didnt run the code to generate the df already
df = pd.read_csv('../csv/gdit_articles.csv')

In [5]:
def get_keywords(text):
    
    words, phrases, both = [], [], []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] #extracting proper nouns, adjs, and nouns
    doc = nlp(text.lower()) #returns a Doc with Token objects
    
    # Extract key phrases
    for chunk in doc.noun_chunks:
        
        final = ""
        
        for token in chunk:
            if (token.lemma_ in punctuation or token.text == 'gdit'):
                continue
                
            if (token.pos_ in pos_tag):
                final = final + token.lemma_ + " "
        
        if " " in final.strip():
            phrases.append(final.strip())
            both.append(final.strip())
    
    # Extract keywords
    for token in doc:
        
        # If the token isn't a stop word or punctuation, continue
        if (token.lemma_ in stopwords or token.lemma_ in punctuation or token.text == 'gdit'):
            continue
            
        # Store result if it's one of the parts of speech in the list
        if (token.pos_ in pos_tag):
            words.append(token.lemma_)
            both.append(token.lemma_)
                
    return words, phrases, both

get a list of hot words

if we want we can create a column for hashtags after we have keywords - could help with querying or an output from the GUI

examples to get the 5 most common key words/hashtags (use the list with duplicates)

applying the hotword/hashtag code above to the whole dataframe and making a column for the results

read that using a dictionary to iterate is faster than iterrows()

In [13]:
#df_dict = df.loc[:4].to_dict('records')
df_dict = df.to_dict('records')
word_col, phrase_col, all_col = [], [], []
for row in tqdm_notebook(df_dict, desc = "Extracting keywords"): #tqdm displays a progress bar
    words, phrases, both = get_keywords(row['Article']) #get hotwords for each row of the articles column
    top_words = ', '.join([x[0] for x in Counter(words).most_common(10)])
    top_phrases = ', '.join([x[0] for x in Counter(phrases).most_common(10)])
    #top_both = ', '.join([x[0] for x in Counter(both).most_common(10)])
    word_col.append(top_words)
    phrase_col.append(top_phrases)
    #all_col.append(top_both)

Extracting keywords:   0%|          | 0/222 [00:00<?, ?it/s]

In [17]:
df.head()

Unnamed: 0,Title,Abstract Summary,Extraction Summary,Keywords,Keyphrases,Tags,Page1,Article
0,AI Tool Accelerates Skin Cancer Detection in V...,The GDIT skin lesion classifier tool was devel...,The classifiers were trained on a library of 3...,"['skin', 'image', 'lesion', 'veteran', 'tool',...","['skin lesion', 'skin cancer', 'skin disease',...",,"recognized skin diseases, malignant or benign ...","recognized skin diseases, malignant or benign..."
1,Arkansas: A First in Payment Reform,Arkansas became the first state to successfull...,Antibiotic use for unspecified upper-respirato...,"['cost', 'state', 'care', 'datum', 'provider',...","['payment reform', 'big datum', 'data analytic...",,DATA ANALYTICS Arkansas: A First in Payment Re...,DATA ANALYTICS Arkansas: A First in Payment R...
2,Cancer Research Empowered by the Cloud and Mac...,Cancer Research Empowered by the Cloud and Mac...,types of cancer can be studied by researchers ...,"['cancer', 'datum', 'research', 'cloud', 'nci'...","['machine learning', 'cancer researcher', 'hea...",,HEALTH Cancer Research Empowered by the Cloud ...,HEALTH Cancer Research Empowered by the Cloud...
3,Air Force Distributed Common Ground System Adv...,Air Force Distributed Common Ground System (AF...,DIGITAL MODERNIZATION Air Force Distributed C...,"['system', 'air', 'force', 'dcgs', 'datum', 'i...","['air force', 'common ground system', 'joint o...",,DIGITAL MODERNIZATION Air Force Distributed Co...,DIGITAL MODERNIZATION Air Force Distributed C...
4,Twin Supercomputers Power Weather Forecasting,WCOSS 2 supercomputers for the National Weathe...,Each of these new supercomputers provides to N...,"['weather', 'computing', 'power', 'noaa', 'sup...","['computing power', 'vital information', 'wate...",,HIGH-PERFORMANCE COMPUTING Twin Supercomputers...,HIGH-PERFORMANCE COMPUTING Twin Supercomputer...


In [19]:
#df['Keywords and Phrases'] = all_col
df = df.drop(columns=['Keywords', 'Keyphrases'])
df.insert(3, 'Keywords', word_col) # Add the column to the dataframe
df.insert(4, 'Keyphrases', phrase_col)

# Re-order columns
#df = df[['Title', 'Page1', 'Keywords', 'Keyphrases', 'Tags', 'Article']]
df.tail()

Unnamed: 0,Title,Abstract Summary,Extraction Summary,Keywords,Keyphrases,Tags,Page1,Article
217,The Quantum Impact on Cyber,Quantum computers are more efficient than clas...,By Dr. Jim Matney Dr. Jim Matney Vice Presid...,"quantum, agency, risk, cyber, solution, datum,...","quantum threat, new standard, quantum computin...",CYBER QUANTUM By Dr. Jim Matney Dr. Jim Mat...,As almost any cybersecurity professional would...,As almost any cybersecurity professional woul...
218,Quantum Computing: What Agencies Need to Know,Quantum computing is a disruptive technology t...,"In November 2021, IBM unveiled their 127-Qub...","quantum, computer, computing, agency, intellig...","quantum computer, quantum computing, federal a...",ARTIFICIAL INTELLIGENCE HEALTH CYBER QUANTUM ...,Most people have heard of supercomputing. Most...,Most people have heard of supercomputing. Mos...
219,AI Powers Up Cyber Resilience: A Look Back at ...,Team Barcelona won the Cyber Hackathon at GDIT...,By Dr. Matthew McFadden Dr. Matthew McFadden...,"cyber, team, cybersecurity, capability, partne...","cyber incident, min watch cloud, threat landsc...",CYBER OUR CULTURE By Dr. Matthew McFadden D...,Given the rise in cyber incidents and changing...,Given the rise in cyber incidents and changin...
220,Protecting Workloads Using Zero Trust: 5 Steps...,"According to zero trust architecture, an appli...",2. Implement Micro-Segmentation and Run-Time T...,"application, security, workload, trust, softwa...","application workload, virtual machine, securit...",APPLICATION SERVICES CYBER ZERO TRUST By Ja...,To fully understand the “application workload”...,To fully understand the “application workload...
221,Seven Ways Zero Trust Delivers ROI to Agencies,Zero Trust is a security approach that assumes...,TAGS TAGS CYBER ZERO TRUST By Matt Hayden M...,"trust, security, agency, cyber, benefit, netwo...","security approach, federal government, min wat...",CYBER ZERO TRUST By Matt Hayden Matt Hayden...,"As is becoming more and more widely known, Ze...","As is becoming more and more widely known, Z..."


In [None]:
# Save changes to CSV
outfile = '../csv/gdit_articles.csv'
df.to_csv(outfile, sep=',', encoding='utf-8', index=False)

# Generate Extraction-Based Summaries

In [None]:
df = pd.read_csv('../csv/gdit_articles.csv')

In [7]:
# Percent is the percent of the highest ranked sentences
def summarize(text, percent):
    
    doc = nlp(text)
    tokens = [token.text for token in doc]
    word_freq = {}
    
    # Count word frequencies
    for token in doc:
        if token.text not in stopwords and token.text not in punctuation:
            if token.text not in word_freq.keys():
                word_freq[token.text] = 1
            else:
                word_freq[token.text] += 1
    
    max_freq = max(word_freq.values())
    
    # Normalize the word counts
    for word in word_freq.keys():
        word_freq[word] = word_freq[word]/max_freq
    
    sentence_tokens = [sent for sent in doc.sents]
    sentence_scores = {}
    
    # Normalized count for each sentence
    for sent in sentence_tokens:
        for token in sent:
            if token.text in word_freq.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_freq[token.text]
                else:
                    sentence_scores[sent] += word_freq[token.text]

    select_len = int(len(sentence_tokens) * percent)
    
    # Return a list containing top sentences
    summary = nlargest(5, sentence_scores, key=sentence_scores.get)
    # Convert the sentences from spacy.tokens.span.Span objects to strings
    final_summary = [word.text for word in summary]
    summary = ''.join(final_summary)
    
    return summary

In [8]:
df_dict = df.to_dict('records')
summ_col = []

# Create extracted-based summaries for each article
for row in tqdm_notebook(df_dict, desc = "Extracting summaries"):
    
    summary = summarize(row['Article'], 0.1)
    summ_col.append(summary)

Extracting summaries:   0%|          | 0/222 [00:00<?, ?it/s]

In [9]:
print(summ_col[0])

The classifiers were trained on a library of 30,000 publicly available skin lesion images, labeled with seven skin diseases: melanocytic nevi (benign), melanoma (malignant), benign keratosis-like lesions (benign) basal cell carcinoma (malignant), actinic keratoses and intraepithelial carcinoma (malignant), vascular lesions (benign) and dermatofibroma (benign).Read more: Department of Veterans Affairs soft-testing AI tool to speed skin cancer diagnosis How an AI-powered tool could help diagnose skin cancer in veterans    It uses deep learning artificial intelligence (AI) to classify images of skin lesions into seven common categories, determine if an image is indicative of a common skin disease, and recommend immediate follow-up care.skin lesion images used to train the AI model open-source deep learning image classifiers evaluated AI Applied to Improve Veteran Health recognized skin diseases, malignant or benign ARTIFICIAL INTELLIGENCE AI Tool Accelerates Skin Cancer Detection in Veter

In [10]:
df.head()

Unnamed: 0,Title,Summary,Page1,Keywords and Phrases,Keywords,Keyphrases,Tags,Related,Article
0,AI Tool Accelerates Skin Cancer Detection in V...,The classifiers were trained on a library of 3...,"recognized skin diseases, malignant or benign ...","['skin', 'image', 'lesion', 'veteran', 'tool',...","['skin', 'image', 'lesion', 'veteran', 'tool',...","['skin lesion', 'skin cancer', 'skin disease',...",,,"recognized skin diseases, malignant or benign..."
1,Arkansas: A First in Payment Reform,Antibiotic use for unspecified upper-respirato...,DATA ANALYTICS Arkansas: A First in Payment Re...,"['cost', 'state', 'care', 'datum', 'provider',...","['cost', 'state', 'care', 'datum', 'provider',...","['payment reform', 'big datum', 'data analytic...",,,DATA ANALYTICS Arkansas: A First in Payment R...
2,Cancer Research Empowered by the Cloud and Mac...,types of cancer can be studied by researchers ...,HEALTH Cancer Research Empowered by the Cloud ...,"['cancer', 'datum', 'research', 'cloud', 'nci'...","['cancer', 'datum', 'research', 'cloud', 'nci'...","['machine learning', 'cancer researcher', 'hea...",,,HEALTH Cancer Research Empowered by the Cloud...
3,Air Force Distributed Common Ground System Adv...,DIGITAL MODERNIZATION Air Force Distributed C...,DIGITAL MODERNIZATION Air Force Distributed Co...,"['system', 'air', 'force', 'dcgs', 'datum', 'i...","['system', 'air', 'force', 'dcgs', 'datum', 'i...","['air force', 'common ground system', 'joint o...",,,DIGITAL MODERNIZATION Air Force Distributed C...
4,Twin Supercomputers Power Weather Forecasting,Each of these new supercomputers provides to N...,HIGH-PERFORMANCE COMPUTING Twin Supercomputers...,"['weather', 'computing', 'power', 'noaa', 'sup...","['weather', 'computing', 'power', 'noaa', 'sup...","['computing power', 'vital information', 'wate...",,,HIGH-PERFORMANCE COMPUTING Twin Supercomputer...


In [None]:
# Insert the column of Summaries
df.insert(1, 'Summary', summ_col)

In [None]:
# Save changes to CSV
outfile = '../csv/gdit_articles.csv'
df.to_csv(outfile, sep=',', encoding='utf-8', index=False)

# New Section

In [3]:
df = pd.read_csv('../csv/gdit_articles.csv')

In [4]:
df.head()

Unnamed: 0,Title,Summary,Page1,Keywords and Phrases,Keywords,Keyphrases,Tags,Related,Article
0,AI Tool Accelerates Skin Cancer Detection in V...,The classifiers were trained on a library of 3...,"recognized skin diseases, malignant or benign ...","['skin', 'image', 'lesion', 'veteran', 'tool',...","['skin', 'image', 'lesion', 'veteran', 'tool',...","['skin lesion', 'skin cancer', 'skin disease',...",,,"recognized skin diseases, malignant or benign..."
1,Arkansas: A First in Payment Reform,Antibiotic use for unspecified upper-respirato...,DATA ANALYTICS Arkansas: A First in Payment Re...,"['cost', 'state', 'care', 'datum', 'provider',...","['cost', 'state', 'care', 'datum', 'provider',...","['payment reform', 'big datum', 'data analytic...",,,DATA ANALYTICS Arkansas: A First in Payment R...
2,Cancer Research Empowered by the Cloud and Mac...,types of cancer can be studied by researchers ...,HEALTH Cancer Research Empowered by the Cloud ...,"['cancer', 'datum', 'research', 'cloud', 'nci'...","['cancer', 'datum', 'research', 'cloud', 'nci'...","['machine learning', 'cancer researcher', 'hea...",,,HEALTH Cancer Research Empowered by the Cloud...
3,Air Force Distributed Common Ground System Adv...,DIGITAL MODERNIZATION Air Force Distributed C...,DIGITAL MODERNIZATION Air Force Distributed Co...,"['system', 'air', 'force', 'dcgs', 'datum', 'i...","['system', 'air', 'force', 'dcgs', 'datum', 'i...","['air force', 'common ground system', 'joint o...",,,DIGITAL MODERNIZATION Air Force Distributed C...
4,Twin Supercomputers Power Weather Forecasting,Each of these new supercomputers provides to N...,HIGH-PERFORMANCE COMPUTING Twin Supercomputers...,"['weather', 'computing', 'power', 'noaa', 'sup...","['weather', 'computing', 'power', 'noaa', 'sup...","['computing power', 'vital information', 'wate...",,,HIGH-PERFORMANCE COMPUTING Twin Supercomputer...
