## 3_NER (Named Entity Recognition)

In [1]:
import pandas as pd
import re
import spacy
from collections import Counter
import os

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from tqdm import tqdm

# Widen pandas' display limits so long text columns are readable in cell output
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings — consider narrowing the filter.
warnings.simplefilter('ignore')



In [2]:
%%time

# Load the input data from cleaned_data.parquet and show its (rows, columns) shape
df = pd.read_parquet('cleaned_data.parquet', engine='pyarrow')
df.shape

CPU times: user 1.45 s, sys: 1.77 s, total: 3.21 s
Wall time: 3.72 s


(157054, 7)

In [3]:
# Peek at the first rows to confirm which columns were loaded
df.head()

Unnamed: 0,doc_id,url,date,language,title,text,token_count
0,1,http://galusaustralis.com/2020/02/486473/legaltech-artificial-intelligence-market-2019-technology-advancement-and-future-scope-casetext-inc-catalyst-repository-systems-ebrevia/,2020-02-26,en,LegalTech Artificial Intelligence Market 2019 Technology Advancement and Future Scope Casetext Inc. Catalyst Repository Systems eBREVIA Galus Australis,LegalTech Artificial Intelligence Market 2019 Technology Advancement and Future Scope Casetext Inc. Catalyst Repository Systems eBREVIA Galus Australis Galus Australis BusinessGeneral NewsHealthcareIndustryInternationalLifestyleSci-Tech Wednesday February 26 2020 Trending Needle Counters Market Comprehensive Study by Companies Medline Industries Boen Healthcare Skin Scrub Trays Market Comprehensive Study by Companies Medline Industries BD Deroyal Global Portable Handheld Electronic Game Mach...,935
1,2,http://newsparliament.com/2020/02/27/children-with-autism-saw-their-learning-and-social-skills-boosted-after-playing-with-this-ai-robot/,2020-02-27,en,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot News Parliament,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot News Parliament Skip to content Thursday February 27 2020 Latest Mansplaining in conferences How can we get him to forestall Drax power station to cease burning coal in March 2021 Coronavirus Could Explode in the U.S. Overnight Like it Did in Italy Coronavirus Dettol sales surge as markets fall again Levi Strauss marks the next phase in corporate paid leave policies News Parliament Get the Real...,1557
4,3,http://www.millenniumpost.in/big-stories/ai-express-aircraft-skids-off-runaway-in-kozhikode-rescue-ops-on-414926,2020-08-07,en,Two dead as AI Express flight skids off Kozhikode airport,Two dead as AI Express flight skids off Kozhikode airportTopLoginSubscribeToggle navigationFeaturesDelhiKolkataNationOpinionEditorialBusinessSportsSunday PostBeyond BygoneEpic PowerIn RetrospectGlobal EyeInlandRoutesBeaconInsightGame OnSafariTrendingCandid TalkGastronomyFact FilesMapping the states of IndiaXsunday-postbeyond-bygonesunday-postepic-powerfact-filesians-feedssundaypostin-retrospectmapping-the-states-of-indiapuja-specialReminiscencesponsoredsundaypostglobal-eyefeaturesnationworld...,263
5,4,http://www.nativestew.com/2023/11/ai-took-my-job.html,2023-11-20,en,Native Stew - Bahamas AI Art Photos Videos AI Took My Job,Native Stew - Bahamas AI Art Photos Videos AI Took My Job Pages Home People Places Things Monday November 20 2023 AI Took My Job AI Took My Job - AI artA. Derek Catalano at November 20 2023 Email ThisBlogThisShare to TwitterShare to FacebookShare to Pinterest Labels Art People Newer Post Older Post Home Go Fund Me My daughter-in-law Gayle. Please help. Translate About Me A. Derek Catalano Photos videos and art featuring the people places and things of The Bahamas. I've traveled to many islan...,328
8,5,http://www.sbwire.com/press-releases/healthcare-artificial-intelligence-market-next-big-thing-major-giants-general-electric-medtronic-aicure-apixio-1363690.htm,2022-10-06,en,Healthcare Artificial Intelligence Market Analysis and Forecast for Next 5 Years,Healthcare Artificial Intelligence Market Analysis and Forecast for Next 5 Years SBWire Sign Up Login Our Service Plans Pricing Newsroom Help About AMA Research amp Media LLP Email Alerts RSS Healthcare Artificial Intelligence Market Next Big Thing Major Giants- General Electric Medtronic AiCure APIXIO Healthcare Artificial Intelligence Market 2022-2028 New Jersey USA -- SBWIRE -- 10052022 -- Advance Market Analytics published a new research publication on Healthcare Artificial Intelligence ...,780


In [5]:
def clean_text_for_ner(text):
    """
    Clean raw text before Named Entity Recognition.

    Steps:
    - Remove URLs (any run of non-whitespace characters starting with 'http').
    - Delete every character other than ASCII letters, digits, whitespace,
      apostrophes, hyphens and periods. Characters are deleted, not replaced
      by a space, so text on either side of a removed character is joined.

    :param text: Raw text. Non-string values (e.g. None or NaN coming from
                 missing cells) are treated as empty text.
    :return: The cleaned string, or '' when `text` is not a string.
    """
    # re.sub raises TypeError on None/NaN, so a single missing cell would
    # otherwise abort the whole Series.apply call.
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Keep apostrophes, hyphens and periods: they occur inside names such as
    # "O'Neil", "GPT-4" or "Inc." and help the NER model.
    text = re.sub(r'[^A-Za-z0-9\s\'\-\.]', '', text)

    return text

In [6]:
# Derive a cleaned companion column for both the article body and its title
for raw_col, cleaned_col in (('text', 'text_cleaned'), ('title', 'title_cleaned')):
    df[cleaned_col] = df[raw_col].apply(clean_text_for_ner)

In [7]:
# Raw (uncleaned) title of the first row, for reference
df['title'].iloc[0]

'LegalTech Artificial Intelligence Market 2019 Technology Advancement and Future Scope Casetext Inc. Catalyst Repository Systems eBREVIA Galus Australis'

In [8]:
# Raw (uncleaned) text of the first row, for reference
df['text'].iloc[0]

'LegalTech Artificial Intelligence Market 2019 Technology Advancement and Future Scope Casetext Inc. Catalyst Repository Systems eBREVIA Galus Australis Galus Australis BusinessGeneral NewsHealthcareIndustryInternationalLifestyleSci-Tech Wednesday February 26 2020 Trending Needle Counters Market Comprehensive Study by Companies Medline Industries Boen Healthcare Skin Scrub Trays Market Comprehensive Study by Companies Medline Industries BD Deroyal Global Portable Handheld Electronic Game Machine Market Outlook and Business Insights 2020-2026 Apollo Games Sony Aristocrat Leisure IGT Infectious Disease Testing Using PCR for IVD Market Comprehensive Study by Companies Thermo Fisher BD Roche Diagnostics Veterinary Dental X-ray Generators Market Comprehensive Study by Companies Planmeca Midmark Medicatech USA Veterinary Ophthalmoscopes Market Comprehensive Study by Companies Heine Optotechnik Gowllands Limited Veterinary Holters Market Comprehensive Study by Companies Dextronix Nasiff Assoc

In [9]:
# Append the cleaned title to the cleaned text, separated by a single space.
# The left-hand side is rebuilt from the raw 'text' column instead of reading
# 'text_cleaned' back, so re-running this cell does not append the title again.
df['text_cleaned'] = df['text'].apply(clean_text_for_ner) + " " + df['title_cleaned']

In [10]:
# Split document into sentences

# Load spaCy's small English model with the components that are not needed
# for sentence splitting disabled; the parser stays enabled.
sentence_nlp = spacy.load("en_core_web_sm", disable=["ner", "tagger", "lemmatizer", "attribute_ruler"])
# Keep the original global name bound for any code that still refers to `nlp`.
nlp = sentence_nlp

def process_text(text):
    """
    Split one document into sentences.

    Uses the dedicated `sentence_nlp` pipeline rather than the global `nlp`:
    `nlp` is rebound later in this notebook to a pipeline loaded with the
    parser disabled, and calling this function after that point would
    otherwise fail to produce sentences.

    :param text: Document text to segment
    :return: List of sentence strings, in document order
    """
    return [sent.text for sent in sentence_nlp(text).sents]

texts = df['text_cleaned'].tolist()

In [11]:
# Number of CPU cores reported by the OS.
# Nothing is subtracted here; the "leave one core free" adjustment is applied
# where the thread pool is sized (cpu_cores - 1).
# Note: os.cpu_count() returns None when the count cannot be determined.
cpu_cores = os.cpu_count()
cpu_cores

8

In [12]:
%%time

# ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=(cpu_cores-1)) as executor:
    sentences_list = list(executor.map(process_text, texts))

df['sentences'] = sentences_listd

CPU times: user 4h 33min 49s, sys: 1h 9min 28s, total: 5h 43min 17s
Wall time: 1h 21min 27s


In [13]:
# Create a new DataFrame containing doc_id, sentence_id, and sentence:
# one row per sentence, with sentence_id counting from 1 within each document.
# Zipping the two needed columns avoids DataFrame.iterrows(), which builds a
# full Series object for every row.
sentences_data = [
    (doc_id, sentence_id, sentence)
    for doc_id, doc_sentences in zip(df['doc_id'], df['sentences'])
    for sentence_id, sentence in enumerate(doc_sentences, start=1)
]

# Create a new DataFrame
sentences_df = pd.DataFrame(sentences_data, columns=['doc_id', 'sentence_id', 'sentence'])

In [14]:
# Preview the sentence-level frame (doc_id, sentence_id, sentence)
sentences_df.head()

Unnamed: 0,doc_id,sentence_id,sentence
0,1,1,LegalTech Artificial Intelligence Market 2019
1,1,2,Technology Advancement and Future Scope Casetext Inc.
2,1,3,Catalyst Repository Systems eBREVIA Galus Australis Galus Australis BusinessGeneral NewsHealthcareIndustryInternationalLifestyleSci-Tech Wednesday February 26 2020 Trending Needle Counters Market Comprehensive Study by Companies Medline Industries Boen Healthcare Skin Scrub Trays Market Comprehensive Study by Companies Medline Industries BD Deroyal Global Portable Handheld Electronic Game Machine Market Outlook and Business Insights 2020-2026 Apollo Games Sony Aristocrat Leisure IGT Infectio...
3,1,4,Catalyst Repository Systems eBREVIA
4,1,5,General NewsLegalTech Artificial Intelligence Market 2019 Technology Advancement and Future Scope Casetext Inc.


In [15]:
# (number of sentence rows, number of columns)
sentences_df.shape

(7068447, 3)

In [16]:
# Save sentences_df (doc_id, sentence_id, sentence); entities have not been extracted yet at this point

# Specify the file path where the Parquet file will be saved
file_path = 'sentence.parquet'
# Save the DataFrame as a Parquet file
sentences_df.to_parquet(file_path)

In [18]:
# NER pipeline: the large English model, loaded with the components that are
# not needed for entity recognition switched off
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "lemmatizer", "attribute_ruler"])

def spacy_ner(text):
    """
    Run the spaCy pipeline on `text` and collect its named entities.

    :param text: Text to analyse
    :return: List of (entity text, entity label) tuples, in the order spaCy reports them
    """
    found = []
    for ent in nlp(text).ents:
        found.append((ent.text, ent.label_))
    return found

# Quick check: run the function on the first sentence of the sentences DataFrame
spacy_ner(sentences_df['sentence'].iloc[0])

[('LegalTech', 'ORG')]

In [19]:
# Kept from the original cell so `.progress_apply` stays registered on pandas
# objects for any later cell that uses it.
tqdm.pandas(desc="NER Processing")

# Stream the sentences through the pipeline in batches with nlp.pipe instead
# of calling nlp(text) once per sentence; with millions of short sentences the
# per-call overhead dominates the run time.
NER_BATCH_SIZE = 1000
sentence_docs = nlp.pipe(sentences_df['sentence'], batch_size=NER_BATCH_SIZE)

# Same output shape as before: one list of (entity text, entity label) tuples
# per sentence. The explicit index keeps each list aligned with its row.
sentences_df['entities'] = pd.Series(
    [
        [(ent.text, ent.label_) for ent in doc.ents]
        for doc in tqdm(sentence_docs, total=len(sentences_df), desc="NER Processing")
    ],
    index=sentences_df.index,
    dtype=object,
)

NER Processing: 100%|██████████| 7068447/7068447 [7:14:41<00:00, 271.01it/s]   


In [20]:
# Preview the sentences alongside their extracted (entity text, label) tuples
sentences_df.head()

Unnamed: 0,doc_id,sentence_id,sentence,entities
0,1,1,LegalTech Artificial Intelligence Market 2019,"[(LegalTech, ORG)]"
1,1,2,Technology Advancement and Future Scope Casetext Inc.,"[(Technology Advancement and Future Scope Casetext Inc., ORG)]"
2,1,3,Catalyst Repository Systems eBREVIA Galus Australis Galus Australis BusinessGeneral NewsHealthcareIndustryInternationalLifestyleSci-Tech Wednesday February 26 2020 Trending Needle Counters Market Comprehensive Study by Companies Medline Industries Boen Healthcare Skin Scrub Trays Market Comprehensive Study by Companies Medline Industries BD Deroyal Global Portable Handheld Electronic Game Machine Market Outlook and Business Insights 2020-2026 Apollo Games Sony Aristocrat Leisure IGT Infectio...,"[(Catalyst Repository Systems, ORG), (Wednesday February 26 2020, DATE), (Trending Needle Counters Market Comprehensive Study by Companies Medline Industries Boen Healthcare Skin Scrub Trays Market Comprehensive Study by Companies Medline Industries BD Deroyal Global Portable Handheld Electronic Game Machine Market Outlook and Business Insights, EVENT), (2020-2026, DATE), (Apollo Games, ORG), (Sony, ORG), (Aristocrat, NORP), (Leisure IGT Infectious Disease Testing Using PCR, ORG), (Optotechn..."
3,1,4,Catalyst Repository Systems eBREVIA,"[(Catalyst Repository Systems, ORG)]"
4,1,5,General NewsLegalTech Artificial Intelligence Market 2019 Technology Advancement and Future Scope Casetext Inc.,"[(NewsLegalTech, PERSON)]"


In [21]:
# Copy of sentences_df without the 'sentence' text column; this is the frame that gets saved next
sentences_df_drop = sentences_df.drop(columns=['sentence'])

In [22]:
# Save the entity table (sentences_df with the 'sentence' column dropped)

# Specify the file path where the Parquet file will be saved
file_path = 'entities.parquet'
# Save the DataFrame as a Parquet file
sentences_df_drop.to_parquet(file_path)

In [23]:
# Count entities

# function
def count_top_entities(entities_series, entity_types, top_n):
    """
    Count top N entities for specified entity types in a series of entity lists and return the counts in table format.

    Each output column holds strings of the form "<entity text> (<count>)",
    most frequent first. A type with fewer than `top_n` distinct entities gets
    a shorter column that is padded with NaN (the original raised ValueError
    because the column lengths did not match).

    :param entities_series: Pandas Series containing lists of entities (tuples of entity text and entity type)
    :param entity_types: List of entity types to count (e.g., ['ORG', 'PRODUCT', 'GPE', 'PERSON'])
    :param top_n: Number of top entities to display for each type
    :return: DataFrame showing the top N entities for each specified type
    """
    counters = {entity_type: Counter() for entity_type in entity_types}

    # Iterate over the series to count entities by type
    for entities in entities_series:
        if entities is None:
            # Missing entry: nothing to count
            continue
        for entity_text, entity_type in entities:
            counter = counters.get(entity_type)
            if counter is not None:
                counter[entity_text] += 1

    # One Series per type; building the frame from Series lets pandas align
    # columns of different lengths and fill the gaps with NaN.
    columns = {}
    for entity_type in entity_types:
        top_entities = counters[entity_type].most_common(top_n)
        columns[entity_type] = pd.Series(
            [f"{text} ({count})" for text, count in top_entities],
            dtype=object,
        )

    return pd.DataFrame(columns)

In [24]:
# Count entities by type: the 50 most frequent entity strings for each of the four labels below
entity_types = ['ORG', 'PRODUCT', 'GPE', 'PERSON']
top_n = 50

# Counts are taken over every sentence's entity list
top_entities_df = count_top_entities(sentences_df['entities'], entity_types, top_n)
top_entities_df

Unnamed: 0,ORG,PRODUCT,GPE,PERSON
0,AI (306226),AI (690325),US (188410),Biden (17056)
1,Google (87210),Android (7304),India (69997),Musk (11102)
2,ChatGPT (83414),YouTube (6209),U.S. (57197),Elon Musk (10749)
3,Microsoft (71488),UsMeet (5170),China (57029),Trump (9220)
4,Gray Media Group Inc. (61328),Bing (4324),PRNewswire (52816),Sam Altman (8654)
5,Gray Media Group (41157),Twitter (4089),UK (39021),GPT-4 (8323)
6,Gray Television Inc. (40682),Facebook (3802),Japan (24611),Altman (7155)
7,OpenAI (30590),Google Cloud (3408),Canada (24597),CaptioningAudio DescriptionAt (6768)
8,Facebook (30120),JavaScript (3338),France (23808),Joe Biden (5841)
9,Amazon (26271),Windows (3121),Russia (23286),CaptioningAudio (5507)


Note: In the final analysis, topics labeled as 'other' have been excluded, which has resulted in different rankings for entities (refer to the 6_Summary file)

- After chunking each article into sentences, Named Entity Recognition (NER) was performed using spaCy.
- The most frequent recognized entities are listed in the table above. Some category misclassifications are observed, but overall, the recognition quality looks high. 
- A detailed analysis of each entity will be presented in the 6_Summary notebook. In that analysis, entities with incorrect categories or those acting as noise (e.g., the news media name 'Gray Media Group' or the overly common 'AI') were manually excluded. Additionally, variant spellings of the United States, such as 'US' and 'U.S.', were standardized.