In [4]:
import pandas as pd
import torch
import re
import spacy
from collections import Counter
import os

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from tqdm import tqdm

In [5]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

import warnings
warnings.simplefilter('ignore')

In [6]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Setting device to '{device}'")

Setting device to 'cuda'


In [7]:
from google.colab import drive
import json

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Define the file path
file_path = '/content/drive/My Drive/Colab Notebooks/NLP_Ignas/nlp_finalproject/'

In [9]:
# Load the saved CSV file back into notebook as df_cleaned_final
df_cleaned_final = pd.read_csv(os.path.join(file_path, 'df_cleaned_final.csv'))

# Display the DataFrame
print(df_cleaned_final.head(2))

   article_id  \
0           1   
1           2   

                                                                                                       url  \
0  http://spaceref.com/astronomy/observation-simulation-and-ai-join-forces-to-reveal-a-clear-universe.html   
1                                                    http://www.agoravox.it/Covid-19-un-messaggio-dai.html   

         date language  \
0  2021-07-05       en   
1  2020-03-13       en   

                                                                                    title  \
0         Observation Simulation And AI Join Forces To Reveal A Clear Universe - SpaceRef   
1  Covid-19 un messaggio dai ricercatori italiani ai colleghi stranieri - AgoraVox Italia   

                                                                                                                                                                                                                                                                  

In [10]:
df_cleaned_final.shape

(153749, 7)

In [11]:
def clean_text_for_ner(text):
    """
    Function to clean text for NER by:
    - Removing URLs.
    - Removing or replacing certain special characters.
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # For NER, consider keeping certain characters like hyphens, apostrophes, etc.
    text = re.sub(r'[^A-Za-z0-9\s\'\-\.]', '', text)

    return text

In [12]:
# Make a copy of df_news_final_project
df = df_cleaned_final.copy()

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153749 entries, 0 to 153748
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   article_id   153749 non-null  int64 
 1   url          153749 non-null  object
 2   date         153749 non-null  object
 3   language     153749 non-null  object
 4   title        153749 non-null  object
 5   text         153749 non-null  object
 6   token_count  153749 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 8.2+ MB


In [14]:
# Clean the 'text' and 'title' column
df['text_cleaned'] = df['text'].apply(clean_text_for_ner)
df['title_cleaned'] = df['title'].apply(clean_text_for_ner)

In [15]:
# Combine title and text to create a new column called "title_text_cleaned"
df['title_text_cleaned'] = df['text_cleaned'] + " " + df['title_cleaned']

Sentence Tokenization: spaCy uses sophisticated language rules and pre-trained statistical models to identify sentence boundaries, which makes it more robust than simple rule-based splitting (e.g., splitting by . or ?).

In [16]:
# Split document into sentences

# Load spaCy's model
nlp = spacy.load("en_core_web_sm", disable=["ner", "tagger", "lemmatizer", "attribute_ruler"])

def process_text(text):
    return [sent.text for sent in nlp(text).sents]

texts = ["Hello world. This is spaCy in action.", "How are you? I'm fine, thank you."]

sentences = [process_text(text) for text in texts]

Output:
[

    ["Hello world.", "This is spaCy in action."],

    ["How are you?", "I'm fine, thank you."]

]


In [17]:
texts = df['title_text_cleaned'].tolist()

In [19]:
# texts

In [22]:
texts[5]

"Data Science and Machine-Learning Platforms Market Growing Popularity and Emerging Trends SBWire Sign Up Login Our Service Plans Pricing Newsroom Help About HTF Market Intelligence Consulting Private Limited Email Alerts RSS Data Science and Machine-Learning Platforms Market Is Booming Worldwide Alteryx IBM RapidMiner Edison NJ -- SBWIRE -- 12142020 -- Global Data Science and Machine-Learning Platforms Market Report 2020 is latest research study released by HTF MI evaluating the market highlighting opportunities risk side analysis and leveraged with strategic and tactical decision-making support. The study provides information on market trends and development drivers capacities technologies and on the changing investment structure of the Global Data Science and Machine-Learning Platforms Market. Some of the key players profiled in the study are SAS Alteryx IBM RapidMiner KNIME Microsoft Dataiku Databricks TIBCO Software MathWorks H20.ai Anaconda SAP Google Domino Data Lab Angoss Lexal

In [23]:
len(texts)

153749

In [24]:
# Get the number of available CPU cores (leaving one for system operations)
cpu_cores = os.cpu_count()
cpu_cores

8

In [25]:
%%time

# ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=(cpu_cores-1)) as executor:
    sentences_list = list(executor.map(process_text, texts))

df['sentences'] = sentences_list

CPU times: user 5h 15min 1s, sys: 1h 10min 1s, total: 6h 25min 3s
Wall time: 1h 53min 45s


Stores the flattened data in a new DataFrame with three columns:

- article_id: Identifies the article the sentence belongs to.

- sentence_id: The position of the sentence within the article.

- sentence: The actual sentence text.

In [26]:
# Create a new DataFrame containing article_id, sentence_id, and sentence
sentences_data = []

# It processes each row of the DataFrame to extract the article_id and the list of sentences in the sentences column.
# For each sentence in the sentences list of a row, it assigns a unique sentence_id starting from 1 for that article.
# Converts the nested structure (articles containing multiple sentences) into a flat structure where each row represents a single sentence along with its article_id and sentence_id.

for index, row in df.iterrows():
    article_id = row['article_id']
    for sentence_id, sentence in enumerate(row['sentences'], start=1):
        sentences_data.append((article_id, sentence_id, sentence))

# Create a new DataFrame
sentences_df = pd.DataFrame(sentences_data, columns=['article_id', 'sentence_id', 'sentence'])

In [27]:
sentences_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6523835 entries, 0 to 6523834
Data columns (total 3 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   article_id   int64 
 1   sentence_id  int64 
 2   sentence     object
dtypes: int64(2), object(1)
memory usage: 149.3+ MB


In [28]:
# Save sentences_df with entities

# Specify the file path where the Parquet file will be saved
file_path = '/content/drive/My Drive/Colab Notebooks/NLP_Ignas/nlp_finalproject/'

In [29]:
# Save the DataFrame as a CSV file
sentences_df.to_csv(file_path + 'sentences.csv', index=False)

In [30]:
print(f"DataFrame saved to {file_path + 'sentences.csv'}")

DataFrame saved to /content/drive/My Drive/Colab Notebooks/NLP_Ignas/nlp_finalproject/sentences.csv


In [31]:
sentences_df.head()

Unnamed: 0,article_id,sentence_id,sentence
0,1,1,Observation Simulation And AI Join Forces To Reveal A Clear Universe - SpaceRef Home NASA Watch SpaceRef Business Astrobiology Web Advertising Add an Event Sign up for our Daily Newsletter International Space Station NASA Hack Space Calendar Missions Space Weather Observation Simulation And AI Join Forces To Reveal A Clear Universe Press Release - Source NATIONAL INSTITUTES OF NATURAL SCIENCES Posted July 4 2021 1000 PM View Comments Using AI driven data analysis to peel back the noise and f...
1,1,2,CREDIT The Institute of Statistical Mathematics Japanese astronomers have developed a new artificial intelligence AI technique to remove noise in astronomical data due to random variations in galaxy shapes.
2,1,3,After extensive training and testing on large mock data created by supercomputer simulations they then applied this new tool to actual data from Japan's Subaru Telescope and found that the mass distribution derived from using this method is consistent with the currently accepted models of the Universe.
3,1,4,This is a powerful new tool for analyzing big data from current and planned astronomy surveys.
4,1,5,Wide area survey data can be used to study the large-scale structure of the Universe through measurements of gravitational lensing patterns.


In [32]:
sentences_df.shape

(6523835, 3)

In [33]:
sentences_df['sentence'].iloc[0]

'Observation Simulation And AI Join Forces To Reveal A Clear Universe - SpaceRef Home NASA Watch SpaceRef Business Astrobiology Web Advertising Add an Event Sign up for our Daily Newsletter International Space Station NASA Hack Space Calendar Missions Space Weather Observation Simulation And AI Join Forces To Reveal A Clear Universe Press Release - Source NATIONAL INSTITUTES OF NATURAL SCIENCES Posted July 4 2021 1000 PM View Comments Using AI driven data analysis to peel back the noise and find the actual shape of the Universe.'

In [35]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [36]:
# Load spaCy model with specific components disabled to optimize for NER tasks
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "lemmatizer", "attribute_ruler"])

# Define a function to perform NER using spaCy
def spacy_ner(text):
    doc = nlp(text)
    # Return a list of entities found in the text along with their labels
    return [(ent.text, ent.label_) for ent in doc.ents]

In [37]:
# Example of applying the NER function to the first sentence in the sentences DataFrame
spacy_ner(sentences_df['sentence'].iloc[0])

[('July 4 2021', 'DATE'), ('AI', 'PRODUCT'), ('Universe', 'ORG')]

In [38]:
spacy_ner(sentences_df['sentence'].iloc[1])

[('CREDIT The Institute of Statistical Mathematics Japanese', 'ORG'),
 ('AI', 'PRODUCT')]

In [39]:
spacy_ner(sentences_df['sentence'].iloc[2])

[('Japan', 'GPE'), ('Subaru Telescope', 'PRODUCT'), ('Universe', 'ORG')]

Full list of **spaCy Named Entity Recognition (NER) labels** with their descriptions:

---

1. **PERSON**: People, including fictional characters.

2. **NORP**: Nationalities, religious groups, or political groups.

3. **FAC**: Buildings, airports, highways, bridges, and other man-made structures.

4. **ORG**: Organizations, companies, institutions, or government agencies.

5. **GPE**: Countries, cities, states—geopolitical entities.

6. **LOC**: Non-political locations, such as mountains, bodies of water, and other geographical features.

7. **PRODUCT**: Objects, vehicles, foods, and other tangible products.

8. **EVENT**: Named events of historical significance, including wars, sports events, and natural disasters.

9. **WORK_OF_ART**: Titles of creative works like books, songs, and paintings.

10. **LAW**: Named legal documents and legislation.

11. **LANGUAGE**: Named languages.

12. **DATE**: Dates or periods, including absolute and relative dates or periods.

13. **TIME**: Times smaller than a day, such as specific times of day.

14. **PERCENT**: Percentage values, including the percent sign.

15. **MONEY**: Monetary values, including currency units.

16. **QUANTITY**: Measurements, weights, or distances.

17. **ORDINAL**: Terms that denote an order or rank in a sequence.

18. **CARDINAL**: Numerals that do not fall under another type.

Apply the NER function to each sentence in the dataframe and show progress

In [40]:
%%time

tqdm.pandas(desc="NER Processing")

sentences_df['entities'] = sentences_df['sentence'].progress_apply(spacy_ner)

NER Processing: 100%|██████████| 6523835/6523835 [11:49:38<00:00, 153.22it/s]


CPU times: user 11h 51min 4s, sys: 1min 5s, total: 11h 52min 9s
Wall time: 11h 49min 38s


In [41]:
sentences_df.head(3)

Unnamed: 0,article_id,sentence_id,sentence,entities
0,1,1,Observation Simulation And AI Join Forces To Reveal A Clear Universe - SpaceRef Home NASA Watch SpaceRef Business Astrobiology Web Advertising Add an Event Sign up for our Daily Newsletter International Space Station NASA Hack Space Calendar Missions Space Weather Observation Simulation And AI Join Forces To Reveal A Clear Universe Press Release - Source NATIONAL INSTITUTES OF NATURAL SCIENCES Posted July 4 2021 1000 PM View Comments Using AI driven data analysis to peel back the noise and f...,"[(July 4 2021, DATE), (AI, PRODUCT), (Universe, ORG)]"
1,1,2,CREDIT The Institute of Statistical Mathematics Japanese astronomers have developed a new artificial intelligence AI technique to remove noise in astronomical data due to random variations in galaxy shapes.,"[(CREDIT The Institute of Statistical Mathematics Japanese, ORG), (AI, PRODUCT)]"
2,1,3,After extensive training and testing on large mock data created by supercomputer simulations they then applied this new tool to actual data from Japan's Subaru Telescope and found that the mass distribution derived from using this method is consistent with the currently accepted models of the Universe.,"[(Japan, GPE), (Subaru Telescope, PRODUCT), (Universe, ORG)]"


In [42]:
# Save the DataFrame as a CSV file
sentences_df.to_csv(file_path + 'sentences_with_entities.csv', index=False)

In [43]:
entities = sentences_df.drop(columns=['sentence'])

In [44]:
entities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6523835 entries, 0 to 6523834
Data columns (total 3 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   article_id   int64 
 1   sentence_id  int64 
 2   entities     object
dtypes: int64(2), object(1)
memory usage: 149.3+ MB


In [45]:
# Save the DataFrame as a CSV file
entities.to_csv(file_path + 'entities.csv', index=False)

### Now I want to count the entities.

Count and display the top N most common entities for specified entity types in a Pandas Series where each entry contains a list of entities.

In [46]:
sentences_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6523835 entries, 0 to 6523834
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   article_id   int64 
 1   sentence_id  int64 
 2   sentence     object
 3   entities     object
dtypes: int64(2), object(2)
memory usage: 199.1+ MB


In [47]:
# Count entities

# function
def count_top_entities(entities_series, entity_types, top_n):
    """
    Count top N entities for specified entity types in a series of entity lists and return the counts in table format.

    :param entities_series: Pandas Series containing lists of entities (tuples of entity text and entity type)
                            A Pandas Series where each row contains a list of tuples.
                            Each tuple represents an entity with two components:
                            entity_text: The text of the entity (e.g., "Google").
                            entity_type: The type of entity (e.g., "ORG").
    :param entity_types: A list of entity types to analyze (e.g., ['ORG', 'PRODUCT', 'GPE', 'PERSON']).
    :param top_n: The number of top entities to display for each specified entity type.
    :return: DataFrame showing the top N entities for each specified type

    """
    counters = {entity_type: Counter() for entity_type in entity_types}

    # Iterate over the series to count entities by type
    # Loops through each list of entities in entities_series.
    # For each entity_text and entity_type:
    # If the entity_type is in the specified entity_types, it increments the count of entity_text in the corresponding Counter.
    for entities in entities_series:
        for entity_text, entity_type in entities:
            if entity_type in entity_types:
                counters[entity_type][entity_text] += 1


    # Prepare the DataFrame to display the top N entities for each type
    top_entities_df = pd.DataFrame()

    # For each entity type:
    # Retrieves the top N entities using most_common(top_n).
    # Formats the result as "entity_text (count)" and stores it in a column of the DataFrame.
    for entity_type in entity_types:
        top_entities = counters[entity_type].most_common(top_n)
        top_entities_df[entity_type] = [f"{entity[0]} ({entity[1]})" for entity in top_entities]

    return top_entities_df

In [48]:
# Count entities by type
entity_types = ['LOC', 'ORG', 'PRODUCT', 'GPE', 'PERSON', 'LANGUAGE', 'EVENT', 'LAW']
top_n = 50

top_entities_df = count_top_entities(sentences_df['entities'], entity_types, top_n)
top_entities_df

Unnamed: 0,LOC,ORG,PRODUCT,GPE,PERSON,LANGUAGE,EVENT,LAW
0,Europe (14785),AI (288028),AI (659762),US (211148),Biden (13699),English (15892),the European Economic Area (1599),the Terms Conditions and Privacy Policy (1490)
1,Africa (6765),Google (86002),YouTube (6099),India (68629),Elon Musk (9106),Spanish (1235),World Cup (1539),the AI Act (1206)
2,North America (6470),Microsoft (68703),Android (5379),U.S. (53004),Musk (8987),Arabic (725),Olympics (1456),the Securities Act (528)
3,Middle East (5216),ChatGPT (66490),UsMeet (4234),China (50751),Trump (8703),Hindi (581),World Sunrise Inside (1089),our Visitor Agreement Privacy Policy (491)
4,Asia (5081),Gray Media Group Inc. (48833),Twitter (3711),PRNewswire (42050),Sam Altman (7616),Mandarin (483),Black Friday (833),Privacy Policy Terms Conditions Advertise With Us (457)
5,Silicon Valley (4212),Gray Media Group (32754),Windows (3598),UK (31996),GPT-4 (6705),French (196),Series (816),Chapter 3 (446)
6,Earth (3768),Gray Television Inc. (32272),JavaScript (3425),Japan (22564),Altman (5822),Datamaran (181),World War II (653),Section 27A (419)
7,Asia Pacific (2380),Nvidia (30745),Galaxy (3141),France (21444),CaptioningAudio DescriptionAt (5325),Chinese (141),CES 2024 (527),AI Act (357)
8,the Middle East (2306),Apple (30265),Facebook (3111),Russia (20785),Joe Biden (5061),Portuguese (133),Wimbledon (477),Article Distribution channels (319)
9,Latin America (2191),OpenAI (30228),Windows 11 (3031),California (19423),CaptioningAudio (4336),ShowBirthdaysPet (123),the New Space Race Will Drive Innovation How (432),the Privacy Policy and Terms of Service (313)
