In [122]:
!pip install -q vaderSentiment pymongo spacy

In [154]:
'''
Importing necessary libraries:
The code imports libraries for web scraping, regular expressions, MongoDB, spaCy for NER (Named Entity Recognition), 
displacy for visualization, and VADER SentimentIntensityAnalyzer for sentiment analysis.
'''
from bs4 import BeautifulSoup
import requests
import re
import pymongo
import spacy
from spacy import displacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [155]:
#Downloading spaCy model
!python -q -m spacy download en_core_web_lg

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.0/en_core_web_lg-3.7.0-py3-none-any.whl (587.7 MB)
     -------------------------------------- 587.7/587.7 MB 2.1 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [125]:
#Initializing NER and Sentiment Analyzer:
analyzer = SentimentIntensityAnalyzer()
NER = spacy.load("en_core_web_lg")

In [None]:
'''
Fetching and Preprocessing the Article:

The code fetches the HTML content of a given URL using requests and parses it using BeautifulSoup.
The main content of the HTML is extracted and converted to lowercase.
Some basic data preprocessing is performed, including removing extra whitespaces, text within square brackets, 
text within parentheses, and special characters using regular expressions.
'''

In [126]:
URL = "https://www.bbc.com/news/newsbeat-44124396"

In [127]:
html_content = requests.get(URL).text

In [128]:
soup = BeautifulSoup(html_content, "lxml")

In [129]:
# Extract the main content from the HTML
body = soup.body.text

In [130]:
# Convert text to lowercase
body = body.lower()

In [131]:
body[:1000]

'bbc homepageskip to contentaccessibility helpyour accounthomenewssportearthreelworklifetravelmore menumore menusearch bbchomenewssportearthreelworklifetravelculturefuturemusictvweathersoundsclose menubbc newsmenuhomeisrael-gaza warwar in ukraineclimatevideoworldasiaukbusinesstechmorescienceentertainment & artshealthworld news tvin picturesbbc verifynewsbeatworldafricaaustraliaeuropelatin americamiddle eastus & canadaisrael gaza war: history of the conflict explainedpublished4 days agoshareclose panelshare pagecopy linkabout sharingrelated topicsisrael-gaza warimage source, getty imagesthe palestinian militant group hamas launched an unprecedented assault on israel on 7 october, with hundreds of gunmen infiltrating communities near the gaza strip.more than 1,400 israelis were killed, while the israeli military says 230 soldiers and civilians, including women and children, were taken to gaza as hostages.more than 8,000 palestinians in gaza have been killed in air and artillery strikes c

In [132]:
len(body)

13614

In [133]:
# Data preprocessing steps
cleaned_text = re.sub(r'\s+', ' ', body)  # Remove extra whitespaces
cleaned_text = re.sub(r'\[.*?\]', '', body)  # Remove text within square brackets
cleaned_text = re.sub(r'\(.*?\)', '', body)  # Remove text within parentheses
cleaned_text = re.sub(r'[^A-Za-z0-9\s]', '', cleaned_text) # Remove special characters using regular expressions

In [134]:
cleaned_text[:1000]

'bbc homepageskip to contentaccessibility helpyour accounthomenewssportearthreelworklifetravelmore menumore menusearch bbchomenewssportearthreelworklifetravelculturefuturemusictvweathersoundsclose menubbc newsmenuhomeisraelgaza warwar in ukraineclimatevideoworldasiaukbusinesstechmorescienceentertainment  artshealthworld news tvin picturesbbc verifynewsbeatworldafricaaustraliaeuropelatin americamiddle eastus  canadaisrael gaza war history of the conflict explainedpublished4 days agoshareclose panelshare pagecopy linkabout sharingrelated topicsisraelgaza warimage source getty imagesthe palestinian militant group hamas launched an unprecedented assault on israel on 7 october with hundreds of gunmen infiltrating communities near the gaza stripmore than 1400 israelis were killed while the israeli military says 230 soldiers and civilians including women and children were taken to gaza as hostagesmore than 8000 palestinians in gaza have been killed in air and artillery strikes carried out by 

In [135]:
len(cleaned_text)

13278

In [136]:
# Apply NER to the text
doc = NER(cleaned_text)

In [137]:
# Iterate through the entities and display them with their labels
for ent in doc.ents[:10]:
    print(f"Entity: {ent.text}, Label: {ent.label_}")

Entity: bbc homepageskip, Label: PERSON
Entity: contentaccessibility helpyour, Label: ORG
Entity: menusearch bbchomenewssportearthreelworklifetravelculturefuturemusictvweathersoundsclose, Label: PERSON
Entity: menubbc newsmenuhomeisraelgaza warwar, Label: ORG
Entity: artshealthworld news, Label: ORG
Entity: tvin picturesbbc, Label: PERSON
Entity: verifynewsbeatworldafricaaustraliaeuropelatin americamiddle eastus  , Label: PERSON
Entity: canadaisrael gaza war, Label: ORG
Entity: explainedpublished4 days, Label: DATE
Entity: agoshareclose panelshare pagecopy linkabout sharingrelated, Label: PERSON


In [138]:
# Visualize the NER results using displacy
displacy.render(doc, style="ent", jupyter=True)

In [139]:
#MongoDB Connection:

from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017/")
db = client["ner_db"]
collection = db["ner_results"]

In [140]:
#checking whether a mongodb server is running or not.
try:
    client = MongoClient('localhost', 27017)
    db = client.admin
    server_info = db.command("serverStatus")
    print("MongoDB is running.")
except Exception as e:
    print("MongoDB is not running or cannot be reached.")

MongoDB is running.


In [141]:
#Storing NER Results in MongoDB:
ner_results = []
for ent in doc.ents:
    entity_data = {
        "text": ent.text,
        "label": ent.label_,
        "start_char": ent.start_char,
        "end_char": ent.end_char,
    }
    ner_results.append(entity_data)

In [142]:
collection.insert_many(ner_results)

<pymongo.results.InsertManyResult at 0x200334d8100>

In [143]:
# Find all documents
all_documents = collection.find()
print("\nAll documents:")
for document in all_documents:
    print(document)


All documents:
{'_id': ObjectId('6543b8ba3392dda60644c3b7'), 'text': 'BBC HomepageSkip', 'label': 'ORG', 'start_char': 0, 'end_char': 16}
{'_id': ObjectId('6543b8ba3392dda60644c3b8'), 'text': 'menuSearch', 'label': 'PERSON', 'start_char': 107, 'end_char': 117}
{'_id': ObjectId('6543b8ba3392dda60644c3b9'), 'text': 'ArtsHealthWorld News', 'label': 'ORG', 'start_char': 307, 'end_char': 327}
{'_id': ObjectId('6543b8ba3392dda60644c3ba'), 'text': 'VerifyNewsbeatWorldAfricaAustraliaEuropeLatin AmericaMiddle', 'label': 'ORG', 'start_char': 345, 'end_char': 404}
{'_id': ObjectId('6543b8ba3392dda60644c3bb'), 'text': 'Gaza', 'label': 'GPE', 'start_char': 427, 'end_char': 431}
{'_id': ObjectId('6543b8ba3392dda60644c3bc'), 'text': 'agoShareclose panelShare', 'label': 'PERSON', 'start_char': 486, 'end_char': 510}
{'_id': ObjectId('6543b8ba3392dda60644c3bd'), 'text': 'Getty', 'label': 'ORG', 'start_char': 580, 'end_char': 585}
{'_id': ObjectId('6543b8ba3392dda60644c3be'), 'text': 'Palestinian', 'lab

In [153]:
# Perform aggregation to count labels
pipeline = [
    {"$group": {"_id": "$label", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}}
]

result = list(collection.aggregate(pipeline))

# Print the label counts
for doc in result:
    label = doc["_id"]
    count = doc["count"]
    print(f"Label: {label}, Count: {count}")


Label: GPE, Count: 516
Label: NORP, Count: 396
Label: ORG, Count: 296
Label: DATE, Count: 152
Label: PERSON, Count: 122
Label: CARDINAL, Count: 112
Label: LOC, Count: 42
Label: EVENT, Count: 25
Label: QUANTITY, Count: 11
Label: PRODUCT, Count: 10
Label: TIME, Count: 8
Label: ORDINAL, Count: 5
Label: LAW, Count: 5
Label: WORK_OF_ART, Count: 2
Label: MONEY, Count: 1
Label: FAC, Count: 1


In [144]:
# Close the MongoDB connection
client.close()

Sentiment Analysis

* Stop words are removed, and lemmatization is applied to the text.
* Sentiment scores are calculated using the VADER SentimentIntensityAnalyzer on the cleaned text.
* A sentiment label ("Positive," "Negative," or "Neutral") is determined based on the compound sentiment score.

In [147]:
# Stop words removal and lemmatization
cleaned_tokens = [token.lemma_ for token in doc if not token.is_stop]

In [149]:
# Join the cleaned tokens back into a sentence
cleaned_text = " ".join(cleaned_tokens)

In [150]:
cleaned_tokens

['bbc',
 'homepageskip',
 'contentaccessibility',
 'helpyour',
 'accounthomenewssportearthreelworklifetravelmore',
 'menumore',
 'menusearch',
 'bbchomenewssportearthreelworklifetravelculturefuturemusictvweathersoundsclose',
 'menubbc',
 'newsmenuhomeisraelgaza',
 'warwar',
 'ukraineclimatevideoworldasiaukbusinesstechmorescienceentertainment',
 ' ',
 'artshealthworld',
 'news',
 'tvin',
 'picturesbbc',
 'verifynewsbeatworldafricaaustraliaeuropelatin',
 'americamiddle',
 'eastus',
 ' ',
 'canadaisrael',
 'gaza',
 'war',
 'history',
 'conflict',
 'explainedpublished4',
 'day',
 'agoshareclose',
 'panelshare',
 'pagecopy',
 'linkabout',
 'sharingrelated',
 'topicsisraelgaza',
 'warimage',
 'source',
 'getty',
 'imagesthe',
 'palestinian',
 'militant',
 'group',
 'hama',
 'launch',
 'unprecedented',
 'assault',
 'israel',
 '7',
 'october',
 'hundred',
 'gunman',
 'infiltrate',
 'community',
 'near',
 'gaza',
 'stripmore',
 '1400',
 'israelis',
 'kill',
 'israeli',
 'military',
 'say',
 '23

In [151]:
# Calculate sentiment scores using VADER on the cleaned text
sentiment_scores = analyzer.polarity_scores(cleaned_text)

# Determine sentiment label based on the compound score
sentiment_label = "Neutral"
if sentiment_scores["compound"] > 0.05:
    sentiment_label = "Positive"
elif sentiment_scores["compound"] < -0.05:
    sentiment_label = "Negative"

In [152]:
print(sentiment_scores)

{'neg': 0.193, 'neu': 0.711, 'pos': 0.096, 'compound': -0.9992}


The sentiment analysis for the provided text yields the following sentiment score:

* Negative (neg): 0.193
* Neutral (neu): 0.711
* Positive (pos): 0.096
* Compound: -0.9992
The compound score is -0.9992, indicating a highly negative sentiment. While the text does contain some neutral and slightly positive elements, the overall sentiment is predominantly negative.

Future enhancements:

1. Integration of advanced NLP models for more accurate entity recognition and sentiment analysis.
2. Implementation of a more robust data cleaning pipeline to handle complex text structures and diverse content formats.
3. Incorporation of a user interface for easier interaction and visualization of the NER and sentiment analysis results.
4. Expansion of the database capabilities to handle larger datasets and enable more complex queries and analyses.