# BBC News

### Fetching Data

In [1]:
from bs4 import BeautifulSoup as soup
import requests 
import pandas as pd
from datetime import date
from pymongo import MongoClient
import csv
import schedule
import time

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
def _clean_headline(raw_text):
    """Return *raw_text* with newlines removed and outer whitespace stripped."""
    return raw_text.replace("\n", "").strip()


def _collect_headlines(front_page, base_url):
    """Return ``(headline, link)`` pairs found on the parsed front page.

    Headline and link are read from the same element so they can never drift
    out of step. ``link`` is an absolute URL, or ``None`` when the element
    carries no ``href``. Entries with an empty headline are dropped.
    """
    from urllib.parse import urljoin

    found = []

    # Promo cards: the <h3> holds the text, its nested <a> (if any) the link.
    for title in front_page.find_all("h3", {"class": "media__title"}):
        anchor = title.a
        href = anchor.get("href") if anchor is not None else None
        found.append((_clean_headline(title.text), href))

    # Reel and top-list entries: the <a> is the link and wraps the <h3>.
    for css_class in ("reel__link", "top-list-item__link"):
        for anchor in front_page.find_all("a", {"class": css_class}):
            if anchor.h3 is None:
                # No heading inside this anchor; nothing to record.
                continue
            found.append((_clean_headline(anchor.h3.text), anchor.get("href")))

    # urljoin leaves absolute URLs alone and resolves relative ones against
    # base_url, with or without a leading slash.
    return [
        (headline, urljoin(base_url, href) if href else None)
        for headline, href in found
        if headline
    ]


def _fetch_description(link, timeout):
    """Return the first paragraph of the article at *link*, or ``""``.

    An empty string is returned when the request fails, or when the page has
    no matching article wrapper or no paragraph inside it.
    """
    try:
        page = requests.get(link, timeout=timeout)
    except requests.RequestException:
        # One unreachable article should not abort the whole scrape.
        return ""

    article_page = soup(page.content, features="lxml")
    # NOTE(review): this class name looks build-generated and may change when
    # the site is redeployed - confirm it still matches.
    article = article_page.find("article", {"class": "ssrcss-pv1rh6-ArticleWrapper"})
    if article is None or article.p is None:
        return ""
    return article.p.text.strip()


def run_code():
    """Scrape BBC front-page headlines and write them to ``<today>.csv``.

    Each headline is paired with the first paragraph of the article it links
    to. Headlines whose description cannot be fetched are kept with an empty
    description, and a headline that appears more than once is fetched only
    once.

    Returns:
        dict: headline -> description, the same data written to the CSV.

    Raises:
        requests.RequestException: if the front page itself cannot be fetched.
    """
    bbc_url = "https://www.bbc.com/"
    # Seconds. Without a timeout a stalled connection would block the
    # scheduler loop indefinitely.
    request_timeout = 30

    html = requests.get(bbc_url, timeout=request_timeout)
    bsobj = soup(html.content, features="lxml")

    finalized_news_one_go = {}
    for headline, link in _collect_headlines(bsobj, bbc_url):
        if headline in finalized_news_one_go:
            continue
        if link is None:
            finalized_news_one_go[headline] = ""
        else:
            finalized_news_one_go[headline] = _fetch_description(link, request_timeout)

    # Headlines become the index, descriptions the single column.
    df = pd.DataFrame.from_dict(finalized_news_one_go, orient="index", columns=["Description"])
    date_today = date.today()
    df.to_csv("{}.csv".format(date_today), index_label="Headlines")

    return finalized_news_one_go

### Scheduling Task

In [None]:
# Register the scraper to fire once every 60 minutes.
hourly_job = schedule.every(60).minutes
hourly_job.do(run_code)

# Poll forever: each pass runs whatever jobs are due, then sleeps for one
# second before checking again. This loop never returns on its own.
while True:
    schedule.run_pending()
    time.sleep(1)

### Storing Data Into Database

In [None]:
# Connect to MongoDB
# Open a client against the MongoDB server on localhost, default port 27017.
client = MongoClient("mongodb://localhost:27017")

# Select the database and collection by name (use your own names here).
db = client["NewsScrapping"]
collection = db["news"]

In [None]:
# The headline -> description mapping built inside run_code() is local to that
# function, so it is rebuilt here from the CSV that run_code() writes for the
# current day. dtype=str and keep_default_na=False keep every cell as text, so
# an empty description stays "" instead of becoming NaN.
news_frame = pd.read_csv("{}.csv".format(date.today()), dtype=str, keep_default_na=False)
finalized_news_one_go = dict(zip(news_frame["Headlines"], news_frame["Description"]))

# Skip the insert when nothing was scraped, to avoid storing an empty document.
if finalized_news_one_go:
    collection.insert_one(finalized_news_one_go)