# News scraper

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import sys

if "/content/drive/MyDrive/INDUSTRIAL_APPLICATIONS_TRAVEL_TALES" not in sys.path:
  sys.path.append("/content/drive/MyDrive/INDUSTRIAL_APPLICATIONS_TRAVEL_TALES")


BASE_PATH = "/content/drive/MyDrive/INDUSTRIAL_APPLICATIONS_TRAVEL_TALES/"

model_path = "/content/drive/MyDrive/INDUSTRIAL_APPLICATIONS_TRAVEL_TALES/models/bbc-model"

collection_path = "/content/drive/MyDrive/INDUSTRIAL_APPLICATIONS_TRAVEL_TALES/collection/collection.csv"

Mounted at /content/drive


In [2]:
!pip install feedparser > 0
!pip install bs4 > 0
!pip install transformers > 0
!pip install sumy > 0
#!pip install fakeyou > 0
!pip install TTS > 0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires kaleido, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.
plotnine 0.12.4 requires numpy>=1.23.0, but you have numpy 1.22.0 which is incompatible.
pywavelets 1.5.0 requires numpy<2.0,>=1.22.4, but you have numpy 1.22.0 which is incompatible.
tensorflow 2.15.0 requires numpy<2.0.0,>=1.23.5, but you have numpy 1.22.0 which is incompatible.[0m[31m
[0m

In [4]:
#FAKEYOU_CDN_BASELINK = "https://storage.googleapis.com/vocodes-public"
#
#FAKEYOU_USERNAME = "pippoJ"
#FAKEYOU_PASSWORD = "password"
#FAKEYOU_MODEL_GERRY_SCOTTI = "TM:5ggf3m5w2mhq"
#FAKEYOU_MODEL_TOKEN = FAKEYOU_MODEL_GERRY_SCOTTI

In [3]:
import feedparser
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
import numpy as np
import torch


from ast import literal_eval

#from fakeyou import FakeYou
from TTS.api import TTS

TTS_MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
TTS_OUTPUT_DIR = BASE_PATH + "generated_audio/"

VOICE_SAMPLE_FILE_PATH = BASE_PATH + "voices/renzi.wav"

from transformers import TextClassificationPipeline, pipeline

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

import re

import nltk
nltk.download('punkt')

# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"


def summarize_article(article_text, num_sentences=3):

    # Use extractive summarization with LSA
    summarizer = LsaSummarizer()
    parser = PlaintextParser.from_string(article_text, Tokenizer("english"))
    summary_sentences = summarizer(parser.document, num_sentences)

    # Convert the summary sentences to a single string
    summary = ' '.join(str(sentence) for sentence in summary_sentences)

    return summary


def parse_date(date_struct_time):
    date_str = time.strftime('%Y-%m-%d %H:%M:%S', date_struct_time)
    return date_str

def normalize_embedding(list):
    vector = np.array(list)
    normalized_vector = vector / np.linalg.norm(vector)
    return normalized_vector.tolist()

class NewsScraper:
    def __init__(self, embedder_pipe : TextClassificationPipeline, summarization_pipe, csv_file_path=None, fakeyou_obj = None):
        #self.rss_url = rss_url

        self.embedder = embedder_pipe
        self.summarizer = summarization_pipe
        self.tts = TTS(TTS_MODEL).to(device)
        self.fakeyou = fakeyou_obj
        self.entries = []
        if csv_file_path is not None and os.path.isfile(csv_file_path):
          def f(x):
              x = str(x)
              if "," not in x:
                x = x.replace(" ", ", ")
              try:
                  return literal_eval(str(x))
              except Exception as e:
                  print(e)
                  print("given x:" , x)
                  return []

          conv = {'Embedding': lambda x: f(x)}
          self.df = pd.read_csv(csv_file_path, converters=conv, index_col=0)
          print(f"Loaded dataframe of {len(self.df)} rows")
        else:
          print("Creating new empty dataframe")
          self.df = pd.DataFrame(columns=['Link', 'Title', 'Summary', 'Article', 'Date', 'Embedding'])  #, index=["Link"]


    def is_already_present(self, link) -> bool:
        return link in self.df["Link"].values
        #return False

    def fetch_news(self, rss_url, do_print=True):
        feed = feedparser.parse(rss_url)
        entries = feed.entries

        for entry in entries:
            print(entry)

            title = entry.title
            article_str = BeautifulSoup(entry.summary, "lxml").get_text(separator=' ')
            if "<br /><br /><hr /><br /><br />" in entry.summary:
                  rss_summary = (entry.summary).split('<br /><br /><hr /><br /><br />')[0]
                  article_str = BeautifulSoup((entry.summary).split('<br /><br /><hr /><br /><br />')[1], "lxml").get_text(separator=' ')


            else:
              if do_print:
                print(f"Skipping empty body news of title: {title}")
                print("-" * 50)
              continue

            date = parse_date(entry.published_parsed)
            link = entry.link

            # do not process twice the same news
            if self.is_already_present(link):
              if do_print:
                print(f"Skipping already indexed news of title: {title}")
                print("-" * 50)
              continue

            article_text_extract = summarize_article(article_str, num_sentences = 20)
            summary = self.summarizer(article_text_extract, min_length=50, max_length=100)[0]["summary_text"]

            wav_link = None


            if self.tts != None:
              wav_file_name = title.replace(" ", "-")
              #.replace(".", "").replace("'", "").replace("(", "").replace(")", "").replace("?", "").replace(",", "").replace(":","") + ".wav"
              pattern = re.compile('[^a-zA-Z0-9\-]')
              wav_file_name = pattern.sub('', wav_file_name)
              wav_file_name = wav_file_name + '.wav'
              self.tts.tts_to_file(text=summary, speaker_wav = VOICE_SAMPLE_FILE_PATH, language="en", file_path=TTS_OUTPUT_DIR + wav_file_name)

            elif self.fakeyou != None:
              wav_obj = self.fakeyou.say(summary, FAKEYOU_MODEL_TOKEN)
              if wav_obj.status == "complete_success":
                wav_link = FAKEYOU_CDN_BASELINK + wav_obj.json["maybe_public_bucket_wav_audio_path"]

            model_output = self.embedder(article_text_extract)
            embedding = []
            for row in model_output[0]:
              embedding.append(row["score"])

            normalized_embedding = normalize_embedding(embedding)

            if do_print:
              print(f"Keys: {entry.keys()}")
              print(f"Title: {title}")
              print(f"Original summary field: {entry.summary}")
              print(f"RSS summary: {rss_summary}")
              print(f"article_str: {article_str}")
              print(f"article_text_extract: {article_text_extract}")
              print(f"Generated summary: {summary}")
              print(f"Generated embedding: {embedding}")
              print(f"Normalized embedding: {normalized_embedding}")
              #if "content" in entry.keys(): print(f"Content: {entry.content}")
              print(f"Date: {date}")
              print(f"Link: {link}")
              print("-" * 50)

            new_record = pd.DataFrame({'Link':link,'Title': title, 'Summary': summary, 'Article':article_text_extract, 'Date': date, 'Wav-link': wav_link, 'wav_file_name': wav_file_name,  'Embedding': [normalized_embedding]})
            self.df = pd.concat([self.df, new_record], ignore_index=True)
            #return

    def save_to_csv(self, output_file_path):
        #print(self.df)
        try:
          os.makedirs("/".join(collection_path.split("/")[:-1]))
        except:
          pass
        self.df.to_csv(output_file_path)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
from transformers import TextClassificationPipeline, DistilBertTokenizerFast, TFDistilBertForSequenceClassification

tokenizer_inference = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
trainer_model_inference = TFDistilBertForSequenceClassification.from_pretrained(model_path)

embedder_pipe = TextClassificationPipeline(model=trainer_model_inference, tokenizer=tokenizer_inference, return_all_scores=True)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/INDUSTRIAL_APPLICATIONS_TRAVEL_TALES/models/bbc-model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [5]:
#hf_name = "pszemraj/led-base-book-summary"
#hf_name = "pszemraj/pegasus-x-large-book-summary"
hf_name="facebook/bart-large-cnn"
summarization_pipe = pipeline("summarization", model=hf_name, device= 0 if torch.cuda.is_available() else -1,) #pipeline("summarization")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
news_scraper = NewsScraper(embedder_pipe, summarization_pipe, collection_path)


 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]
 | | > y
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2


100%|█████████▉| 1.87G/1.87G [00:27<00:00, 66.6MiB/s]
100%|██████████| 1.87G/1.87G [00:27<00:00, 67.2MiB/s]
100%|██████████| 4.37k/4.37k [00:00<00:00, 11.4kiB/s]
 27%|██▋       | 99.3k/361k [00:00<00:00, 785kiB/s]
100%|██████████| 361k/361k [00:00<00:00, 614kiB/s] 
100%|██████████| 32.0/32.0 [00:00<00:00, 49.6iB/s]
 77%|███████▋  | 5.96M/7.75M [00:00<00:00, 59.6MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.
 > Using model: xtts


100%|██████████| 7.75M/7.75M [00:10<00:00, 59.6MiB/s]

Loaded dataframe of 56 rows


In [29]:
#news_scraper.df = news_scraper.df[news_scraper.df['wav_file_name'].notna()]
news_scraper.df

Unnamed: 0,Link,Title,Summary,Article,Date,Embedding,Wav-link,wav_file_name
0,https://www.huffpost.com/entry/pop-tart-bowl-e...,Fans Can’t Get Enough Of Pop-Tart Football Mas...,Kansas State beat North Carolina State 28-19 i...,College football got a little weirder than usu...,2023-12-29 17:51:54,"[0.07408532811413193, 0.9689050052805375, 0.08...",,Fans-Can’t-Get-Enough-Of-Pop-Tart-Football-Mas...
1,https://www.huffpost.com/entry/cats-dogs-funny...,33 Of The Funniest Tweets About Cats And Dogs ...,"Each week at HuffPost, we scour Twitter X to f...",Woof — it’s been a long week. If you feel like...,2023-12-29 16:07:05,"[0.10619353414002064, 0.17583030138564332, 0.1...",,33-Of-The-Funniest-Tweets-About-Cats-And-Dogs-...
2,https://www.huffpost.com/entry/lauren-boebert-...,Colorado GOP Chair Says Lauren Boebert's Distr...,Rep. Lauren Boebert (R-Colo.) announced Wednes...,The chairman of Colorado’s Republican Party ...,2023-12-29 13:46:59,"[0.1844602374077812, 0.24227349313139107, 0.90...",,Colorado-GOP-Chair-Says-Lauren-Boebert's-Distr...
3,https://www.huffpost.com/entry/chris-christie-...,Chris Christie Says This Is The Real Reason Be...,Chris Christie says he doesn't believe Nikki H...,Chris Christie criticized his fellow Republic...,2023-12-29 12:20:40,"[0.1908717281004337, 0.34007979267291505, 0.87...",,Chris-Christie-Says-This-Is-The-Real-Reason-Be...
4,https://www.bbc.co.uk/news/uk-england-south-yo...,"Crash victim 'devoted his life to others', say...","Chris Marriott, 46, was hit by a car in Sheffi...",29 minutes ago About sharing Chris Marriott wo...,2023-12-29 18:58:34,"[0.11019047690618171, 0.8927539188998495, 0.33...",,"Crash-victim-'devoted-his-life-to-others',-say..."
5,https://www.bbc.co.uk/news/world-europe-678433...,Ukraine war: At least 30 killed in biggest Rus...,More than 160 people were injured as Russia hi...,24 minutes ago About sharing Watch: Odesa cler...,2023-12-29 17:46:42,"[0.40483987403662186, 0.3972525657690741, 0.62...",,Ukraine-war:-At-least-30-killed-in-biggest-Rus...
6,https://www.bbc.co.uk/news/world-us-canada-677...,Can Donald Trump still run for president after...,Donald Trump disqualified from running as a pr...,20 minutes ago About sharing Mr Trump faces se...,2023-12-29 14:33:30,"[0.11186714605195681, 0.09556231207458032, 0.9...",,Can-Donald-Trump-still-run-for-president-after...
7,https://www.bbc.co.uk/news/health-67840758?at_...,One dead following cheese recall over E. coli,Four types of Mrs Kirkham's cheese were recall...,1 hour ago About sharing Four types of Mrs Kir...,2023-12-29 17:55:44,"[0.16056629889053406, 0.801760002979207, 0.251...",,One-dead-following-cheese-recall-over-E-coli.wav
8,https://www.bbc.co.uk/news/uk-67802959?at_medi...,Blackpool Tower fire: Five other times people ...,Police flying over Blackpool Tower in a helico...,27 minutes ago About sharing By Andre Rhoden-P...,2023-12-29 14:15:20,"[0.09784782516099813, 0.838472331475987, 0.115...",,Blackpool-Tower-fire:-Five-other-times-people-...
9,https://www.bbc.co.uk/news/uk-67843958?at_medi...,UK sending more air defence missiles to Ukrain...,"About 200 weapons will be provided, the Minist...",10 minutes ago About sharing By Harrison Jones...,2023-12-29 18:49:21,"[0.0986799197908025, 0.0607838318264644, 0.988...",,UK-sending-more-air-defence-missiles-to-Ukrain...


In [7]:
# rss_url is the URL of the RSS feed
#rss_url = "https://morss.it/:clip/feeds.bbci.co.uk/news/rss.xml"#"https://feeds.bbci.co.uk/news/rss.xml"#'https://www.ilsole24ore.com/rss/mondo--europa.xml'

print("Fetching new data...")
rss_url_list =[
    "https://morss.it/:clip/https://www.huffpost.com/section/us-news/feed",
    "https://morss.it/:clip/feeds.bbci.co.uk/news/rss.xml",
    "https://morss.it/:clip/https://www.huffpost.com/section/business/feed",
    "https://morss.it/:clip/https://www.huffpost.com/section/world-news/feed",
    "https://morss.it/:clip/https://www.huffpost.com/section/celebrity/feed",
    "https://morss.it/:clip/https://www.huffpost.com/section/media/feed",
    "https://morss.it/:clip/https://www.huffpost.com/section/tv/feed",
    "https://morss.it/:clip/https://www.huffpost.com/section/money/feed",
    "https://morss.it/:clip/https://www.huffpost.com/section/worklife/feed",
    "https://morss.it/:clip/https://www.huffpost.com/section/travel/feed",
    "https://morss.it/:clip/https://www.huffpost.com/section/style/feed",
    "https://morss.it/:clip/https://www.theguardian.com/sport/blog/rss"

]

for url in rss_url_list:
  news_scraper.fetch_news(url)


Fetching new data...
{'title': "Fantasia Barrino Opens Up About Losing 'Everything' After Winning 'American Idol'", 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'https://morss.it/:clip/https://www.huffpost.com/section/us-news/feed', 'value': "Fantasia Barrino Opens Up About Losing 'Everything' After Winning 'American Idol'"}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://www.huffpost.com/entry/fantasia-barrino-american-idol-lost-everything_n_65913539e4b0bf73e176bb39'}, {'length': '0', 'type': 'image/jpeg', 'href': 'https://img.huffingtonpost.com/asset/659139512400003b0027eaf3.jpeg?cache=3qn3t89j6s&ops=224_126', 'rel': 'enclosure'}], 'link': 'https://www.huffpost.com/entry/fantasia-barrino-american-idol-lost-everything_n_65913539e4b0bf73e176bb39', 'published': 'Sun, 31 Dec 2023 12:25:06 -0500', 'published_parsed': time.struct_time(tm_year=2023, tm_mon=12, tm_mday=31, tm_hour=17, tm_min=25, tm_sec=6, tm_wday=6, tm_yday=365, tm_isdst=0), 'id': 'h

Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors


 > Processing time: 16.12610125541687
 > Real-time factor: 0.7478978055736623
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: Fantasia Barrino Opens Up About Losing 'Everything' After Winning 'American Idol'
Original summary field: "The Color Purple" star described herself as a "little girl" from High Point, North Carolina, who knew "nothing about the industry."<br /><br /><hr /><br /><br /><section class="entry__content-list js-entry-content js-cet-subunit"><p><a href="https://www.huffpost.com/topic/fantasia-barrino">Fantasia Barrino</a> is detailing the challenges she faced in the years after her “<a href="https://www.huffpost.com/topic/american-idol">American Idol</a>” win.</p><p>“<a href="https://www.huffpost.com/topic/the-color-purple">The Color Purple</a>” star <a href="https://people.com/fantasia-barrino-on-post-american-idol-struggles-exclusive-8420500">told People magazine



 > Text splitted to sentences.
['U.S. military says one ship reported being struck by a missile late Saturday.', 'Two Navy destroyers responded to the call for help, and the Denmark-owned vessel was reportedly seaworthy.', 'Hours later, four Houthi boats fired at the same ship and tried to board, Central Command said.']
 > Processing time: 12.675684213638306
 > Real-time factor: 0.5164122537298693
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: Yemen's Houthi Rebels Show No Sign Of Ending 'Reckless' Red Sea Attacks, U.S. Commander Says
Original summary field: There have been about two dozen attacks on international shipping by the Houthis since Oct. 19.<br /><br /><hr /><br /><br /><section class="entry__content-list js-main-content-list" id="entry-body"><section class="entry__content-list js-entry-content js-cet-subunit"><p>CHRISTIANSTED, U.S. Virgin Islands (AP) — <a href="https:



 > Text splitted to sentences.
['Israeli strikes in central Gaza kill at least 35 people Sunday, hospital officials said.', 'Israel expanded its offensive to central Gaza this week, targeting a belt of dense, built-up communities.', 'Israel has faced stiff resistance from Hamas since it began its ground offensive in late October.']
 > Processing time: 13.05405879020691
 > Real-time factor: 0.5190606991042412
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: Israeli Strikes In Central Gaza Kill At Least 35 As Netanyahu Says War Will Continue For Months
Original summary field: Israel's prime minister continues to resist international calls for a cease-fire.<br /><br /><hr /><br /><br /><section class="entry__content-list js-main-content-list" id="entry-body"><section class="entry__content-list js-entry-content js-cet-subunit"><p>DEIR AL-BALAH, Gaza Strip (AP) — Israeli strikes in centr



 > Text splitted to sentences.
['Jeremy Renner dropped in on the Renown Regional Medical Center in Reno, Nevada, to thank the people who saved his life.', 'The harrowing accident certainly required the quick thinking and dedicated efforts of professionals.', 'Renner, who also sings, plans to mark the first anniversary of the accident with a song, “Wait,” which will be available to stream and purchase Jan. 1.']
 > Processing time: 15.80508041381836
 > Real-time factor: 0.5138692628705364
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: Jeremy Renner Surprises Reno Hospital 1 Year After Near-Fatal Snowplow Accident
Original summary field: The actor suffered a near-fatal accident on New Year's Day 2023 that left him with more than 30 broken bones.<br /><br /><hr /><br /><br /><section class="entry__content-list js-entry-content js-cet-subunit"><p><a href="https://www.huffpost.com/topic



 > Text splitted to sentences.
['Paula Abdul is suing former “American Idol’s” Nigel Lythgoe for alleged sexual assault.', 'The suit was obtained by multiple media outlets and detailed several incidents.', 'Abdul claimed Lyth goe “verbally insulted and belittled her” in a meeting about the “Idol” job.']
 > Processing time: 12.505569696426392
 > Real-time factor: 0.501184698807329
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: Paula Abdul Sues ‘American Idol’ Producer Nigel Lythgoe For Alleged Sexual Assault
Original summary field: The celebrity judge claimed Lythgoe attempted to sexually assault her on more than one occasion and also groped her assistant.<br /><br /><hr /><br /><br /><section class="entry__content-list js-entry-content js-cet-subunit"><p><a href="https://www.huffpost.com/topic/paula-abdul">Paula Abdul</a> is suing former “American Idol” and “So You Think You Can D



 > Text splitted to sentences.
['The Kansas City Chiefs tight end discussed the special Christmas present during an episode of his “New Heights” podcast.', 'Kelce said that Austin Swift, who attended the Chiefs’ Christmas Day football game dressed as Santa Claus, made him “feel like a child.”']
 > Processing time: 9.265850305557251
 > Real-time factor: 0.47901192708928225
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: Travis Kelce Shares The Thoughtful Christmas Gift Taylor Swift's Brother Gave Him
Original summary field: The Kansas City Chiefs tight end was clearly impressed by Austin Swift's special present.<br /><br /><hr /><br /><br /><section class="entry__content-list js-entry-content js-cet-subunit"><p><a href="https://www.huffpost.com/topic/travis-kelce">Travis Kelce</a> is raving over his gift from <a href="https://www.huffpost.com/entertainment/topic/taylor-swift">Taylor



 > Text splitted to sentences.
['Whoopi Goldberg and Billy Crystal became visibly emotional at the Kennedy Center Honors.', 'Goldberg paid tribute to their late friend, Robin Williams, during her speech celebrating Crystal.', 'As a trio, Crystal, Goldberg and Williams co-hosted multiple “Comic Relief” telethons.']
 > Processing time: 12.33214783668518
 > Real-time factor: 0.5024164495057725
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: Whoopi Goldberg, Billy Crystal Pay Tribute To 'Brother' Robin Williams
Original summary field: The actors paid tribute to their late friend at the Kennedy Center Honors, which took place Dec. 3.<br /><br /><hr /><br /><br /><section class="entry__content-list js-entry-content js-cet-subunit"><p><a href="https://www.huffpost.com/entertainment/topic/whoopi-goldberg">Whoopi Goldberg</a> and <a href="https://www.huffpost.com/topic/billy-crystal">Billy 



 > Text splitted to sentences.
['There is a retail crime wave terrorizing Americans — if you’re tuned in to conservative media.', 'Fox and outlets like The New York Post dutifully cover   every dramatic video of a robbery.', 'Data suggests that, unless you live in New York City or a handful of others, shoplifting has been declining.']
 > Processing time: 14.370686054229736
 > Real-time factor: 0.514418600963281
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: The Shoplifting Epidemic Touted By Fox News Is Likely Overblown, Data Shows
Original summary field: Don't let dramatic videos of retail theft lead you to believe that crime is running rampant everywhere we look.<br /><br /><hr /><br /><br /><section class="entry__content-list js-entry-content js-cet-subunit"><p>There is a retail crime wave terrorizing Americans — if you’re tuned in to conservative media.</p><p>It can be difficu



 > Text splitted to sentences.
['Ariana Grande described 2023 as “one of the most transformative, most challenging, and yet happiest and most special years of my life” She went on to note that she’d be “reacting to things that deserve my energy only and removing and protecting myself from things that do not”']
 > Processing time: 12.775734424591064
 > Real-time factor: 0.530046632865387
Keys: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'id', 'guidislink', 'comments', 'content', 'summary'])
Title: Ariana Grande Opens Up About 'Most Challenging' Year In Cryptic Instagram Post
Original summary field: Many were quick to interpret the post as Grande's response to the online frenzy surrounding her relationship with "Wicked" co-star, Ethan Slater.<br /><br /><hr /><br /><br /><section class="entry__content-list js-entry-content js-cet-subunit"><p><a href="https://www.huffpost.com/entertainment/topic/ariana-grande">Ariana Grande</a> is reflecting on a 

Token indices sequence length is longer than the specified maximum sequence length for this model (1530 > 1024). Running this sequence through the model will result in indexing errors


RuntimeError: ignored

In [32]:
news_scraper.save_to_csv(collection_path)

In [11]:
from ast import literal_eval

def f(x):
    x = str(x)
    if "," not in x:
      x = x.replace(" ", ", ")
    try:
        return literal_eval(str(x))
    except Exception as e:
        print(e)
        print("given x:" , x)
        return []

conv = {'Embedding': lambda x: f(x)}
dfFoo = pd.read_csv(collection_path, converters=conv, index_col=0)

In [12]:
type(dfFoo["Embedding"][0])

list

## Show stored news embeddings

In [14]:
from ast import literal_eval

def f(x):
    x = str(x)
    if "," not in x:
      x = x.replace(" ", ", ")
    try:
        return literal_eval(str(x))
    except Exception as e:
        print(e)
        print("given x:" , x)
        return []

conv = {'Embedding': lambda x: f(x)}
df= pd.read_csv(collection_path, converters=conv, index_col=0)

labels = df['Title']
embeddings = df['Embedding']

In [15]:
df

Unnamed: 0,Link,Title,Summary,Article,Date,Embedding,Wav-link,wav_file_name
0,https://www.bbc.co.uk/news/uk-england-manchest...,Storm Gerrit: Roofs blown off as 'tornado' str...,"Roofs were torn off houses, trees blew down a...",1 hour ago About sharing Heavy rain and wind c...,2023-12-28 10:27:15,[],,
1,https://www.bbc.co.uk/news/uk-scotland-6783247...,Thousands of homes still without power after S...,Storm Gerrit brought blizzards and flooding to...,10 minutes ago About sharing Properties have b...,2023-12-28 11:19:26,[],,
2,https://www.bbc.co.uk/news/uk-67773394?at_medi...,'My elderly father found a new partner - then ...,"Vincent died in June this year, six months aft...","""I remember one conversation where she was sho...",2023-12-28 00:48:47,[],,
3,https://www.bbc.co.uk/news/world-latin-america...,Taylor Swift fan died of heat exhaustion at Ri...,Ana Clara Benevides Machado died of heat exhau...,49 minutes ago About sharing Fans tried to shi...,2023-12-28 10:40:47,[],,
4,https://www.bbc.co.uk/news/world-middle-east-6...,Israel Gaza war: Israel warns Hezbollah and Le...,Benny Gantz said the IDF would intervene if th...,1 hour ago About sharing Fighting on the Israe...,2023-12-28 09:55:16,[],,
5,https://www.huffpost.com/entry/joe-biden-feder...,'Stunning Diversity': How Joe Biden Reshaped T...,Biden fell behind Donald Trump's number of con...,Advertisement After a breakneck pace of confir...,2023-12-28 10:45:05,[],,
6,https://www.huffpost.com/entry/joe-biden-nikki...,Joe Biden Has Blunt 4-Word Reply To Nikki Hale...,President Joe Biden offered a simple response ...,President Joe Biden had a simple response to...,2023-12-28 10:26:49,[],,
7,https://www.huffpost.com/entry/nikki-haley-civ...,Nikki Haley Fails To Mention Slavery When Aske...,Republican presidential candidate Nikki Haley ...,"COLUMBIA, S.C. (AP) — Republican presidential ...",2023-12-28 09:14:00,[],,
8,https://www.huffpost.com/entry/obit-herb-kohl_...,"Herb Kohl, Former U.S. Senator And Owner Of Th...",Kohl was a popular figure in Wisconsin. He bou...,Advertisement Kohl was a popular figure in Wis...,2023-12-28 00:02:20,[],,
9,https://www.huffpost.com/entry/immigration-may...,Democratic Mayors Renew Pleas For Federal Help...,Texas Gov. Greg Abbott’s busing operation has ...,Greg Abbott’s busing operation has transport...,2023-12-27 23:07:34,[],,


In [17]:
labels

0     Storm Gerrit: Roofs blown off as 'tornado' str...
1     Thousands of homes still without power after S...
2     'My elderly father found a new partner - then ...
3     Taylor Swift fan died of heat exhaustion at Ri...
4     Israel Gaza war: Israel warns Hezbollah and Le...
5     'Stunning Diversity': How Joe Biden Reshaped T...
6     Joe Biden Has Blunt 4-Word Reply To Nikki Hale...
7     Nikki Haley Fails To Mention Slavery When Aske...
8     Herb Kohl, Former U.S. Senator And Owner Of Th...
9     Democratic Mayors Renew Pleas For Federal Help...
10    2 Models Of Apple Watch Can Go On Sale Again, ...
11    Trader Joe’s Illegally Fired Union Supporter, ...
12    California Poultry Supplier Illegally Employed...
13    Spotify Axes 17% Of Workforce In Third Round O...
14    New Unions Have Won Historic Elections. Winnin...
15    U.S. Announces What Could Be Final Military Ai...
16    Heat Exhaustion Killed Taylor Swift Fan Attend...
17    On Foot And By Donkey Cart, Thousands Flee

In [16]:
embeddings

0                                                    []
1                                                    []
2                                                    []
3                                                    []
4                                                    []
5                                                    []
6                                                    []
7                                                    []
8                                                    []
9                                                    []
10                                                   []
11                                                   []
12                                                   []
13                                                   []
14                                                   []
15                                                   []
16                                                   []
17                                              

In [18]:
labels_array = []
embeddings_array=[]
for i in range(len(embeddings)):
  labels_array.append(i)
  #labels_array.append(labels[i])
  embeddings_array.append(embeddings[i])

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Assuming you have a numpy array 'embeddings' with shape (num_articles, 5)
# Replace this with your actual data
# For example, if you have a list of embeddings, you can convert it to a numpy array like this:
# embeddings = np.array(list_of_embeddings)

# Create and fit PCA model
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings_array)

# Plot the 2D embeddings
plt.figure(figsize=(10, 6))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)

# Add labels and title
plt.title('2D PCA of News Article Embeddings')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

# Add labels to each point
for i, label in enumerate(labels_array):
    plt.annotate(label, (embeddings_2d[i, 0], embeddings_2d[i, 1]))

# Show the plot
plt.show()

ValueError: ignored

# TTS Example

In [None]:
!pip install gTTS > 0

In [None]:
summaries = df['Summary']
summaries

In [None]:
import pyttsx3

def text_to_speech(text):
    """
    Convert the given text to speech and play it.

    :param text: The text to convert to speech.
    """
    engine = pyttsx3.init()

    # Adjust speech rate (speed)
    rate = engine.getProperty('rate')
    engine.setProperty('rate', rate - 50)

    # Adjust volume
    volume = engine.getProperty('volume')
    engine.setProperty('volume', volume + 0.9)

    engine.say(text)
    engine.runAndWait()

if __name__ == "__main__":
    input_text = "Hello, this is a sample text. Feel free to replace it with your own text."
    text_to_speech(input_text)
