In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# ⚙️ Settings
pd.set_option('display.max_columns', None) # display all columns
import warnings
warnings.filterwarnings('ignore') # ignore warnings

In [98]:
#Cogemos varios links de busquedas en google news y analizamos su estructura
"""
https://news.google.com/rss/search?q=<your_topic>+after:<YYYY-MM-DD>+before:<YYYY-MM-DD>&hl=<language_code>&gl=<country_code>&ceid=<country_code>:<language_code>
https://news.google.com/search?q=apple%20leak%20when%3A1y&hl=en-US&gl=US&ceid=US%3Aen
https://news.google.com/topstories?hl=en-US&gl=US&ceid=US:en
https://news.google.com/rss/search?q=intitle:AAPL+when:1h&hl=en-US&gl=US&ceid=US:en
"""

#Sacamos los parametros que queremos para personalizar la busqueda
language = "en"
country = "US"
keywords = "apple leak"
keywords = keywords.replace(" ", "%20")
site = "forbes.com"
start_date = "2023-01-01"
end_date = "2023-12-31"

link = f"https://news.google.com/rss/search?q={keywords}+after:{start_date}+before:{end_date}+site:{site}&hl={language}&gl={country}&ceid={country}:{language}"

In [99]:
#Comprobamos que el link funciona
print(link)

https://news.google.com/rss/search?q=leak+after:2023-01-01+before:2023-12-31+site:macrumors.com&hl=en&gl=US&ceid=US:en


In [22]:
response = requests.get(link)
response.status_code

200

In [26]:
html = response.text
soup = BeautifulSoup(html, 'html.parser')

In [57]:
#Sacamos lista de todos los titulares y fechas de publicación de las noticias
all_titles = []
all_dates = []

news = soup.find_all("item")
for element in news:
    all_titles.append(element.find("title").get_text())
    all_dates.append(element.find("pubdate").get_text())
    

In [58]:
all_dates

['Sat, 15 Apr 2023 07:00:00 GMT',
 'Tue, 27 Jun 2023 07:00:00 GMT',
 'Sat, 06 May 2023 07:00:00 GMT',
 'Sat, 22 Apr 2023 07:00:00 GMT',
 'Sun, 01 Jan 2023 08:00:00 GMT',
 'Sat, 13 May 2023 07:00:00 GMT',
 'Tue, 01 Aug 2023 07:00:00 GMT',
 'Fri, 11 Aug 2023 07:00:00 GMT',
 'Mon, 22 May 2023 07:00:00 GMT',
 'Tue, 02 May 2023 07:00:00 GMT',
 'Tue, 31 Jan 2023 08:00:00 GMT',
 'Sat, 11 Mar 2023 08:00:00 GMT',
 'Tue, 12 Sep 2023 07:00:00 GMT',
 'Tue, 22 Aug 2023 07:00:00 GMT',
 'Tue, 02 May 2023 07:00:00 GMT',
 'Sat, 04 Feb 2023 08:00:00 GMT',
 'Fri, 19 May 2023 07:00:00 GMT',
 'Wed, 04 Oct 2023 07:00:00 GMT',
 'Wed, 25 Oct 2023 07:00:00 GMT',
 'Tue, 15 Aug 2023 07:00:00 GMT',
 'Sun, 10 Sep 2023 07:00:00 GMT',
 'Mon, 10 Jul 2023 07:00:00 GMT',
 'Thu, 12 Jan 2023 08:00:00 GMT',
 'Sun, 12 Feb 2023 08:00:00 GMT',
 'Wed, 31 May 2023 07:00:00 GMT',
 'Tue, 10 Jan 2023 08:00:00 GMT',
 'Thu, 27 Jul 2023 07:00:00 GMT',
 'Sun, 02 Jul 2023 07:00:00 GMT',
 'Sun, 27 Aug 2023 07:00:00 GMT',
 'Sat, 25 Feb 

In [61]:
#Limpiamos las fechas para quedarnos solo con día, mes y año
for i in range(len(all_dates)):
    all_dates[i] = all_dates[i][5:16]

all_dates

['15 Apr 2023',
 '27 Jun 2023',
 '06 May 2023',
 '22 Apr 2023',
 '01 Jan 2023',
 '13 May 2023',
 '01 Aug 2023',
 '11 Aug 2023',
 '22 May 2023',
 '02 May 2023',
 '31 Jan 2023',
 '11 Mar 2023',
 '12 Sep 2023',
 '22 Aug 2023',
 '02 May 2023',
 '04 Feb 2023',
 '19 May 2023',
 '04 Oct 2023',
 '25 Oct 2023',
 '15 Aug 2023',
 '10 Sep 2023',
 '10 Jul 2023',
 '12 Jan 2023',
 '12 Feb 2023',
 '31 May 2023',
 '10 Jan 2023',
 '27 Jul 2023',
 '02 Jul 2023',
 '27 Aug 2023',
 '25 Feb 2023',
 '01 Sep 2023',
 '22 Aug 2023',
 '13 Jan 2023',
 '27 Jul 2016',
 '12 Feb 2023',
 '01 Apr 2023',
 '20 Mar 2023',
 '19 Jul 2023',
 '13 Aug 2023',
 '26 Mar 2023',
 '16 Jun 2023',
 '02 Mar 2023',
 '26 Jul 2023',
 '11 Dec 2023',
 '22 Jul 2023',
 '25 Mar 2023',
 '19 Apr 2023',
 '06 Mar 2023',
 '10 Sep 2023',
 '07 Jul 2023',
 '01 Aug 2023',
 '22 Jan 2023',
 '09 Sep 2023',
 '03 Jan 2023',
 '25 Feb 2023',
 '08 Feb 2023',
 '15 Oct 2023',
 '03 Nov 2023',
 '19 Dec 2023',
 '09 Jul 2023',
 '07 Aug 2023',
 '29 Jun 2023',
 '04 Jul

In [77]:
#Creamos un diccionario con las noticias con el que crearemos el df
all_google_news = {
    "Date": all_dates,
    "News": all_titles
}
news_df = pd.DataFrame(data=all_google_news)
news_df.head()

Unnamed: 0,Date,News
0,15 Apr 2023,New iPhone 15 Pro Leak Claims Scrapped Action ...
1,27 Jun 2023,Apple Insider Reveals Price Cut Plans For iPho...
2,06 May 2023,Apple Warned About USB-C Ahead Of iPhone 15 Re...
3,22 Apr 2023,Apple Leak Details New iPhone 15 Pro Action Bu...
4,01 Jan 2023,Insider Claims iPhone 15 Will Be ‘Aggressively...


In [78]:
#Pasamos las fechas de string a formato fecha
news_df["Date"] = pd.to_datetime(news_df['Date'])
news_df.head()

Unnamed: 0,Date,News
0,2023-04-15,New iPhone 15 Pro Leak Claims Scrapped Action ...
1,2023-06-27,Apple Insider Reveals Price Cut Plans For iPho...
2,2023-05-06,Apple Warned About USB-C Ahead Of iPhone 15 Re...
3,2023-04-22,Apple Leak Details New iPhone 15 Pro Action Bu...
4,2023-01-01,Insider Claims iPhone 15 Will Be ‘Aggressively...


In [79]:
#Ordenamos por fecha
news_df = news_df.sort_values(by="Date", ascending=True).reset_index(drop=True)
news_df

In [80]:
#Quitamos la fecha anterior a 2023
news_df = news_df[news_df["Date"] >= "2023-01-01"]
news_df

Unnamed: 0,Date,News
1,2023-01-01,Insider Claims iPhone 15 Will Be ‘Aggressively...
2,2023-01-03,Apple Watch Ultra: Brilliant Design Upgrade Le...
3,2023-01-10,Apple Insider Leaks $100 Price Increase For iP...
4,2023-01-12,"Apple AirPods Lite For $99 Coming Soon, New Le..."
5,2023-01-13,Apple Watch Ultra 2: New Leak Hints At Dazzlin...
...,...,...
95,2023-12-11,The New Look Of Samsung’s Galaxy S24 Ultra - F...
96,2023-12-17,MacBook Pro Leak Reveals OLED Upgrade Delay - ...
97,2023-12-19,New iOS 17.3 Leak Reveals Stunning iPhone Feat...
98,2023-12-29,"Apple Loop: iPhone 16 Details, MacBook Air Pri..."


In [91]:
#Buscamos días con más de 1 noticia
news_df.Date.value_counts().head(16)

Date
2023-07-09    2
2023-02-12    2
2023-09-10    2
2023-08-27    2
2023-08-26    2
2023-05-19    2
2023-08-25    2
2023-08-22    2
2023-03-11    2
2023-03-06    2
2023-02-25    2
2023-11-03    2
2023-12-11    2
2023-08-01    2
2023-05-02    2
2023-08-13    1
Name: count, dtype: int64

In [81]:
#Guardamos el df en un CSV
news_df.to_csv("Apple_news_2023.csv", index=False)