# IT News Project

The project aims to analyze IT news articles and recommend related articles in order to identify patterns and trends. This could help in understanding certain topics or discovering emerging themes in current events.

# Part 1: Scraping and Crawling

We will scrape articles from multiple websites. Each website has its own code snippet, because the method used to retrieve the articles is adapted to that website.

#### Code to execute before the others (all the other scraping snippets can then be executed individually)

In [None]:
#packages to install the first time the notebook is launched on Google Colab
!pip install python-dotenv beautifulsoup4 python-dateutil

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [1]:
#to execute each time (imports used everywhere)
import csv
import json
import time

import dateutil
import dateutil.parser
import pandas as pd
import requests
from bs4 import BeautifulSoup


### New York Times Articles
New York Times has its own API so we will use it. More specifically we will use the article search API.

#### Execute this code first to create the .env file for the NYT API key

**If you want to retrieve New York Times articles**


In [None]:
#never hard-code the NYT API key in the notebook: anything committed here is public
#(a key that was committed earlier should be revoked and regenerated on the NYT developer portal).
#The key is taken from the NYT_API_KEY environment variable when set, otherwise it is
#asked for interactively without being echoed.
import getpass
import os

nyt_api_key = os.environ.get("NYT_API_KEY") or getpass.getpass("NYT API key: ")
with open('.env', 'w') as f:
    f.write(f"API_KEY={nyt_api_key.strip()}\n")

Main Code for New York Times (This takes around 10 mins)

In [None]:
import os
import urllib3
from dotenv import load_dotenv
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#load the variables stored in the .env file (API_KEY) into the process environment
load_dotenv()

####################
####PARAMETERS######
####################
API_KEY = os.getenv('API_KEY')  # NYT API key read from the environment; None when API_KEY is not set
NYT_base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'# base URL for the NYT article search API

# function to send a request to the NYT API
def send_request(page):
    """Fetch one page of NYT article search results for the query 'technology'.

    Args:
        page: Page index sent as the API's ``page`` parameter (the caller
            starts at 0).

    Returns:
        The decoded JSON payload, or None when the request fails, returns an
        HTTP error status, or the body is not valid JSON.
    """
    params = {
        'q': 'technology',
        'api-key': API_KEY,
        'page': page
    }

    try:
        # certificate verification is left on (requests' default) because the API key
        # travels in the query string; the timeout keeps a stalled connection from
        # blocking the whole crawl
        response = requests.get(NYT_base_url, params=params, timeout=30)
        response.raise_for_status()  # raise an error for HTTP issues
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching data for page {page}: {e}")
        return None
    finally:
        # wait after EVERY call, failed ones included, so a burst of errors cannot
        # exceed the rate limit: 12 s between calls = at most 5 requests per minute
        time.sleep(12)

# function to check if the article is well structured
def is_valid(article):
    """Return True when *article* has a ``headline`` mapping containing a ``main`` key.

    A missing, null, or non-mapping headline (e.g. a plain string) yields False
    instead of raising or matching a substring by accident.
    """
    headline = article.get('headline') if isinstance(article, dict) else None
    return isinstance(headline, dict) and 'main' in headline

# function to parse the response data into a DataFrame
def parse_response(response):
    """Convert one NYT article search payload into a DataFrame.

    Args:
        response: Decoded JSON payload; articles are read from
            ``response['response']['docs']``.

    Returns:
        A DataFrame with the columns URL, Title, Author, Publication Date,
        Content and Keywords. Articles that fail ``is_valid`` or have no
        ``web_url`` are skipped. A payload without ``docs`` yields an empty
        frame that still carries the columns.
    """
    columns = ['URL', 'Title', 'Author', 'Publication Date', 'Content', 'Keywords']
    rows = []

    # the `or` fallbacks cover both a missing key and an explicit JSON null
    docs = ((response or {}).get('response') or {}).get('docs') or []
    for article in docs:
        if not is_valid(article):
            continue

        url = article.get('web_url')
        if not url:
            # URL is the de-duplication key downstream, so an article without one is unusable
            continue

        byline = article.get('byline') or {}
        pub_date = article.get('pub_date')
        # keep only the keywords tagged as subjects
        subjects = [
            keyword['value']
            for keyword in article.get('keywords') or []
            if keyword.get('name') == 'subject' and keyword.get('value')
        ]

        rows.append({
            'URL': url,
            'Title': article['headline']['main'],
            'Author': byline.get('original') or "No Author Found",
            'Publication Date': dateutil.parser.parse(pub_date).date() if pub_date else None,
            'Content': "",  # empty: the API response is not used for full text
            'Keywords': ", ".join(subjects) if subjects else "No Keywords Found",  # comma-separated string
        })

    return pd.DataFrame(rows, columns=columns)

# function to collect articles from multiple pages and save them as a single CSV
def get_NYT_multiple_pages(nb_pages, csv_path="/content/nyt_tech_articles.csv"):
    """Fetch several result pages from the NYT API and save them as one CSV.

    Args:
        nb_pages: Number of pages to request (page indexes 0 .. nb_pages - 1).
        csv_path: Destination CSV file. Defaults to the Colab path used by the
            notebook; pass another path when running elsewhere.

    Pages that fail to download are reported and skipped. Rows sharing a URL
    are collapsed to the first occurrence before saving.
    """
    page_frames = []

    #loop through multiple pages to fetch articles
    for page_num in range(nb_pages):
        print(f"Fetching page {page_num + 1}...")
        response = send_request(page_num)
        if response is None:
            print(f"Skipping page {page_num + 1} due to errors.")
            continue
        page_frames.append(parse_response(response))

    #concatenate once at the end instead of re-copying the accumulated frame on every page
    if page_frames:
        all_articles = pd.concat(page_frames, ignore_index=True)
    else:
        all_articles = pd.DataFrame()

    #check and remove if any duplicate URLs
    if 'URL' in all_articles.columns:
        initial_count = len(all_articles)
        all_articles = all_articles.drop_duplicates(subset='URL').reset_index(drop=True)
        final_count = len(all_articles)
        print(f"Removed {initial_count - final_count} duplicate articles based on URL.")
    else:
        print("No 'URL' column found. Skipping duplicate check.")

    #save the final DataFrame as a single CSV file
    all_articles.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"Data saved to {csv_path}")


# specify the number of pages to scrape
get_NYT_multiple_pages(50)


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
Fetching page 31...
Fetching page 32...
Fetching page 33...
Fetching page 34...
Fetching page 35...
Fetching page 36...
Fetching page 37...
Fetching page 38...
Fetching page 39...
Fetching page 40...
Fetching page 41...
Fetching page 42...
Fetching page 43...
Fetching page 44...
Fetching page 45...
Fetching page 46...
Fetching page 47...
Fetching page 48...
Fetching page 49...
Fetching page 50...
Removed 0

### CNN Tech Articles
We will scrape the articles from the Tech page of CNN. The CNN sitemap doesn't keep any tech-related articles, and scraping/crawling /search is forbidden (as stated in robots.txt). This takes around 15 s (only one page to crawl).

In [None]:

####################
####PARAMETERS######
####################
CNN_base_url = "https://edition.cnn.com/business/tech" #Tech section listing page; "?page=N" is appended to it
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}#browser-like User-Agent sent with every request
CNN_url = "https://edition.cnn.com" #site root, prepended to relative hrefs to build absolute article URLs

#function to extract article URLs from CNN's Business Tech section
def get_cnn_tech_article_urls(page_num=1):
    """Collect the article URLs linked from one page of CNN's Business Tech section.

    Args:
        page_num: Value sent as the ``page`` query parameter.

    Returns:
        A set of absolute article URLs (the set removes duplicates); empty
        when the page cannot be retrieved.
    """
    import re  # local import: only this function needs it

    #the previous filter was the literal "/2024/", which stops matching anything once the
    #year changes; match any /YYYY/MM/DD/ date segment in the path instead
    dated_path = re.compile(r"/\d{4}/\d{2}/\d{2}/")

    # construct the URL for the page
    url = f"{CNN_base_url}?page={page_num}"
    article_links = set()

    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve page {page_num} at {url}: {e}")
        return article_links

    #check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve page {page_num} at {url}")
        return article_links

    soup = BeautifulSoup(response.content, "html.parser")

    #keep dated links that sit under "/tech/" (tech-related articles)
    for link in soup.find_all("a", href=True):  #only <a> tags that have an href attribute
        href = link["href"]
        if dated_path.search(href) and "/tech/" in href:
            full_url = href if href.startswith("http") else CNN_url + href
            article_links.add(full_url)

    return article_links

#function to extract article text from JSON-LD script tag
def extract_text_from_json_ld(url):
    """Return the ``articleBody`` text found in the page's JSON-LD metadata.

    Every ``<script type="application/ld+json">`` tag is inspected. Tags that
    are empty or contain malformed JSON are skipped instead of raising.

    Args:
        url: Address of the article page.

    Returns:
        The first ``articleBody`` value found, or "" when the page cannot be
        retrieved or no JSON-LD entry carries an article body.
    """
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve page at {url}: {e}")
        return ""

    if response.status_code != 200:
        print(f"Failed to retrieve page at {url}")
        return ""

    soup = BeautifulSoup(response.content, "html.parser")
    for json_ld_tag in soup.find_all("script", type="application/ld+json"):
        raw = json_ld_tag.string
        if not raw:
            # .string is None for an empty tag; json.loads(None) would raise TypeError
            continue
        try:
            json_data = json.loads(raw)
        except json.JSONDecodeError:
            continue

        #JSON-LD may be a single object or a list of objects
        candidates = json_data if isinstance(json_data, list) else [json_data]
        for item in candidates:
            if isinstance(item, dict) and "articleBody" in item:
                return item["articleBody"]

    print(f"No article body found in JSON-LD for {url}")
    return ""

#function to scrape the full text of an article
def scrape_CNN_article(url):
    """Download a CNN article and extract its title, author, date and text.

    Returns a dict with the keys URL, Title, Author, Publication Date and
    Content. If anything goes wrong while downloading or parsing, the error is
    printed and every field except URL is None.
    """
    failed = {'URL': url, 'Title': None, 'Author': None, 'Publication Date': None, 'Content': None}

    try:
        html = requests.get(url, headers=headers).content
        page = BeautifulSoup(html, "html.parser")

        #page title, with a placeholder when the tag is absent
        title_tag = page.find('title')
        if title_tag:
            title = title_tag.text.strip()
        else:
            title = "No Title Found"

        #author comes from the <meta name="author"> tag
        author_tag = page.find('meta', attrs={'name': 'author'})
        if author_tag:
            author = author_tag['content'].strip()
        else:
            author = "No Author Found"

        #publication timestamp is reduced to a calendar date
        published_tag = page.find('meta', attrs={'property': 'article:published_time'})
        if published_tag:
            date = dateutil.parser.parse(published_tag['content']).date()
        else:
            date = "No Date Found"

        #article text: one line per <p> inside the article__content container
        body = page.find('div', attrs={'class': 'article__content'})
        lines = [p.text.strip() for p in body.find_all('p') if p.text]
        content = "\n".join(lines)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return failed

    return {'URL': url, 'Title': title, 'Author': author, 'Publication Date': date, 'Content': content}

#function to save articles to a CSV file
def save_to_csv(articles, filename="cnn_tech_articles.csv"):
    """Write the article dicts to *filename* as a UTF-8 CSV with a header row.

    Each dict may only use the keys URL, Title, Author, Publication Date and
    Content; these become the CSV columns, in that order.
    """
    columns = ["URL", "Title", "Author", "Publication Date", "Content"]
    with open(filename, mode="w", newline="", encoding="utf-8") as csv_file:
        out = csv.DictWriter(csv_file, fieldnames=columns)
        out.writeheader()
        out.writerows(articles)

def scrape_CNN_multiple_pages(num_pages=1):
    """Crawl the CNN tech listing pages 1..num_pages and save the scraped articles.

    At most 100 of the collected URLs are scraped, one per second, and the
    results are written with save_to_csv using its default file name.
    """
    found = set()
    for page in range(1, num_pages + 1):
        found.update(get_cnn_tech_article_urls(page))

    #cap the crawl at 100 articles
    selected = list(found)[:100]

    scraped = []
    for link in selected:
        scraped.append(scrape_CNN_article(link))
        time.sleep(1)  #pause to avoid overwhelming the server

    save_to_csv(scraped)
    print("Articles saved to cnn_tech_articles.csv")

#run the scraper
#(I only fetch the first page because CNN doesn't store more than the current articles on the main tech page)
scrape_CNN_multiple_pages(1)

### TechCrunch
TechCrunch is a news website dedicated to technology news. Therefore, we will crawl the latest-news pages and scrape each article listed on them. This snippet takes around 10 mins (on my machine).

*TechCrunch also provides a sitemap, so the crawling could have been done that way too.*

In [None]:
####################
####PARAMETERS######
####################
TC_base_url = "https://techcrunch.com/latest/page/" #base URL of the latest-news listing; the page number and a trailing "/" are appended
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}#browser-like User-Agent sent with every request

def fetch_page(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or an HTTP error status (via raise_for_status).
    """
    #without a timeout a stalled connection would block the crawl indefinitely
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.text

#function to parse article links from the page
def parse_article_links(page_html):
    """Extract the article URLs from the HTML of one listing page.

    Args:
        page_html: HTML source of the listing page.

    Returns:
        The href values of the title links, in document order, excluding
        links that contain '/podcast/'.
    """
    soup = BeautifulSoup(page_html, 'html.parser')

    #href=True skips title anchors that have no href; previously such an anchor made the
    #'/podcast/' membership test raise TypeError on None
    title_links = soup.find_all('a', class_='loop-card__title-link', href=True)

    #exclude links that contain '/podcast/'
    return [link['href'] for link in title_links if '/podcast/' not in link['href']]

#function to scrape the full text of an article
def scrape_TC_article(url):
    """Download a TechCrunch article and extract its title, author, date and text.

    Returns a dict with the keys URL, Title, Author, Publication Date and
    Content. If anything goes wrong while downloading or parsing, the error is
    printed and every field except URL is None.
    """
    #class attribute value used to locate the article body container
    body_class = 'entry-content wp-block-post-content is-layout-constrained wp-block-post-content-is-layout-constrained'
    failed = {'URL': url, 'Title': None, 'Author': None, 'Publication Date': None, 'Content': None}

    try:
        html = requests.get(url, headers=headers).content
        page = BeautifulSoup(html, "html.parser")

        #page title, with a placeholder when the tag is absent
        title_tag = page.find('title')
        if title_tag:
            title = title_tag.text.strip()
        else:
            title = "No Title Found"

        #author comes from the <meta name="author"> tag
        author_tag = page.find('meta', attrs={'name': 'author'})
        if author_tag:
            author = author_tag['content'].strip()
        else:
            author = "No Author Found"

        #publication timestamp is reduced to a calendar date
        published_tag = page.find('meta', attrs={'property': 'article:published_time'})
        if published_tag:
            date = dateutil.parser.parse(published_tag['content']).date()
        else:
            date = "No Date Found"

        #article text: one line per <p> inside the body container
        body = page.find('div', attrs={'class': body_class})
        lines = [p.text.strip() for p in body.find_all('p') if p.text]
        content = "\n".join(lines)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return failed

    return {'URL': url, 'Title': title, 'Author': author, 'Publication Date': date, 'Content': content}

#function to scrape multiple pages
def scrape_TC_multiple_pages(start_page=1, end_page=5):
    """Crawl the TechCrunch latest-news pages and save the articles to a CSV.

    Args:
        start_page: First listing page to crawl (inclusive).
        end_page: Last listing page to crawl (inclusive).

    Articles are written to tc_tech_articles.csv as they are scraped. Only
    articles with non-empty content are saved, and a URL that was already
    saved is not scraped again. A listing page that cannot be downloaded is
    reported and skipped instead of aborting the whole crawl.
    """
    csv_filename = "tc_tech_articles.csv"
    scraped_urls = set()  # URLs already saved, to avoid duplicates

    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['URL', 'Title', 'Author', 'Publication Date', 'Content'])
        writer.writeheader()

        for page_num in range(start_page, end_page + 1):
            page_url = f"{TC_base_url}{page_num}/"

            #fetch the page
            print(f"Fetching page: {page_url}")
            try:
                page_html = fetch_page(page_url)
            except requests.exceptions.RequestException as e:
                print(f"Skipping page {page_url}: {e}")
                continue

            #scrape content from each article linked on the page
            for article_url in parse_article_links(page_html):
                if article_url in scraped_urls:  #already saved: no request needed
                    continue

                article_data = scrape_TC_article(article_url)
                if article_data['Content']:  #only save if content is non-empty
                    writer.writerow(article_data)
                    scraped_urls.add(article_url)

                #wait after every article request; the pause used to sit in the `else` of the
                #duplicate check, so it only ran when NO request had been made
                time.sleep(1)

    print(f"Scraped articles are saved to {csv_filename}")

# Run the scraper
scrape_TC_multiple_pages(start_page=1, end_page=51)


### VentureBeat
The VentureBeat sitemap archives the site's news every day. So, we will crawl the main sitemap, which contains all the links to the daily archives, then crawl each archive and scrape every article whose URL contains one of the chosen keywords (I chose the keywords of all the IT-related categories on the main website). Every snippet sends request headers, but they were first added for VentureBeat, which is very strict with its scraping/crawling rules. This takes around 40 mins for 100 pages.

In [None]:
####################
####PARAMETERS######
####################
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}#browser-like User-Agent sent with every request
VB_sitemap_url = "https://venturebeat.com/sitemap.xml" #main sitemap, fetched first to list the archive sitemaps

#fetch main sitemap where all archives of the sitemap of everyday are stored
def fetch_sitemap(url):
    """Download the sitemap at *url* and return the text of every <loc> element."""
    sitemap_xml = requests.get(url, headers=headers).content
    parsed = BeautifulSoup(sitemap_xml, "xml")
    return [entry.text for entry in parsed.find_all("loc")]

#filter tech related articles (so filter with keywords and take out additional content such as pictures, etc)
def is_article_url(url):
    """Tell whether *url* points to an article in one of the tracked tech sections.

    URLs containing an asset marker (uploads folder or image extension) are
    rejected first; the remaining ones are accepted only if they contain one
    of the section path segments.
    """
    asset_markers = ("wp-content", ".png", ".jpg", ".webp")
    tech_sections = ("/ai/", "/data-infrastructure/", "/programming-development/", "/automation/", "/security/")

    for marker in asset_markers:
        if marker in url:
            return False

    for section in tech_sections:
        if section in url:
            return True

    return False

#scrape each article
def scrape_VB_article(url):
    """Download a VentureBeat article and extract its title, author, date and text.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys URL, Title, Author, Publication Date and Content.
        When the article body container is missing, Content is "" and the
        other fields are still filled in. If downloading or parsing raises,
        the error is printed and every field except URL is None.
    """
    try:
        #the timeout keeps one stalled connection from blocking the whole crawl
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")

        title = soup.find('h1', class_='article-title') or soup.find('title')
        title = title.text.strip() if title else "No Title Found"

        #.get() tolerates a <meta name="author"> tag without a content attribute
        author_meta = soup.find('meta', attrs={'name': 'author'})
        author = (author_meta.get('content') or "").strip() if author_meta else ""
        author = author or "No Author Found"

        #.get() tolerates a <time> tag without a datetime attribute
        time_tag = soup.find('time')
        published = time_tag.get('datetime') if time_tag else None
        date = dateutil.parser.parse(published).date() if published else "No Date Found"

        article_content = soup.find("div", class_="article-content")
        if article_content is None:
            #this check used to come AFTER article_content was dereferenced, so a missing
            #container raised AttributeError and this branch was unreachable
            print("Article content not found!")
            clean_text = ""
        else:
            #remove unwanted elements with the specified class names
            for unwanted in article_content.find_all(["div", "form"], class_=["post-boilerplate", "boilerplate-before", "boilerplate-after"]):
                unwanted.decompose()
            #extract only the text from the relevant section, ensuring it is cleaned up
            clean_text = article_content.get_text(separator="\n", strip=True)

        return {'URL': url, 'Title': title, 'Author': author, 'Publication Date': date, 'Content': clean_text}
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {'URL': url, 'Title': None, 'Author': None, 'Publication Date': None, 'Content': None}

#fetch a certain number of days of archive on the main sitemap
def fetch_daily_sitemaps(main_sitemap, num_days=100):
    """Return a new list with the first *num_days* entries of *main_sitemap*.

    Fewer entries are returned when the sitemap is shorter than *num_days*;
    a zero or negative *num_days* gives an empty list.
    """
    limit = min(num_days, len(main_sitemap))
    if limit <= 0:
        return []
    return list(main_sitemap[:limit])

def scrape_VB_multiple_pages(num_days=100, output_file="/content/vb_tech_articles.csv"):
    """Crawl the VentureBeat sitemap archives and save the tech articles to a CSV.

    Args:
        num_days: Number of archive sitemaps, taken from the start of the
            main sitemap, to crawl.
        output_file: Destination CSV file. Defaults to the Colab path used by
            the notebook; pass another path when running elsewhere.

    Only URLs accepted by is_article_url are scraped, and only articles with
    non-empty content are saved.
    """
    main_sitemap = fetch_sitemap(VB_sitemap_url)
    daily_sitemaps = fetch_daily_sitemaps(main_sitemap, num_days)

    #collect articles url
    all_urls = set()
    for daily_sitemap in daily_sitemaps:
        all_urls.update(fetch_sitemap(daily_sitemap))
        time.sleep(5)  #sleep to not overwhelm the server

    #filter URLs with the filtering method; sorted so the crawl order (and the CSV) is reproducible
    article_urls = sorted(url for url in all_urls if is_article_url(url))

    #scrape the articles one by one
    articles_data = []
    for url in article_urls:
        article_data = scrape_VB_article(url)
        if article_data['Content']:  # Save only when they have content
            articles_data.append(article_data)
        time.sleep(5)

    #explicit columns: with no articles the CSV still gets its header row, so the later
    #pd.read_csv of this file does not fail on an empty file
    columns = ['URL', 'Title', 'Author', 'Publication Date', 'Content']
    df = pd.DataFrame(articles_data, columns=columns)
    df.to_csv(output_file, index=False)
    print(f"Scraping complete. Data saved to {output_file}.")

#run the scraper
scrape_VB_multiple_pages(100)

Each dataframe has been created according to the source code of each website, and has been checked for duplicates. The delay is also adapted to the robots.txt of the website if indicated.

# Combine the CSVs (except for NYT) together

In [2]:
#CSV files to combine (the NYT file is deliberately left out)
#NOTE(review): the CNN and TechCrunch scrapers write to relative file names, so these
#absolute paths only match when the working directory is /content - confirm
csv_files = [
    "/content/cnn_tech_articles.csv",
    "/content/tc_tech_articles.csv",
    "/content/vb_tech_articles.csv"
]

#name of the combined CSV (relative path, written to the working directory)
output_file = "combined_csvs.csv"

#one DataFrame per source file
dataframes = []

#read the files in the order listed above
for file in csv_files:
    print(f"Reading {file}")
    #pd.read_csv raises if the file is missing, i.e. if that scraper was not run first
    df = pd.read_csv(file)
    dataframes.append(df)

#stack all DataFrames into one, renumbering the rows from 0
combined_df = pd.concat(dataframes, ignore_index=True)

#save the combined DataFrame to a new CSV file (without the index column)
combined_df.to_csv(output_file, index=False)
print(f"Combined CSV saved to {output_file}")


Reading /content/cnn_tech_articles.csv
Reading /content/tc_tech_articles.csv
Reading /content/vb_tech_articles.csv
Combined CSV saved to combined_csvs.csv


# Part 2: Data Preprocessing