<a href="https://colab.research.google.com/github/charu1605/web-scraping/blob/main/scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup

def get_article_text(url, timeout=10):
    '''Fetch a page and return the paragraph text of its Elementor content widgets.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block forever on a stalled connection.

    Returns:
        The text of every <p> found inside div.elementor-widget-container
        elements, joined by blank lines and stripped of surrounding
        whitespace; the string "No text found" when that text is empty or
        whitespace-only; or None when downloading or parsing raises (the
        error is printed, not re-raised).
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        soup = BeautifulSoup(response.text, 'lxml')

        content_divs = soup.find_all('div', class_="elementor-widget-container")

        # Extract text from all <p> tags inside these divs
        paragraphs = [p.get_text() for div in content_divs for p in div.find_all('p')]

        # Strip BEFORE testing for emptiness so whitespace-only content is
        # reported as "No text found" rather than returned as "".
        article_text = "\n\n".join(paragraphs).strip()
        return article_text if article_text else "No text found"

    except Exception as e:
        # Deliberately broad: this is a best-effort scraper, so any network or
        # parser failure is reported and signalled to the caller with None.
        print(f"Error: {e}")
        return None

def extract_articles(article_names_and_urls):
    '''Download every article listed in a name -> URL mapping.

    Each name is announced on stdout before its URL is passed to
    get_article_text. The returned dict maps every name to the fetched text,
    or to "Failed to extract article" when the fetch returned a falsy value.
    '''
    fallback = "Failed to extract article"
    collected = {}

    for title, link in article_names_and_urls.items():
        print(f"Extracting article: {title}")
        # A falsy result (None or an empty string) is replaced by the fallback.
        collected[title] = get_article_text(link) or fallback

    return collected

# Articles to scrape: display name -> page URL
article_names_and_urls = {
    "Trump": "https://scrapsfromtheloft.com/opinions/trump-and-the-decline-of-the-american-people/",
    "Jeffrey Sachs": "https://scrapsfromtheloft.com/opinions/jeffrey-sachs-can-there-be-peace-with-netanyahu-transcript/",
    "Kaja Kallas": "https://scrapsfromtheloft.com/opinions/the-pure-genius-of-kaja-kallas/",
    "Alberto Piroddi": "https://scrapsfromtheloft.com/opinions/why-do-the-poor-in-america-elect-the-rich/",
    "Massimo Cacciari": "https://scrapsfromtheloft.com/opinions/the-musks-the-machine-and-the-left-out-of-play/",
}

# Download the text of every listed article
extracted_articles = extract_articles(article_names_and_urls)

# Show the opening 500 characters of each article, followed by a rule
for name in extracted_articles:
    article_text = extracted_articles[name]
    print(f"\nArticle: {name}\n")
    print(article_text[:500])
    print(f"\n{'=' * 50}\n")


Extracting article: Trump
Extracting article: Jeffrey Sachs
Extracting article: Kaja Kallas
Extracting article: Alberto Piroddi
Extracting article: Massimo Cacciari

Article: Trump

by Pino Arlacchi

Trump’s inaugural address and his first executive orders typify the debut of a populist leader promising to rescue the people from their perceived plight and lead them toward a path of rebirth.

How many times have we seen this dismal spectacle? From Mussolini to Hitler, from Berlusconi to today’s xenophobic European leaders, countless figures have promised national greatness and prosperity through their personal charisma, only to collapse ignominiously—and often tragically—und



Article: Jeffrey Sachs

Judge Andrew Napolitano and Professor Jeffrey Sachs discussed key geopolitical issues on Judging Freedom, including Donald Trump’s executive order to declassify files related to the assassinations of JFK, RFK, and MLK, raising questions about transparency and potential revelations. They ex

In [None]:
def save_articles_to_files(articles):
    '''Write each article to its own UTF-8 text file in the current directory.

    Args:
        articles: Mapping of article name -> article text.

    The file name is built from the article name: spaces, "/" and "\\" become
    underscores; ":" and the other characters that are not allowed in Windows
    file names (< > " | ? * and control characters) are removed. A name that
    sanitizes to nothing is saved as "untitled.txt". A line reporting the
    chosen file name is printed for every article.
    '''
    import re  # local import: this cell does not otherwise import re

    for name, article_text in articles.items():
        # Sanitize the article title for use as a filename. Path separators
        # are replaced (not dropped) so the words on either side stay apart.
        stem = re.sub(r'[ /\\]', '_', name)
        stem = re.sub(r'[:<>"|?*\x00-\x1f]', '', stem)
        filename = f"{stem or 'untitled'}.txt"

        # Save the article text to a text file
        with open(filename, "w", encoding="utf-8") as file:
            file.write(article_text)

        print(f"Article '{name}' saved as '{filename}'")
# Reuse `extracted_articles` from the extraction cell above rather than
# downloading every page from the site a second time.

# Save articles to text files
save_articles_to_files(extracted_articles)

Extracting article: Trump
Extracting article: Jeffrey Sachs
Extracting article: Kaja Kallas
Extracting article: Alberto Piroddi
Extracting article: Massimo Cacciari
Article 'Trump' saved as 'Trump.txt'
Article 'Jeffrey Sachs' saved as 'Jeffrey_Sachs.txt'
Article 'Kaja Kallas' saved as 'Kaja_Kallas.txt'
Article 'Alberto Piroddi' saved as 'Alberto_Piroddi.txt'
Article 'Massimo Cacciari' saved as 'Massimo_Cacciari.txt'


In [None]:
import pickle
def save_articles_to_files_and_pickle(articles):
    '''Save each article as a text file and the whole mapping as a pickle.

    Args:
        articles: Mapping of article name -> article text.

    The per-article text files are written by save_articles_to_files (defined
    in an earlier cell) so the file-naming logic lives in one place. The full
    mapping is then pickled to "articles.pkl" in the current directory.
    '''

    # Save articles as .txt files
    save_articles_to_files(articles)

    # Save the articles as a pickle file
    with open("articles.pkl", "wb") as pickle_file:
        pickle.dump(articles, pickle_file)

    print("All articles saved as 'articles.pkl'")

# Reuse `extracted_articles` from the extraction cell above rather than
# downloading every page from the site yet again.

# Save articles to text files and pickle file
save_articles_to_files_and_pickle(extracted_articles)

Extracting article: Trump
Extracting article: Jeffrey Sachs
Extracting article: Kaja Kallas
Extracting article: Alberto Piroddi
Extracting article: Massimo Cacciari
Article 'Trump' saved as 'Trump.txt'
Article 'Jeffrey Sachs' saved as 'Jeffrey_Sachs.txt'
Article 'Kaja Kallas' saved as 'Kaja_Kallas.txt'
Article 'Alberto Piroddi' saved as 'Alberto_Piroddi.txt'
Article 'Massimo Cacciari' saved as 'Massimo_Cacciari.txt'
All articles saved as 'articles.pkl'


In [None]:
import pickle

# Read back the name -> text mapping stored in the pickle file
with open("articles.pkl", "rb") as file:
    articles = pickle.load(file)

# Show the opening 500 characters of every stored article, followed by a rule
for name in articles:
    article_text = articles[name]
    print(f"\nArticle: {name}\n")
    print(article_text[:500])
    print(f"\n{'=' * 50}\n")



Article: Trump

by Pino Arlacchi

Trump’s inaugural address and his first executive orders typify the debut of a populist leader promising to rescue the people from their perceived plight and lead them toward a path of rebirth.

How many times have we seen this dismal spectacle? From Mussolini to Hitler, from Berlusconi to today’s xenophobic European leaders, countless figures have promised national greatness and prosperity through their personal charisma, only to collapse ignominiously—and often tragically—und



Article: Jeffrey Sachs

Judge Andrew Napolitano and Professor Jeffrey Sachs discussed key geopolitical issues on Judging Freedom, including Donald Trump’s executive order to declassify files related to the assassinations of JFK, RFK, and MLK, raising questions about transparency and potential revelations. They examined the Ukraine war, predicting its imminent end due to anticipated U.S. withdrawal of support under a Republican Congress, and criticized the Biden administratio

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
import pickle

# Fetch the NLTK stop-word corpus. When the package is already installed,
# nltk.download only reports that it is up-to-date.
nltk.download('stopwords')

# Load the name -> text mapping that save_articles_to_files_and_pickle wrote
# to "articles.pkl". pickle.load can execute arbitrary code, so only point
# this at a pickle file produced by this notebook.
with open("articles.pkl", "rb") as file:
    articles = pickle.load(file)

# Build a set of English stop words (sets give fast membership tests);
# clean_article_text below reads this module-level name.
stop_words = set(stopwords.words('english'))

# Function to clean a single article text
def clean_article_text(article_text, stopword_set=None):
    '''Normalize raw article text into a space-separated string of content words.

    Args:
        article_text: Raw text of one article.
        stopword_set: Optional collection of lowercase stop words. Defaults to
            the module-level `stop_words` built earlier in this cell.

    Returns:
        The cleaned text: lowercased, without punctuation or digits, with
        stop words removed and words separated by single spaces.
    '''
    import unicodedata  # local import: needed for Unicode-aware punctuation checks

    if stopword_set is None:
        stopword_set = stop_words

    def _strip_punctuation(text):
        # Dashes become spaces so words joined by a hyphen or em dash are
        # split apart; every other punctuation mark is deleted. Unicode
        # categories are checked because string.punctuation is ASCII-only and
        # would leave curly quotes and typographic dashes in place.
        kept = []
        for ch in text:
            category = unicodedata.category(ch)
            if category == "Pd":
                kept.append(" ")
            elif category.startswith("P") or ch in string.punctuation:
                continue
            else:
                kept.append(ch)
        return "".join(kept)

    # Step 1: Convert text to lowercase
    article_text = article_text.lower()

    # Step 2: Remove punctuation (ASCII and Unicode)
    article_text = _strip_punctuation(article_text)

    # Step 3: Remove numerical values
    article_text = re.sub(r'\d+', '', article_text)

    # Step 4: Split on any whitespace (this also discards newlines and extra spaces)
    words = article_text.split()

    # Step 5: Remove stop words. Contractions such as "don't" have lost their
    # apostrophe by now, so the punctuation-free form of every stop word is
    # matched as well.
    blocked = set(stopword_set) | {_strip_punctuation(word) for word in stopword_set}
    filtered_words = [word for word in words if word not in blocked]

    # Join back into a cleaned text
    return " ".join(filtered_words)

# Run the cleaner over every article loaded from the pickle file
cleaned_articles = {
    title: clean_article_text(body)
    for title, body in articles.items()
}

# Show the opening 500 characters of each cleaned article, followed by a rule
for name in cleaned_articles:
    cleaned_text = cleaned_articles[name]
    print(f"\nArticle: {name}\n")
    print(cleaned_text[:500])
    print(f"\n{'=' * 50}\n")



Article: Trump

pino arlacchi trump’s inaugural address first executive orders typify debut populist leader promising rescue people perceived plight lead toward path rebirth many times seen dismal spectacle mussolini hitler berlusconi today’s xenophobic european leaders countless figures promised national greatness prosperity personal charisma collapse ignominiously—and often tragically—under weight economic crises wars betrayal interests initially supported true trump came power democratic elections however ri



Article: Jeffrey Sachs

judge andrew napolitano professor jeffrey sachs discussed key geopolitical issues judging freedom including donald trump’s executive order declassify files related assassinations jfk rfk mlk raising questions transparency potential revelations examined ukraine war predicting imminent end due anticipated us withdrawal support republican congress criticized biden administration’s handling conflict discussion also addressed instability benjamin netanyahu

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
import pickle

# Fetch the NLTK stop-word corpus. When the package is already installed,
# nltk.download only reports that it is up-to-date.
nltk.download('stopwords')

# Load the name -> text mapping that save_articles_to_files_and_pickle wrote
# to "articles.pkl". pickle.load can execute arbitrary code, so only point
# this at a pickle file produced by this notebook.
with open("articles.pkl", "rb") as file:
    articles = pickle.load(file)

# Build a set of English stop words (sets give fast membership tests);
# clean_article_text below reads this module-level name.
stop_words = set(stopwords.words('english'))

# Function to clean a single article text
def clean_article_text(article_text, stopword_set=None):
    '''Normalize raw article text into a space-separated string of content words.

    Args:
        article_text: Raw text of one article.
        stopword_set: Optional collection of lowercase stop words. Defaults to
            the module-level `stop_words` built earlier in this cell.

    Returns:
        The cleaned text: lowercased, without punctuation or digits, with
        stop words removed and words separated by single spaces.
    '''
    import unicodedata  # local import: needed for Unicode-aware punctuation checks

    if stopword_set is None:
        stopword_set = stop_words

    def _strip_punctuation(text):
        # Dashes become spaces so words joined by a hyphen or em dash are
        # split apart; every other punctuation mark is deleted. Unicode
        # categories are checked because string.punctuation is ASCII-only and
        # would leave curly quotes and typographic dashes in place.
        kept = []
        for ch in text:
            category = unicodedata.category(ch)
            if category == "Pd":
                kept.append(" ")
            elif category.startswith("P") or ch in string.punctuation:
                continue
            else:
                kept.append(ch)
        return "".join(kept)

    # Step 1: Convert text to lowercase
    article_text = article_text.lower()

    # Step 2: Remove punctuation (ASCII and Unicode)
    article_text = _strip_punctuation(article_text)

    # Step 3: Remove numerical values
    article_text = re.sub(r'\d+', '', article_text)

    # Step 4: Split on any whitespace (this also discards newlines and extra spaces)
    words = article_text.split()

    # Step 5: Remove stop words. Contractions such as "don't" have lost their
    # apostrophe by now, so the punctuation-free form of every stop word is
    # matched as well.
    blocked = set(stopword_set) | {_strip_punctuation(word) for word in stopword_set}
    filtered_words = [word for word in words if word not in blocked]

    # Join back into a cleaned text
    return " ".join(filtered_words)

# Clean every article, then assemble the corpus: one row per article
cleaned_articles = {
    title: clean_article_text(body)
    for title, body in articles.items()
}
corpus_df = pd.DataFrame(
    {
        'Article': list(cleaned_articles.keys()),
        'Text': list(cleaned_articles.values()),
    }
)

# Persist the corpus as CSV (without the row index)
corpus_df.to_csv("corpus.csv", index=False)

# Preview the first rows of the corpus
print("Corpus DataFrame:")
print(corpus_df.head())


Corpus DataFrame:
            Article                                               Text
0             Trump  pino arlacchi trump’s inaugural address first ...
1     Jeffrey Sachs  judge andrew napolitano professor jeffrey sach...
2       Kaja Kallas  marco travaglio among drunken pygmies socalled...
3   Alberto Piroddi  alberto piroddi phenomenon poor americans elec...
4  Massimo Cacciari  massimo cacciari “put musks charge” – seems en...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count term occurrences per article to build the Document-Term Matrix (DTM)
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(corpus_df['Text'])

# Wrap the counts in a DataFrame: one row per article, one column per term
dtm_df = pd.DataFrame(
    dtm.toarray(),
    index=corpus_df['Article'],
    columns=vectorizer.get_feature_names_out(),
)

# Persist the DTM as CSV
dtm_df.to_csv("document_term_matrix.csv")

# Preview the first rows of the DTM
print("\nDocument-Term Matrix:")
print(dtm_df.head())



Document-Term Matrix:
                  ability  absolutely  abstention  abstract  absurd  accident  \
Article                                                                         
Trump                   1           0           0         0       0         0   
Jeffrey Sachs           0           4           0         0       1         0   
Kaja Kallas             0           0           0         0       0         0   
Alberto Piroddi         1           0           0         1       0         1   
Massimo Cacciari        0           0           2         0       0         0   

                  accountability  accountable  achieve  achievements  ...  \
Article                                                               ...   
Trump                          0            0        0             0  ...   
Jeffrey Sachs                  0            0        1             0  ...   
Kaja Kallas                    0            0        0             0  ...   
Alberto Piroddi         