In [1]:
# Dependencies
import json
import requests
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline

# Method 1 - Data Collection   

In [None]:
def get_articles_from_category(category):
    # Define the API endpoint
    url = "https://en.wikipedia.org/w/api.php"

    # Define the parameters for the API call to get the list of articles in the category
    params_category = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": 100,  # Limit the number of articles returned, adjust as needed
        "cmtype": "page"  # Fetch only articles (pages), not subcategories or files
    }

    # Make the request to the Wikipedia API
    response = requests.get(url, params=params_category)
    data = response.json()

    # Extract the list of articles
    if "query" in data and "categorymembers" in data["query"]:
        articles = data["query"]["categorymembers"]

        # Prepare a list to hold article details
        articles_data = []

        # Loop through each article and fetch its description
        for article in articles:
            title = article['title']
            page_id = article['pageid']

            # Fetch the article description (extract)
            params_extract = {
                "action": "query",
                "format": "json",
                "prop": "extracts",
               "exintro": True,
                "explaintext": True,
                "pageids": page_id
            }

            response_extract = requests.get(url, params=params_extract)
            extract_data = response_extract.json()

            # Extract the description
            extract = extract_data['query']['pages'][str(page_id)].get('extract', 'No description available.')

            # Extract the full content
            content = extract_data['query']['pages'][str(page_id)].get('extract', 'No content available.')

            # Append the article data to the list
            articles_data.append({
                "Title": title,
                "Page ID": page_id,
                "Description": extract,
                "Content": content
            })

        # Convert the list to a DataFrame
        df = pd.DataFrame(articles_data)
        return df
    else:
        print(f"No articles found in category: {category}")
        return None

# Example usage:
category_name = "American Revolution"
df_articles = get_articles_from_category(category_name)

# Display the DataFrame
if df_articles is not None:
    print(df_articles)

                                   Title   Page ID  \
0                    American Revolution      1973   
1           Flag on Prospect Hill debate  76953807   
2   Template:American Revolution origins  17003234   
3                          Boston Caucus  36486735   
4                        Boston Massacre     82254   
..                                   ...       ...   
61                      Vermont Republic    538613   
62                     Verplanck's Point  53858573   
63                         Virginia Plan    318325   
64                      Worcester Revolt  74528965   
65                  Robert K. Wright Jr.    446795   

                                          Description  \
0   The American Revolution was a rebellion and po...   
1   According to tradition, the first flag of the ...   
2                                                       
3   The Boston Caucus was an informal political or...   
4   The Boston Massacre (known in Great Britain as...   
..       

In [None]:
# Save the DataFrame to a CSV file
df_articles.to_csv('american_revolution_articles.csv', index=False)

In [None]:
# check df_articles
df_articles.head()

Unnamed: 0,Title,Page ID,Description,Content
0,American Revolution,1973,The American Revolution was a rebellion and po...,The American Revolution was a rebellion and po...
1,Flag on Prospect Hill debate,76953807,"According to tradition, the first flag of the ...","According to tradition, the first flag of the ..."
2,Template:American Revolution origins,17003234,,
3,Boston Caucus,36486735,The Boston Caucus was an informal political or...,The Boston Caucus was an informal political or...
4,Boston Massacre,82254,The Boston Massacre (known in Great Britain as...,The Boston Massacre (known in Great Britain as...
...,...,...,...,...
61,Vermont Republic,538613,The Vermont Republic (French: République du Ve...,The Vermont Republic (French: République du Ve...
62,Verplanck's Point,53858573,Verplanck's Point lies at the southernmost end...,Verplanck's Point lies at the southernmost end...
63,Virginia Plan,318325,The Virginia Plan (also known as the Randolph ...,The Virginia Plan (also known as the Randolph ...
64,Worcester Revolt,74528965,"The Worcester Revolt, or Worcester Revolution ...","The Worcester Revolt, or Worcester Revolution ..."


# Data Preprocessing

## Summarization

In [None]:
# Imports for loading environment variables.
from transformers import pipeline

In [None]:
# Load a pre-trained BART-based summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def chunk_text(text, max_length=512):
    """
    Splits the text into chunks that fit within the model's maximum token limit.
    """
    sentences = text.split('. ')
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())
        if current_length + sentence_length <= max_length:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            chunks.append('. '.join(current_chunk) + '.')
            current_chunk = [sentence]
            current_length = sentence_length

    if current_chunk:
        chunks.append('. '.join(current_chunk) + '.')

    return chunks

def summarize_article(text):
    """
    Summarizes the provided text using the BART model.
    Handles long texts by splitting them into smaller chunks.
    """
    chunks = chunk_text(text, max_length=512)
    summary = ' '.join(summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text'] for chunk in chunks)
    return summary

# Read the CSV file containing articles
input_csv = 'american_revolution_articles.csv'
articles_df = pd.read_csv(input_csv)

# Summarize each article
summaries = []
for index, row in articles_df.iterrows():
    title = row['Title']
    content = row['Content']

    # Summarize the content
    try:
        summary = summarize_article(content)
    except Exception as e:
        print(f"Error summarizing article '{title}': {e}")
        summary = "Summary not available due to an error."

    summaries.append({"Title": title, "Summary": summary})

# Save the summaries to a new CSV file
output_csv = 'american_revolution_summaries.csv'
summaries_df = pd.DataFrame(summaries)
summaries_df.to_csv(output_csv, index=False)

print(f"Summaries saved to '{output_csv}'")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Your max_length is set to 150, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


Error summarizing article 'Template:American Revolution origins': 'float' object has no attribute 'split'


Your max_length is set to 150, but your input_length is only 124. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
Your max_length is set to 150, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


Error summarizing article 'Continental (currency)': 'float' object has no attribute 'split'


Your max_length is set to 150, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 150, but your input_length is only 86. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 150, but your input_length is only 65. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
Your max_length is set to 150, but your input_length is only 86. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your

Summaries saved to 'american_revolution_summaries.csv'


## Method 2 - Additional context & QA pairs

## Data Collection

In [2]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Step 1: Search for Categories Related to a Keyword
def search_categories(keyword):
    """
    Search for Wikipedia categories related to "American Revolution".
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": f"Category:{keyword}",
        "srlimit": 10,
    }
    response = requests.get(url, params=params)
    data = response.json()
    categories = []
    if 'query' in data and 'search' in data['query']:
        for result in data['query']['search']:
            categories.append(result['title'])
    return categories

# Step 2: Get Articles
def get_articles_from_category(category):
    """
    Fetches articles from a given Wikipedia category.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params_category = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": category,
        "cmlimit": 100,
        "cmtype": "page"
    }

    response = requests.get(url, params=params_category)
    data = response.json()
    articles_data = []

    if "query" in data and "categorymembers" in data["query"]:
        articles = data["query"]["categorymembers"]
        for article in articles:
            title = article['title']
            page_id = article['pageid']
            params_extract = {
                "action": "query",
                "format": "json",
                "prop": "extracts",
                "exintro": True,
                "explaintext": True,
                "pageids": page_id
            }
            response_extract = requests.get(url, params=params_extract)
            extract_data = response_extract.json()
            content = extract_data['query']['pages'][str(page_id)].get('extract', 'No content available.')
            articles_data.append({
                "Title": title,
                "Content": content
            })
    return articles_data

# Step 3: Data Cleaning and Preprocessing
def clean_text(text):
    """
    Removes special characters, digits, and stopwords, and tokenizes the text.
    """
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W+', ' ', text)  # Remove special characters and extra spaces
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Step 4: Generate Question-Answer Pairs
def generate_qa_pairs(text, title):
    """
    A simplified example of generating Q&A pairs.
    """
    questions = []
    answers = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        question = f"What is the significance of {title}?"
        answer = sentence
        questions.append(question)
        answers.append(answer)
    return list(zip(questions, answers))

# Step 5: Create a Q&A Corpus Based on Categories
def create_qa_corpus(keyword):
    """
    Creates a Q&A corpus based on categories related to a keyword.
    """
    categories = search_categories(keyword)
    qa_corpus = []

    for category in categories:
        articles = get_articles_from_category(category)
        for article in articles:
            title = article['Title']
            content = clean_text(article['Content'])
            qa_pairs = generate_qa_pairs(content, title)
            for question, answer in qa_pairs:
                qa_corpus.append({
                    "Title": title,
                    "Question": question,
                    "Answer": answer,
                    "Context": article['Content']  # Original unprocessed context
                })

    return qa_corpus

# Define keyword
keyword = "American Revolution"

# Example usage to create a Q&A corpus for the specified keyword
qa_corpus = create_qa_corpus(keyword)

# Step 6: Store the Corpus in a Structured Format (CSV)
df_qa_corpus = pd.DataFrame(qa_corpus)
df_qa_corpus.to_csv('american_revolution_qa_corpus.csv', index=False)

print("Q&A corpus created and saved to 'american_revolution_qa_corpus.csv'.")

Q&A corpus created and saved to 'american_revolution_qa_corpus.csv'.


In [7]:
# check df_qa_corpus
df_qa_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 427 entries, 0 to 426
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     427 non-null    object
 1   Question  427 non-null    object
 2   Answer    427 non-null    object
 3   Context   427 non-null    object
dtypes: object(4)
memory usage: 13.5+ KB


In [8]:
# check df_qa_corpus
df_qa_corpus.head()

Unnamed: 0,Title,Question,Answer,Context
0,American Revolution,What is the significance of American Revolution?,american revolution rebellion political moveme...,The American Revolution was a rebellion and po...
1,Flag on Prospect Hill debate,What is the significance of Flag on Prospect H...,according tradition first flag united states g...,"According to tradition, the first flag of the ..."
2,Boston Caucus,What is the significance of Boston Caucus?,boston caucus informal political organization ...,The Boston Caucus was an informal political or...
3,Boston Massacre,What is the significance of Boston Massacre?,boston massacre known great britain incident k...,The Boston Massacre (known in Great Britain as...
4,Brown Bess,What is the significance of Brown Bess?,brown bess nickname uncertain origin british a...,"""Brown Bess"" is a nickname of uncertain origin..."
