In [2]:
#!pip install pandas torch transformers spacy
#!python -m spacy download en_core_web_md
#!python -m spacy download en_core_web_trf

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
import os
import pandas as pd
from transformers import pipeline
import spacy

In [2]:
print("--- Step 1: Loading Dataset ---")

# Define the path to your dataset folder
data_path = "/content/drive/MyDrive/Colab Notebooks/bbc-fulltext/bbc"

# A list to hold the data from each file
data = []

# Check if the path exists to avoid errors
if not os.path.exists(data_path):
    print(f"Error: The directory '{data_path}' was not found.")
    print("Please make sure the 'bbc' folder is in the same directory as your script.")
else:
    # Loop through each category folder (business, sport, etc.)
    for category in os.listdir(data_path):
        category_path = os.path.join(data_path, category)
        if os.path.isdir(category_path):
            # Loop through each .txt file in the category folder
            for filename in os.listdir(category_path):
                if filename.endswith(".txt"):
                    file_path = os.path.join(category_path, filename)
                    try:
                        # Try to read the file with utf-8 encoding
                        with open(file_path, 'r', encoding='utf-8') as f:
                            text = f.read()
                    except UnicodeDecodeError:
                        # If that fails, fall back to latin-1 encoding
                        with open(file_path, 'r', encoding='latin-1') as f:
                            text = f.read()

                    data.append({'category': category, 'text': text})

    # Create the final DataFrame
    df = pd.DataFrame(data)
    print(f"Dataset loaded successfully with {len(df)} articles.")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())

--- Step 1: Loading Dataset ---
Dataset loaded successfully with 2225 articles.

First 5 rows of the dataset:
  category                                               text
0    sport  Connors boost for British tennis\n\nFormer wor...
1    sport  Henman overcomes rival Rusedski\n\nTim Henman ...
2    sport  Bellamy fined after row\n\nNewcastle have fine...
3    sport  Collins calls for Chambers return\n\nWorld 100...
4    sport  Robben sidelined with broken foot\n\nChelsea w...


In [3]:
# ==============================================================================
# Step 2 : Classify All Categories into Sub-Categories
# ==============================================================================

print("\n--- Step 2 (Updated): Classifying All Categories into Sub-Categories ---")

# --- Initialize Zero-Shot Classifier
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0) # Added device=0 to explicitly use GPU

# --- Define Your Sub-Categories ---
business_subcategories = ["stock market", "company news", "mergers and acquisitions", "economy", "market analysis"]
entertainment_subcategories = ["cinema and film", "theatre and stage", "music", "literature and books", "celebrity news", "television"]
sports_subcategories = ["cricket", "football", "rugby", "athletics", "tennis", "formula one", "Olympics"]
tech_subcategories = ["computers and hardware", "software and gaming", "mobile and gadgets", "internet and social media", "artificial intelligence", "tech industry"]
politics_subcategories = ["uk domestic policy", "elections and campaigns", "government and parliament", "international politics", "political parties"]

# Define a maximum chunk size based on the model's limit (BART's is typically 1024)
MAX_CHUNK_SIZE = 512 # Using a smaller size to be safe and allow for potential tokenization overhead

# --- Classification Function ---
def classify_subcategory(text, category):
    """
    Classifies a text into a sub-category based on its main category by
    splitting long texts into chunks.
    """
    candidate_labels = []
    if category == 'business':
        candidate_labels = business_subcategories
    elif category == 'entertainment':
        candidate_labels = entertainment_subcategories
    elif category == 'sport':
        candidate_labels = sports_subcategories
    elif category == 'tech':
        candidate_labels = tech_subcategories
    elif category == 'politics':
        candidate_labels = politics_subcategories
    else:
        return 'N/A'

    if not candidate_labels:
        return 'N/A'

    # Split text into chunks if it's too long
    # A simple split by space might break words, a more sophisticated approach
    # would use the tokenizer, but this is a reasonable starting point.
    words = text.split()
    chunks = [' '.join(words[i:i + MAX_CHUNK_SIZE]) for i in range(0, len(words), MAX_CHUNK_SIZE)]

    results = []
    for chunk in chunks:
        try:
            # Perform the classification on each chunk
            result = zero_shot_classifier(chunk, candidate_labels=candidate_labels, multi_label=False)
            results.append(result)
        except Exception as e:
            print(f"Error classifying chunk: {e}")
            # Optionally, handle errors per chunk, e.g., skip or assign a default

    # Combine results from chunks
    # This is a simple approach: find the label that appears most frequently
    # or has the highest cumulative score across chunks.
    # For simplicity, let's take the label from the first successful chunk.
    if results:
        return results[0]['labels'][0]
    else:
        return 'Classification Error' # Indicate failure if no chunks were classified

# --- Apply the Function to the DataFrame ---
print("Applying sub-category classification to the full dataset... (This is slow)")
# Applying the function directly to the DataFrame can still be very slow.
df['sub_category'] = df.apply(lambda row: classify_subcategory(row['text'], row['category']), axis=1)
print("Sub-category classification complete.")

# --- Review the new results ---
print("\nSub-category counts for all categories:")
print(df['sub_category'].value_counts())

# You can also view the breakdown per category
print("\nBreakdown for Politics:")
print(df[df['category'] == 'politics']['sub_category'].value_counts())

print("\nBreakdown for Tech:")
print(df[df['category'] == 'tech']['sub_category'].value_counts())


--- Step 2 (Updated): Classifying All Categories into Sub-Categories ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Applying sub-category classification to the full dataset... (This is slow)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Sub-category classification complete.

Sub-category counts for all categories:
sub_category
company news                 250
football                     181
political parties            139
rugby                        132
economy                      126
music                        115
cinema and film              115
tech industry                111
mobile and gadgets            96
government and parliament     91
tennis                        86
athletics                     84
uk domestic policy            79
market analysis               71
television                    67
internet and social media     67
celebrity news                65
computers and hardware        64
software and gaming           62
international politics        55
elections and campaigns       53
stock market                  34
mergers and acquisitions      29
Olympics                      17
theatre and stage             15
formula one                   10
literature and books           9
cricket          

In [4]:
# ==============================================================================
# Step 3: Extract Personalities and Their Jobs
# ==============================================================================

# --- Load the transformer-based spaCy model ---
print("\n--- Step 3: Extracting Personalities and Jobs ---")
print("Loading the spaCy transformer model (en_core_web_trf)...")
try:
    nlp = spacy.load("en_core_web_trf")
except OSError:
    print("\nError: 'en_core_web_trf' model not found.")
    print("Please run 'python -m spacy download en_core_web_trf' in your terminal.")
    # As a fallback, use the large model if the transformer is not available
    try:
        print("Falling back to 'en_core_web_lg' model...")
        nlp = spacy.load("en_core_web_lg")
    except OSError:
        print("Error: 'en_core_web_lg' also not found. Please download it.")
        nlp = None # Set nlp to None to skip this step if models are missing

# Define more comprehensive job keywords
job_keywords = {
    'Politician': ['minister', 'mp', 'government', 'chancellor', 'president', 'prime minister', 'senator', 'mayor', 'party leader', 'secretary of state'],
    'TV/Film Personality': ['actor', 'actress', 'director', 'producer', 'host', 'presenter', 'film', 'tv', 'star', 'screenwriter', 'comedian'],
    'Musician': ['singer', 'musician', 'band', 'artist', 'songwriter', 'guitarist', 'pianist', 'drummer', 'composer', 'rapper']
}

def extract_personalities_improved(text):
    """
    Finds names of people and tries to identify their job using a more
    robust search logic across the entire article.
    """
    if nlp is None:
        return "Skipped: spaCy model not loaded."

    doc = nlp(text)

    # First, find all unique people mentioned in the article
    # We use a set to avoid duplicates like "Tony Blair" and "Mr Blair" being treated differently later
    people_entities = {ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"}

    if not people_entities:
        return "None Found"

    personalities = {}
    sentences = list(doc.sents)

    # For each unique person, search the whole text for their job
    for person_name in people_entities:
        job_found = "Unknown"
        # Use the last name for a more flexible search (e.g., "Blair" for "Tony Blair")
        last_name = person_name.split()[-1]

        # 3. Iterate through all sentences to find a match
        for sentence in sentences:
            # Check if the person's last name is in the sentence
            if last_name in sentence.text:
                # If the name is found, check for a job keyword in the same sentence
                for job, keywords in job_keywords.items():
                    if any(keyword in sentence.text.lower() for keyword in keywords):
                        job_found = job
                        break # Stop searching for jobs for this person
            if job_found != "Unknown":
                break # Stop searching through sentences for this person

        personalities[person_name] = job_found

    return personalities

# --- Apply the function to the DataFrame ---
# This will be slower than before because the model is much larger.
print("Applying personality extraction...")
df['personalities'] = df['text'].apply(extract_personalities_improved)
print("Personality extraction complete.")

# --- Review the results ---
print("\nExamples of extracted personalities:")
# Filter to show rows where at least one personality was found
print(df[df['personalities'].apply(lambda x: isinstance(x, dict) and len(x) > 0)][['category', 'personalities']].head())


--- Step 3: Extracting Personalities and Jobs ---
Loading the spaCy transformer model (en_core_web_trf)...
Applying personality extraction...
Personality extraction complete.

Examples of extracted personalities:
  category                                      personalities
0    sport  {'Jimmy': 'TV/Film Personality', 'Greg Rusedsk...
1    sport  {'Igor Andreev': 'Politician', 'Greg Rusedski'...
2    sport  {'Graeme Souness': 'Unknown', 'Shepherd': 'Unk...
3    sport  {'Dwain': 'Politician', 'Dwain Chambers': 'Pol...
4    sport  {'Robben': 'Unknown', 'Mokoena': 'Unknown', 'O...


In [5]:
# Step 4: Summarize Articles Mentioning "April"
# ==============================================================================

print("\n--- Step 4 (Fixed): Summarizing 'April' Articles ---")

# Filter the DataFrame to get only articles containing the word "April"
print("Filtering for articles mentioning 'April'...")
april_df = df[df['text'].str.contains("April", case=False)].copy()
print(f"Found {len(april_df)} articles mentioning 'April'.")

if not april_df.empty:
    # Initialize the summarization pipeline (if not already done)
    print("Initializing summarization model...")
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    # --- Summarization Function ---
    def generate_summary_fixed(text):
        """
        Generates a summary for a given text, first truncating it to a safe
        length to prevent model errors.
        """
        try:
            # The model's token limit is 1024. A token is roughly 4 characters.
            # Truncating to 4000 characters is a safe way to stay under the limit.
            truncated_text = text[:4000]

            # Generate the summary using the truncated text
            summary = summarizer(truncated_text, max_length=120, min_length=30, do_sample=False)
            return summary[0]['summary_text']
        except Exception as e:
            # This will catch any other unexpected errors
            return f"Could not summarize: {e}"

    # Apply summarization function
    print("Applying summarization with text truncation... (This may take some time)")
    april_df['summary'] = april_df['text'].apply(generate_summary_fixed)
    print("Summarization complete.")

    # --- Review the April Summaries ---
    print("\n--- Summaries of April Articles (First 3) ---")
    pd.set_option('display.max_colwidth', 400) # To see more of the summary
    print(april_df[['category', 'summary']].head(3))
else:
    print("No articles mentioning 'April' were found in the dataset.")


--- Step 4 (Fixed): Summarizing 'April' Articles ---
Filtering for articles mentioning 'April'...
Found 105 articles mentioning 'April'.
Initializing summarization model...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Applying summarization with text truncation... (This may take some time)
Summarization complete.

--- Summaries of April Articles (First 3) ---
   category  \
13    sport   
15    sport   
33    sport   

                                                                                                                                                                                                                                                                                                             summary  
13   Sonia O'Sullivan has indicated that she would like to participate in the World Cross Country Championships in St Etienne . The 35-year-old Cobh runner may be included in the official line-up for the event in France on 19-20 March . O’Sullivan is currently preparing for the London marathon on 17 April .  
15            Kenya lift Susan Chepkemei's suspension after she makes an official apology . Two-time London Marathon runner-up was suspended for failing to turn up to a cros

In [6]:
# Step 5: Review Final Results
# ==============================================================================
print("\n--- Step 5: Final Review of the DataFrame ---")
print("Showing the first 5 rows with all new columns added:")
pd.set_option('display.max_colwidth', 100) # Reset column width
df.head()


--- Step 5: Final Review of the DataFrame ---
Showing the first 5 rows with all new columns added:


Unnamed: 0,category,text,sub_category,personalities
0,sport,Connors boost for British tennis\n\nFormer world number one Jimmy Connors is planning a long-ter...,tennis,"{'Jimmy': 'TV/Film Personality', 'Greg Rusedski': 'Politician', 'Tim Henman': 'Unknown', 'John C..."
1,sport,Henman overcomes rival Rusedski\n\nTim Henman saved a match point before fighting back to defeat...,tennis,"{'Igor Andreev': 'Politician', 'Greg Rusedski': 'Politician', 'Henman': 'Politician', 'Rusedski'..."
2,sport,Bellamy fined after row\n\nNewcastle have fined their Welsh striker Craig Bellamy two weeks' wag...,football,"{'Graeme Souness': 'Unknown', 'Shepherd': 'Unknown', 'Souness': 'Unknown', 'Craig Bellamy': 'Unk..."
3,sport,Collins calls for Chambers return\n\nWorld 100m champion Kim Collins says suspended sprinter Dwa...,athletics,"{'Dwain': 'Politician', 'Dwain Chambers': 'Politician', 'Chambers': 'Politician', 'Collins': 'Po..."
4,sport,Robben sidelined with broken foot\n\nChelsea winger Arjen Robben has broken two metatarsal bones...,football,"{'Robben': 'Unknown', 'Mokoena': 'Unknown', 'Olivier Dacourt': 'Unknown', 'David Beckham': 'Unkn..."


In [8]:
april_df.head(12)

Unnamed: 0,category,text,sub_category,personalities,summary
13,sport,O'Sullivan could run in Worlds\n\nSonia O'Sullivan has indicated that she would like to particip...,athletics,"{'Maria McCambridge': 'Unknown', 'O'Sullivan': 'Politician', 'Sonia O'Sullivan': 'Politician', '...",Sonia O'Sullivan has indicated that she would like to participate in the World Cross Country Ch...
15,sport,Kenya lift Chepkemei's suspension\n\nKenya's athletics body has reversed a ban on marathon runne...,athletics,"{'Edith Masai': 'Politician', 'Kiplagat': 'Unknown', 'Chepkemei': 'Politician', 'Isaiah Kiplagat...",Kenya lift Susan Chepkemei's suspension after she makes an official apology . Two-time London M...
33,sport,Henin-Hardenne beaten on comeback\n\nJustine Henin-Hardenne lost to Elena Dementieva in a comeba...,Olympics,"{'Nathalie Dechy': 'Politician', 'Dementieva': 'Unknown', 'Elena Dementieva': 'Unknown', 'Henin-...",The Australian Open champion is back in action for the first time since April . He has been out...
41,sport,Tindall aiming to earn Lions spot\n\nBath and England centre Mike Tindall believes he can make t...,rugby,"{'Gavin Henson': 'Politician', 'Clive Woodward': 'Unknown', 'Tindall': 'Unknown', 'Brian O'Drisc...","Mike Tindall has been out of action since December, having damaged both his shoulder and his fo..."
88,sport,London hope over Chepkemei\n\nLondon Marathon organisers are hoping that banned athlete Susan Ch...,athletics,"{'Paula Radcliffe': 'Unknown', 'Chepkemei': 'Politician', 'David Bedford': 'TV/Film Personality'...",Susan Chepkemei has been suspended from all competition until the end of the year . The 29-year...
114,sport,Radcliffe yet to answer GB call\n\nPaula Radcliffe has been granted extra time to decide whether...,athletics,"{'Paula': 'Politician', 'Hayley Yelling': 'Unknown', 'Paula Radcliffe': 'Politician', 'Zara Hyde...",Radcliffe has been granted extra time to decide whether to compete in the World Cross-Country C...
125,sport,Munster Cup tie switched to Spain\n\nMunster's Heineken Cup quarter-final tie against Biarritz o...,rugby,{'Marcel Martin': 'Unknown'},Munster's Heineken Cup quarter-final against Biarritz is moved to Real Sociedad's Paseo de Anoe...
169,sport,Owen delighted with Real display\n\nMichael Owen revelled in his return to the to the Real Madri...,football,"{'Ronaldo': 'Unknown', 'Owen': 'TV/Film Personality', 'Ivan Helguera': 'Unknown', 'Robert Carlos...",Michael Owen delighted with Real Madrid's 3-1 win over Real Betis on Wednesday . Owen started h...
263,sport,O'Sullivan commits to Dublin race\n\nSonia O'Sullivan will seek to regain her title at the Bupa ...,athletics,"{'Craig Mottram': 'Politician', 'O'Sullivan': 'Unknown', 'Sonia O'Sullivan': 'Unknown', 'Mark Ca...",Sonia O'Sullivan will seek to regain her title at the Bupa Great Ireland Run . The 35-year-old ...
269,sport,FA charges Liverpool and Millwall\n\nLiverpool and Millwall have been charged by the Football As...,football,{'Theo Paphitis': 'Unknown'},Liverpool and Millwall have been charged by the FA over crowd trouble during their Carling Cup ...


In [7]:
# ==============================================================================
# Step 6: Save the Final DataFrames to CSV Files
# ==============================================================================
#
# This final step saves your processed data into files so you can easily
# access it later without re-running the entire script.

print("\n--- Step 6: Saving DataFrames to CSV files ---")

try:
    # Save the main DataFrame with all articles and new columns
    # The 'index=False' argument prevents pandas from writing the row numbers as a column.
    df.to_csv('bbc_articles_analyzed.csv', index=False, encoding='utf-8')
    print("Successfully saved the main DataFrame to 'bbc_articles_analyzed.csv'")

    # Save the DataFrame containing only the April summaries
    if not april_df.empty:
        april_df.to_csv('april_articles_summaries.csv', index=False, encoding='utf-8')
        print("Successfully saved the April summaries to 'april_articles_summaries.csv'")

except Exception as e:
    print(f"An error occurred while saving the files: {e}")


--- Step 6: Saving DataFrames to CSV files ---
Successfully saved the main DataFrame to 'bbc_articles_analyzed.csv'
Successfully saved the April summaries to 'april_articles_summaries.csv'
