In [1]:
from google.colab import files
print("Please upload your bbcnews.csv file:")
uploaded = files.upload()


Please upload your bbcnews.csv file:


Saving BBCNews.csv to BBCNews.csv


In [14]:
import pandas as pd, json, os

FILENAME = list(uploaded.keys())[0]  # pick the uploaded file

def load_dataset(filename):
    if filename.lower().endswith(".csv"):
        return pd.read_csv(filename)
    elif filename.lower().endswith(".json") or filename.lower().endswith(".jsonl"):
        rows = []
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                line=line.strip()
                if not line:
                    continue
                try:
                    rows.append(json.loads(line))
                except:
                    f.seek(0); rows = json.load(f); break
        return pd.DataFrame(rows)
    else:
        raise ValueError("Unsupported file type. Use CSV or JSON/JSONL.")

df_raw = load_dataset(FILENAME)
print("Original shape:", df_raw.shape)
print("Columns:", df_raw.columns.tolist())
df_raw.head(3)



Original shape: (2410, 3)
Columns: ['Unnamed: 0', 'descr', 'tags']


Unnamed: 0.1,Unnamed: 0,descr,tags
0,0,chelsea sack mutu chelsea have sacked adrian ...,"sports, stamford bridge, football association,..."
1,1,record fails to lift lacklustre meet yelena i...,"sports, madrid, birmingham, france, scotland, ..."
2,2,edu describes tunnel fracas arsenals edu has ...,"sports, derby, brazil, tunnel fracasedu, food,..."


In [15]:
# try common names, else fallback to your CSV columns
text_candidates  = ['text','content','article','body','description','descr','short_description','headline']
label_candidates = ['category','label','class','topic','section','tags']

TEXT_COL  = next((c for c in text_candidates  if c in df_raw.columns), None)
LABEL_COL = next((c for c in label_candidates if c in df_raw.columns), None)
assert TEXT_COL and LABEL_COL, "Set TEXT_COL and LABEL_COL manually."

df = df_raw[[TEXT_COL, LABEL_COL]].rename(columns={TEXT_COL:'content', LABEL_COL:'category'})
df = df.dropna(subset=['content','category']).copy()
print("After select:", df.shape)
df.head(3)


After select: (2410, 2)


Unnamed: 0,content,category
0,chelsea sack mutu chelsea have sacked adrian ...,"sports, stamford bridge, football association,..."
1,record fails to lift lacklustre meet yelena i...,"sports, madrid, birmingham, france, scotland, ..."
2,edu describes tunnel fracas arsenals edu has ...,"sports, derby, brazil, tunnel fracasedu, food,..."


In [16]:
import re

def to_bbc_cat(s: str):
    s = str(s).lower()
    if re.search(r'\bsport', s): return 'sport'
    if re.search(r'\bbusiness', s): return 'business'
    if re.search(r'\bentertain', s): return 'entertainment'
    if re.search(r'\bpolitic', s): return 'politics'
    if re.search(r'\btech', s): return 'tech'
    return None

df['category'] = df['category'].apply(to_bbc_cat)
df = df.dropna(subset=['category']).copy()

print("Counts (raw to BBC):\n", df['category'].value_counts())
print("Shape:", df.shape)


Counts (raw to BBC):
 category
business         599
sport            553
entertainment    442
politics         354
tech             312
Name: count, dtype: int64
Shape: (2260, 2)


In [17]:
# keep classes with at least 15 samples to allow stratified split later
min_per_class = 15
vc = df['category'].value_counts()
keep = vc[vc >= min_per_class].index
df = df[df['category'].isin(keep)].copy()

# cap to 2000 rows
if len(df) > 2000:
    df = df.sample(n=2000, random_state=42)

print("Final counts:\n", df['category'].value_counts())
print("Final shape:", df.shape)
df.head(3)



Final counts:
 category
business         512
sport            497
entertainment    392
politics         324
tech             275
Name: count, dtype: int64
Final shape: (2000, 2)


Unnamed: 0,content,category
1674,cactus diet deal for phytopharm a slimming ai...,business
814,parents face video game lessons ways of ensur...,entertainment
1270,bush website blocked outside us surfers outsi...,business


In [18]:
import os
os.makedirs("data", exist_ok=True)
out_path = "data/newsbot_dataset.csv"
df.to_csv(out_path, index=False)
print(f"✅ Saved: {out_path}")
!ls -la data | sed -n '1,5p'



✅ Saved: data/newsbot_dataset.csv
total 4352
drwxr-xr-x 2 root root    4096 Nov  1 19:39 .
drwxr-xr-x 1 root root    4096 Nov  1 19:39 ..
-rw-r--r-- 1 root root 4445730 Nov  1 19:39 newsbot_dataset.csv


In [19]:
import pandas as pd

# 1) Reload the saved dataset (this is what you'll use later)
df = pd.read_csv("data/newsbot_dataset.csv")
print(df.shape)
print(df['category'].value_counts())
df.head(3)


(2000, 2)
category
business         512
sport            497
entertainment    392
politics         324
tech             275
Name: count, dtype: int64


Unnamed: 0,content,category
0,cactus diet deal for phytopharm a slimming ai...,business
1,parents face video game lessons ways of ensur...,entertainment
2,bush website blocked outside us surfers outsi...,business


In [20]:
# 2) Basic quality checks
print("Nulls:\n", df.isnull().sum())
print("Avg text length (chars):", df['content'].str.len().mean())
print("Min/Max text length:", df['content'].str.len().min(), df['content'].str.len().max())


Nulls:
 content     0
category    0
dtype: int64
Avg text length (chars): 2213.172
Min/Max text length: 57 15672
