In [14]:
# Step 1: Install Kaggle API
!pip install kaggle



In [15]:
# Step 2: Upload your kaggle.json file
from google.colab import files

print("Please upload your kaggle.json file:")
uploaded = files.upload()

# Colab does not overwrite an existing file: a repeated upload is stored under
# a new name such as 'kaggle (1).json', leaving an older 'kaggle.json' in the
# working directory. Write the bytes that were just uploaded to the canonical
# name so the credential-setup step always copies the current token.
if len(uploaded) != 1:
    raise ValueError(
        f"Expected exactly one uploaded file (kaggle.json), got {len(uploaded)}."
    )
(kaggle_token,) = uploaded.values()
with open("kaggle.json", "wb") as token_file:
    token_file.write(kaggle_token)


Please upload your kaggle.json file:


Saving kaggle.json to kaggle (1).json


In [16]:
# Step 3: Set up API credentials
# Create the per-user Kaggle config directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# NOTE(review): this copies the file literally named kaggle.json from the
# working directory. Confirm it holds the token that was just uploaded; the
# recorded upload output shows Colab saving a re-upload as 'kaggle (1).json'.
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [17]:
# BBC News dataset (recommended).

# Download the data of the Kaggle competition "learn-ai-bbc" (-c selects the
# competition). The files arrive as one archive, learn-ai-bbc.zip, in the
# working directory. If a newer local copy already exists the CLI skips the
# download unless --force is passed.
!kaggle competitions download -c learn-ai-bbc

learn-ai-bbc.zip: Skipping, found more recently modified local copy (use --force to force download)


In [18]:
# Unzip the files
!unzip learn-ai-bbc.zip


Archive:  learn-ai-bbc.zip
replace BBC News Sample Solution.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [19]:
# List the working directory (long format, hidden entries included) to see
# which files the download and extraction produced.
!ls -la

total 6880
drwxr-xr-x 1 root root    4096 Nov  2 16:19  .
drwxr-xr-x 1 root root    4096 Nov  2 16:13  ..
-rw-r--r-- 1 root root   10369 Dec  2  2019 'BBC News Sample Solution.csv'
-rw-r--r-- 1 root root 1712432 Dec  2  2019 'BBC News Test.csv'
-rw-r--r-- 1 root root 3351206 Dec  2  2019 'BBC News Train.csv'
drwxr-xr-x 4 root root    4096 Oct 30 13:36  .config
-rw-r--r-- 1 root root      64 Nov  2 16:19 'kaggle (1).json'
-rw-r--r-- 1 root root      64 Nov  2 16:14  kaggle.json
-rw-r--r-- 1 root root 1936538 Dec  2  2019  learn-ai-bbc.zip
drwxr-xr-x 1 root root    4096 Oct 30 13:36  sample_data


In [20]:
# Load the dataset
import pandas as pd
import os

In [21]:
# Show every CSV file that is present in the working directory
print("Available files:")
csv_names = [entry for entry in os.listdir('.') if entry.endswith('.csv')]
for csv_name in csv_names:
    print(f"  - {csv_name}")

Available files:
  - BBC News Sample Solution.csv
  - BBC News Train.csv
  - BBC News Test.csv


In [22]:
# Read the training split into a DataFrame and print a quick structural
# summary: dimensions, column names and the distinct category labels.
# Change the filename / column name here if a different dataset is used.
df = pd.read_csv('BBC News Train.csv')
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Categories:", df['Category'].unique())

Dataset shape: (1490, 3)
Columns: ['ArticleId', 'Text', 'Category']
Categories: ['business' 'tech' 'politics' 'sport' 'entertainment']


In [23]:
# Data preparation template
import pandas as pd
import numpy as np

In [33]:
# 2. Names of the raw columns that hold the article text and its label.
# The preparation cells below refer to these variables, so a dataset with a
# different schema only needs to be adjusted here.
text_column: str = 'Text'          # alternatives: 'description', 'content', 'headline', ...
category_column: str = 'Category'  # alternatives: 'label', 'class', ...

In [25]:
# 3. Report how many null entries each column contains
null_counts = df.isna().sum()
print("\nMissing values:")
print(null_counts)


Missing values:
ArticleId    0
Text         0
Category     0
dtype: int64


In [28]:
# 4. Keep only the rows that have both a text value and a category value
required_columns = [text_column, category_column]
df_clean = df.dropna(subset=required_columns)

In [29]:
# 5. Count the rows per category to see how balanced the classes are
category_counts = df_clean[category_column].value_counts()
print("\nCategory distribution:")
print(category_counts)


Category distribution:
Category
sport            346
business         336
politics         274
entertainment    273
tech             261
Name: count, dtype: int64


In [30]:
# 6. Cap the dataset size (keep it at or under MAX_ROWS rows for Colab).
# Frames within the cap are passed through unchanged; larger ones are
# down-sampled with a fixed seed so the selection is repeatable.
MAX_ROWS = 2000
if len(df_clean) <= MAX_ROWS:
    df_final = df_clean
else:
    df_final = df_clean.sample(n=MAX_ROWS, random_state=42)
    print(f"\nSampled dataset to {len(df_final)} rows")

In [31]:
# 7. Give the text and label columns fixed, dataset-independent names
standard_names = {text_column: 'content', category_column: 'category'}
df_final = df_final.rename(columns=standard_names)

In [32]:
# 8. Write the prepared frame to disk as CSV and confirm on stdout
df_final.to_csv(
    'newsbot_dataset.csv',
    index=False,  # do not write the DataFrame's row index as a column
)
print("\n✅ Dataset prepared and saved as 'newsbot_dataset.csv'")


✅ Dataset prepared and saved as 'newsbot_dataset.csv'
