#### Imports

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
import string

In [7]:
# Download the NLTK stopwords corpus only if it is not already installed.
# nltk.data.find raises LookupError when the resource is missing, so a
# re-run with the corpus present needs no network access at all.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/melwyn/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#### Load CSV

In [8]:
# Read the raw data from a CSV file located next to the notebook.
# Later cells use its "Description" and "Category" columns.
csv_path = "youtube_categories.csv"
df = pd.read_csv(csv_path)

In [None]:
# Handle missing values (if any): keep only rows that have both a
# Description and a Category. Reassigning instead of using inplace=True
# avoids mutating the frame object in place.
df = df.dropna(subset=["Description", "Category"])

In [None]:
# Preprocess the text data

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Filled on first use so the NLTK corpus is read once, not once per row.
_DEFAULT_STOP_WORDS = None


def _default_stop_words():
    """Return the NLTK English stop-word set, loading it on the first call only."""
    global _DEFAULT_STOP_WORDS
    if _DEFAULT_STOP_WORDS is None:
        _DEFAULT_STOP_WORDS = frozenset(stopwords.words("english"))
    return _DEFAULT_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Lowercase ``text``, drop stop words, and strip ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Lowercase words to remove. Defaults to the NLTK English stop-word
        list, which is loaded once and reused across calls.

    Returns
    -------
    str
        The remaining tokens, punctuation-free and joined by single spaces.
    """
    if stop_words is None:
        stop_words = _default_stop_words()

    kept = []
    for token in text.lower().split():
        bare = token.translate(_PUNCTUATION_TABLE)
        if not bare:
            # Token consisted of punctuation only (e.g. "--").
            continue
        # Check the token both with its inner punctuation intact (edge
        # punctuation trimmed) and fully stripped: a stop word written with
        # an apostrophe, such as "don't", would otherwise turn into "dont"
        # and slip past the filter.
        if bare in stop_words or token.strip(string.punctuation) in stop_words:
            continue
        kept.append(bare)
    return " ".join(kept)

# Clean every description element-wise and store the result back in place
# of the raw text.
df["Description"] = df["Description"].map(preprocess_text)

In [11]:
# Balance the dataset (optional, if classes are imbalanced) by downsampling
# every category to the size of the rarest one.
category_counts = df["Category"].value_counts()
min_samples = category_counts.min()

# sort=False walks the categories in order of first appearance. Groups that
# are already at the minimum size are kept untouched; larger ones are
# sampled without replacement using a fixed seed.
balanced_data = [
    rows
    if len(rows) <= min_samples
    else resample(rows, replace=False, n_samples=min_samples, random_state=42)
    for _, rows in df.groupby("Category", sort=False)
]
df = pd.concat(balanced_data)

In [12]:
# Encode labels: fit the encoder on the Category values, then store the
# resulting integer codes in a new "label" column.
label_encoder = LabelEncoder().fit(df["Category"])
df["label"] = label_encoder.transform(df["Category"])

In [14]:
# Split the dataset into training and validation sets (80% / 20%).
# Stratifying on Category keeps the class proportions of df in both splits
# instead of leaving them to chance.
try:
    train_df, val_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df["Category"]
    )
except ValueError:
    # A stratified split is impossible for some inputs (for example a
    # category with a single row); fall back to a plain random split
    # rather than failing.
    print("Stratified split not possible; falling back to a plain random split.")
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

##### Save the datasets

In [15]:
# Save the datasets: write each split to its own CSV file.
# index=False keeps the DataFrame index out of the files.
for csv_name, split_df in (("train.csv", train_df), ("val.csv", val_df)):
    split_df.to_csv(csv_name, index=False)

print("Training and validation datasets created successfully!")

Training and validation datasets created successfully!
