In [1]:
# Mount Google Drive at /content/drive so later cells can read and write files under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Location of the input CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/final_data.csv'
# Load the CSV into a DataFrame; later cells work on its `genres` column.
data = pd.read_csv(file_path)

In [3]:
# Show the first 10 raw values of the `genres` column.
print(data['genres'].head(10))

0    ['Young Adult', 'Fiction', 'Dystopia', 'Fantas...
1    ['Fantasy', 'Young Adult', 'Fiction', 'Magic',...
2    ['Classics', 'Fiction', 'Historical Fiction', ...
3    ['Classics', 'Fiction', 'Romance', 'Historical...
4    ['Young Adult', 'Fantasy', 'Romance', 'Vampire...
5    ['Historical Fiction', 'Fiction', 'Young Adult...
6    ['Classics', 'Fiction', 'Dystopia', 'Fantasy',...
7    ['Fantasy', 'Classics', 'Fiction', 'Young Adul...
8    ['Fantasy', 'Fiction', 'Classics', 'Adventure'...
9    ['Classics', 'Historical Fiction', 'Fiction', ...
Name: genres, dtype: object


In [4]:
# Print the (rows, columns) of the frame as loaded.
print(data.shape)

(46977, 4)


In [5]:
# Break every genres string on commas and flatten the pieces into a single
# Series holding one candidate tag per entry.
all_tags = (
    data['genres']
    .str.split(',')
    .explode()
)
import re

def normalize_tag(tag):
    """Return a canonical form of a single genre tag.

    Every character that is neither a word character nor whitespace is
    removed, surrounding whitespace is trimmed, and the result is
    lower-cased. A null tag (None / NaN) yields the empty string.
    """
    if not pd.isnull(tag):
        without_punctuation = re.sub(r'[^\w\s]', '', tag)
        return without_punctuation.strip().lower()
    return ''

# Normalize every tag (strip punctuation, trim, lower-case).
all_tags = all_tags.dropna().apply(normalize_tag)
# Fragments made only of punctuation/whitespace (e.g. the "[]" of an empty
# genre list) normalize to '' -- exclude them so the empty string is never
# counted as a genre and can never end up among the top tags.
all_tags = all_tags[all_tags != '']
# The five most frequent tags; rows are filtered against these afterwards.
top_tags = all_tags.value_counts().head(5).index

def new_tags(genres, top_tags):
    """Reduce a comma-separated genres string to the tags found in `top_tags`.

    Each comma-separated piece is normalized with `normalize_tag`; pieces
    whose normalized form is in `top_tags` are kept in their original order
    and joined with commas. Returns None when `genres` is null or when no
    piece survives the filter.
    """
    if pd.isnull(genres):
        return None

    kept = []
    for raw_tag in genres.split(','):
        cleaned = normalize_tag(raw_tag)
        if cleaned in top_tags:
            kept.append(cleaned)

    if not kept:
        return None
    return ','.join(kept)

# Replace each row's genres with only its top tags (comma-joined), or None
# when the row contains none of them.
data['genres'] = data['genres'].apply(new_tags, args=(top_tags,))
# Keep only rows that still have a genre. Take an explicit copy: a frame
# produced by boolean-mask selection is tracked as a slice of the original,
# and assigning a new column to it later raises SettingWithCopyWarning.
data = data[data['genres'].notnull()].copy()
# Persist the filtered dataset to Drive (index column omitted).
data.to_csv('/content/drive/MyDrive/fix_news_tag.csv', index=False)

In [6]:
# Display the five most frequent normalized tags that rows were filtered against.
top_tags

Index(['fiction', 'romance', 'fantasy', 'young adult', 'contemporary'], dtype='object', name='genres')

In [7]:
# Show the first 10 `genres` values after reduction to the top tags (comma-joined, lower-cased).
print(data['genres'].head(10))

0    young adult,fiction,fantasy,romance
1            fantasy,young adult,fiction
2                    fiction,young adult
3                        fiction,romance
4    young adult,fantasy,romance,fiction
5                    fiction,young adult
6                        fiction,fantasy
7            fantasy,fiction,young adult
8            fantasy,fiction,young adult
9                        fiction,romance
Name: genres, dtype: object


In [8]:
# Print the (rows, columns) left after dropping rows without any top tag.
print(data.shape)

(36657, 4)


In [9]:
# Print the number of missing values in each column.
print(data.isnull().sum())

title          110
author           2
description      0
genres           0
dtype: int64


In [10]:
# Number of comma-separated tags in each row's genres string (0 when missing).
data['num_tags'] = data['genres'].apply(
    lambda x: 0 if pd.isnull(x) else len(x.split(','))
)

# Smallest and largest tag count across all rows, printed one per line.
min_tags, max_tags = data['num_tags'].min(), data['num_tags'].max()
print(min_tags)
print(max_tags)

1
5


In [24]:
# Occurrence counts of the five most frequent normalized tags.
all_tags.value_counts().head(5)

Unnamed: 0_level_0,count
genres,Unnamed: 1_level_1
fiction,31282
romance,15397
fantasy,14933
young adult,11787
contemporary,10477


In [20]:
# Discard every row that still has a missing value in any column, then
# remove the helper tag-count column.
data = data.dropna(axis=0, how='any')
data = data.drop('num_tags', axis=1)

In [23]:
# Missing values per column, and their grand total across the whole frame.
nan_counts = data.isna().sum()
total_nan = nan_counts.sum()
print(nan_counts)

title          0
author         0
description    0
genres         0
dtype: int64


In [18]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 40% of the rows, then halve the hold-out into validation and test
# (60/20/20 overall). The fixed seed keeps the partition reproducible.
train_df, val_test = train_test_split(data, random_state=42, test_size=0.4)
val_df, test_df = train_test_split(val_test, random_state=42, test_size=0.5)

# Write each split to its own CSV in the working directory, without the index.
for _csv_name, _split in (("train.csv", train_df), ("val.csv", val_df), ("test.csv", test_df)):
    _split.to_csv(_csv_name, index=False)