In [9]:
import pandas as pd
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import string
import ast
from collections import Counter
import re
import numpy as np

In [3]:
df = pd.read_csv("/final_data.csv")
df

Unnamed: 0,title,author,description,genres
0,The Hunger Games,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas..."
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPr (Illustrator)",There is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',..."
2,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ..."
3,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,"['Classics', 'Fiction', 'Romance', 'Historical..."
4,Twilight,Stephenie Meyer,About three things I was absolutely positive.F...,"['Young Adult', 'Fantasy', 'Romance', 'Vampire..."
...,...,...,...,...
46972,Fractured,Cheri Schmidt (Goodreads Author),The Fateful Trilogy continues with Fractured. ...,"['Vampires', 'Paranormal', 'Young Adult', 'Rom..."
46973,Anasazi,Emma Michaels,"'Anasazi', sequel to 'The Thirteenth Chime' by...","['Mystery', 'Young Adult']"
46974,Marked,Kim Richardson (Goodreads Author),--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange..."
46975,Wayward Son,"Tom Pollack (Goodreads Author), John Loftus (G...",A POWERFUL TREMOR UNEARTHS AN ANCIENT SECRETBu...,"['Fiction', 'Mystery', 'Historical Fiction', '..."


### Gộp và xử lý các cột title, author, description

In [6]:
df['text'] = df['title'] + ' ' + df['author'] + ' ' + df['description']
lemmatizer = WordNetLemmatizer()

# Hàm tiền xử lý văn bản
def preprocess_text(text):
    # Chuyển thành chữ thường
    text = text.lower()
    # Loại bỏ ký tự đặc biệt
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenization và loại bỏ stopwords
    words = text.split()
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Áp dụng tiền xử lý cho các cột 'title', 'author', 'description'
df['text']= df['text'].astype(str)
df['text'] = df['text'].apply(preprocess_text)


In [7]:
df['text']

Unnamed: 0,text
0,hunger game suzanne collins winning mean fame ...
1,harry potter order phoenix jk rowling mary gra...
2,kill mockingbird harper lee unforgettable nove...
3,pride prejudice jane austen anna quindlen intr...
4,twilight stephenie meyer thing absolutely posi...
...,...
46972,fractured cheri schmidt goodreads author fatef...
46973,anasazi emma michael anasazi sequel thirteenth...
46974,marked kim richardson goodreads author reader ...
46975,wayward son tom pollack goodreads author john ...


### Embedding cột text

In [8]:
model = Word2Vec(sentences=df['text'], vector_size=200, window=5, min_count=1, sg=0)

# Get word embeddings for each description
def get_embedding(text_tokens):
    # Get embeddings for each word in the description and average them
    embeddings = [model.wv[word] for word in text_tokens if word in model.wv]
    if embeddings:
        return sum(embeddings) / len(embeddings)
    else:
        return [0] * model.vector_size  # Return a zero vector if no words are in the model

# Apply to create embedding column
df['embedding'] = df['text'].apply(get_embedding)



### Tiền xử lý nhãn sách

In [10]:
df['genres'] = df['genres'].apply(ast.literal_eval)


In [11]:
all_genres = [genre for genres_list in df['genres'] for genre in genres_list]
genre_counts = Counter(all_genres)
genre_counts_df = pd.DataFrame(genre_counts.items(), columns=['Genre', 'Count'])
total_genres = len(genre_counts)
genre_counts_df = genre_counts_df.sort_values(by='Count', ascending=False)

print(f'Total number of genres: {total_genres}')
print(genre_counts_df)

Total number of genres: 982
               Genre  Count
1            Fiction  31282
5            Romance  15397
3            Fantasy  14933
0        Young Adult  11787
41      Contemporary  10477
..               ...    ...
940          Algebra      1
941   Did Not Finish      1
974  Royal Air Force      1
975         Warriors      1
976  Human Resources      1

[982 rows x 2 columns]


In [None]:
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import torch

# Load pretrained BERT model và tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# danh sách nhãn
labels = genre_counts_df['Genre']

# embedding nhãn bằng BERT
def get_bert_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1) 
    return embeddings

embeddings = labels.apply(get_bert_embeddings)

# Phân cụm bằng KMeans
num_clusters = 50  
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(np.vstack(embeddings))

clustered_labels = {i: [] for i in range(num_clusters)}
for i, label in enumerate(labels):
    cluster_idx = kmeans.labels_[i]
    clustered_labels[cluster_idx].append(label)

print(clustered_labels)

{0: ['Polish Literature', 'Turkish Literature', 'Romanian Literature', 'Finnish Literature', 'Czech Literature', 'Bulgarian Literature', 'Serbian Literature', 'Hungarian Literature', 'Albanian Literature', 'Ukrainian Literature'], 1: ['Middle Grade', 'Christian', 'Self Help', 'High School', 'Read For School', 'Boarding School', '2nd Grade', '1st Grade', 'Grad School', 'Homeschool', 'Read For College', 'Foster Children', 'Back To School', 'Foster Parents'], 2: ['Lie', 'Yeti', 'Go'], 3: ['Manga', 'Shojo', 'Shonen', 'Anime', 'Seinen', 'Batman', 'Superman', 'Josei', 'Shounen Ai', 'Manga Romance', 'Comedian'], 4: ['Star Wars', 'Doctor Who', 'Star Trek', 'Buffy The Vampire Slayer', 'Star Trek The Next Generation', 'Star Trek Deep Space Nine', 'Star Trek Original Series', 'Star Trek Voyager'], 5: ['Shapeshifters', 'Sword and Sorcery', 'Harlequin', 'Cthulhu Mythos', 'Harlequin Romance', 'Harlequin Heartwarming', 'Harlequin Desire', 'Swashbuckling', 'Harlequin Blaze', 'Harlequin Nocturne', 'Har

In [14]:
cluster_representatives = {}
for cluster_id, tags_in_cluster in clustered_labels.items():
    # Nhãn đại diện: chọn nhãn phổ biến nhất hoặc kết hợp nhãn
    # Ở đây chọn nhãn đầu tiên trong cụm làm đại diện
    cluster_representatives[cluster_id] = tags_in_cluster[0]

# Tạo DataFrame hiển thị nhãn đại diện và các nhãn trong từng cụm
representative_df = pd.DataFrame([
    {"Cluster ID": cluster_id, "Representative": representative, "Tags": tags_in_cluster}
    for cluster_id, (representative, tags_in_cluster) in enumerate(zip(cluster_representatives.values(), clustered_labels.values()))
])

print(representative_df)

    Cluster ID                Representative  \
0            0             Polish Literature   
1            1                  Middle Grade   
2            2                           Lie   
3            3                         Manga   
4            4                     Star Wars   
5            5                 Shapeshifters   
6            6          Contemporary Romance   
7            7                       Mystery   
8            8                      Dystopia   
9            9  The United States Of America   
10          10                         Drama   
11          11                Latin American   
12          12                   M M Romance   
13          13                    Philosophy   
14          14                        Travel   
15          15                       Erotica   
16          16                        Russia   
17          17            Historical Fiction   
18          18                       Germany   
19          19            British Litera

In [None]:
import pandas as pd

# tạo từ điển ánh xạ nhãn đến nhãn đại diện
genre_to_rep = {}
for idx, row in representative_df.iterrows():
    for genre in row['Tags']:
        genre_to_rep[genre] = row['Representative']

def map_to_representative_category(genre_list):
    return [genre_to_rep.get(genre, genre) for genre in genre_list]  # Default to genre if not found in the mapping

df['mapped_genres'] = df['genres'].apply(map_to_representative_category)
print(df[['genres', 'mapped_genres']].head())



                                              genres  \
0  [Young Adult, Fiction, Dystopia, Fantasy, Scie...   
1  [Fantasy, Young Adult, Fiction, Magic, Childre...   
2  [Classics, Fiction, Historical Fiction, School...   
3  [Classics, Fiction, Romance, Historical Fictio...   
4  [Young Adult, Fantasy, Romance, Vampires, Fict...   

                                       mapped_genres  
0  [Young Adult, Fiction, Dystopia, Fantasy, Scie...  
1  [Fantasy, Young Adult, Fiction, Mystery, Child...  
2  [Romance, Fiction, Historical Fiction, Fantasy...  
3  [Romance, Fiction, Romance, Historical Fiction...  
4  [Young Adult, Fantasy, Romance, Mystery, Ficti...  


In [None]:
# Xóa các thẻ trùng lặp trong từng hàng của cột 'mapped_genres'
df['mapped_genres'] = df['mapped_genres'].apply(lambda x: list(set(x)))
print(df[['genres', 'mapped_genres']].head())


                                              genres  \
0  [Young Adult, Fiction, Dystopia, Fantasy, Scie...   
1  [Fantasy, Young Adult, Fiction, Magic, Childre...   
2  [Classics, Fiction, Historical Fiction, School...   
3  [Classics, Fiction, Romance, Historical Fictio...   
4  [Young Adult, Fantasy, Romance, Vampires, Fict...   

                                       mapped_genres  
0  [Young Adult, Science Fiction, Fiction, Erotic...  
1  [Young Adult, Science Fiction, Fiction, Middle...  
2  [Young Adult, Historical Fiction, Fiction, Mid...  
3  [Historical Fiction, British Literature, Ficti...  
4  [Young Adult, Science Fiction, Fiction, Contem...  


In [25]:
df.to_csv('check2.csv',index = False)

In [26]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['mapped_genres'])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df['embedding'], y, test_size=0.2, random_state=42)

In [None]:
import pickle
# Lưu train và test set
with open('X_train.pkl', 'wb') as f:
    pickle.dump(X_train, f)

with open('X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)

with open('y_train.pkl', 'wb') as f:
    pickle.dump(y_train, f)

with open('y_test.pkl', 'wb') as f:
    pickle.dump(y_test, f)

print("Train and test sets have been saved.")


Train and test sets have been saved.
