In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
manga_df = pd.read_csv('../data/processed_comic.csv')
manga_df.head()

Unnamed: 0,Title,Score,Vote,Ranked,Popularity,Members,Favorite,Volumes,Chapters,Status,Genres,Author,Recommended,Mixed Feelings,Not Recommended,Release date,Time from release (months)
0,Berserk,9.47,331288,1,1,665300,122841,4.0,23.0,Publishing,"['Action', 'Adventure', 'Award Winning', 'Dram...",['Miura Kentarou Studio Gaga'],233,15,10,1989-08-25,411
1,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.3,156368,2,26,256146,42864,24.0,96.0,Finished,"['Action', 'Adventure', 'Mystery', 'Supernatur...",['Araki Hirohiko'],120,7,1,2004-01-19,87
2,Vagabond,9.24,136403,3,15,364891,40158,37.0,327.0,On Hiatus,"['Action', 'Adventure', 'Award Winning']",['Inoue Takehiko Yoshikawa Eiji'],88,8,1,1998-09-03,201
3,One Piece,9.22,366668,4,3,599278,114531,4.0,23.0,Publishing,"['Action', 'Adventure', 'Fantasy']",['Oda Eiichiro'],173,17,16,1997-07-22,316
4,Monster,9.15,93945,5,29,236355,20501,18.0,162.0,Finished,"['Award Winning', 'Drama', 'Mystery']",['Urasawa Naoki'],64,7,5,1994-12-05,84


In [3]:
# Bước 1: Xử lý cột "Genres"
# Tách các nhãn và sử dụng One-Hot Encoding
genres = manga_df['Genres'].str.get_dummies(sep=', ')
manga_df = pd.concat([manga_df, genres], axis=1)

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

# shuffle dataset
# manga_df = manga_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Bước 2: Xử lý cột "Status"
# Sử dụng Label Encoding hoặc One-Hot Encoding
le = LabelEncoder()
manga_df['Status'] = le.fit_transform(manga_df['Status'])

# Chọn các cột số để làm đặc trưng
numeric_columns = ['Vote', 'Popularity', 'Members', 'Favorite', 'Volumes','Chapters', 'Recommended', 'Mixed Feelings', 'Not Recommended', 'Status']

# Chọn cột 'Score' làm biến mục tiêu
target_column = 'Score'

# Lọc dữ liệu
data = manga_df[numeric_columns + list(genres.columns) + [target_column]].dropna()

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
# Chia dữ liệu thành tập huấn luyện và tập temp (tổng cộng tập test và validation)
X_train, X_temp, y_train, y_temp = train_test_split(data.drop(target_column, axis=1), data[target_column], test_size=0.2, random_state=42)

# Chia tập temp thành tập test và tập validation
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


# Hàm để thực hiện cross-validation và tính trung bình lỗi
def cross_val(model, X, y, cv=5):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    mse = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)
    return -mse.mean()

# Huấn luyện các mô hình và đánh giá chúng
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor()
}

scaler = StandardScaler()

# Tạo pipeline để tự động thực hiện chuẩn hóa và huấn luyện mô hình
numeric_features = numeric_columns + list(genres.columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features)
    ])

for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    
    pipeline.fit(X_train, y_train)
    
    train_accuracy = pipeline.score(X_train, y_train)
    test_accuracy = pipeline.score(X_test, y_test)
    
    print(f'{name} Train Accuracy: {train_accuracy * 100:.2f}%')
    print(f'{name} Test Accuracy: {test_accuracy * 100:.2f}%')




Linear Regression Train Accuracy: 50.70%
Linear Regression Test Accuracy: -213618517759394486157312.00%
Random Forest Train Accuracy: 95.61%
Random Forest Test Accuracy: 70.75%
XGBoost Train Accuracy: 92.62%
XGBoost Test Accuracy: 68.21%


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import numpy as np

# Chia dữ liệu thành features và target
X = data.drop(target_column, axis=1).values
y = data[target_column].values

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Chia thành tập train, validation, test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Chuyển dữ liệu thành Tensor
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train).view(-1, 1)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val).view(-1, 1)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test).view(-1, 1)

# Tạo DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Xây dựng mô hình
class RegressionModel(nn.Module):
    def __init__(self, input_size):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Hàm huấn luyện mô hình
def train_model(model, train_loader, val_loader, num_epochs=50, lr=0.001):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, labels)

        print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss.item():.3f}')

# Hàm tính score
def evaluate_model(model, data_loader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            outputs = model(inputs)
            predictions.extend(outputs.numpy())
            true_labels.extend(labels.numpy())

    mse = mean_squared_error(true_labels, predictions)
    r2 = r2_score(true_labels, predictions)
    return mse, r2


# Tạo và huấn luyện mô hình
input_size = X_train.shape[1]
model = RegressionModel(input_size)
train_model(model, train_loader, val_loader)

# Đánh giá trên tập train
train_mse, train_r2 = evaluate_model(model, train_loader)
print(f'Train Mean Squared Error: {train_mse:.3f}, Train R^2 Score: {train_r2:.3f}')

# Đánh giá trên tập validation
val_mse, val_r2 = evaluate_model(model, val_loader)
print(f'Validation Mean Squared Error: {val_mse:.3f}, Validation R^2 Score: {val_r2:.3f}')

# Đánh giá trên tập test
test_mse, test_r2 = evaluate_model(model, test_loader)
print(f'Test Mean Squared Error: {test_mse:.3f}, Test R^2 Score: {test_r2:.3f}')


Epoch 1/50, Validation Loss: 292.531
Epoch 2/50, Validation Loss: 48.990
Epoch 3/50, Validation Loss: 10.072
Epoch 4/50, Validation Loss: 7.605
Epoch 5/50, Validation Loss: 6.518
Epoch 6/50, Validation Loss: 5.653
Epoch 7/50, Validation Loss: 5.015
Epoch 8/50, Validation Loss: 4.527
Epoch 9/50, Validation Loss: 4.215
Epoch 10/50, Validation Loss: 4.003
Epoch 11/50, Validation Loss: 3.816
Epoch 12/50, Validation Loss: 3.721
Epoch 13/50, Validation Loss: 3.566
Epoch 14/50, Validation Loss: 3.488
Epoch 15/50, Validation Loss: 3.495
Epoch 16/50, Validation Loss: 3.415
Epoch 17/50, Validation Loss: 3.487
Epoch 18/50, Validation Loss: 3.367
Epoch 19/50, Validation Loss: 3.368
Epoch 20/50, Validation Loss: 3.355
Epoch 21/50, Validation Loss: 3.333
Epoch 22/50, Validation Loss: 3.316
Epoch 23/50, Validation Loss: 3.353
Epoch 24/50, Validation Loss: 3.358
Epoch 25/50, Validation Loss: 3.323
Epoch 26/50, Validation Loss: 3.454
Epoch 27/50, Validation Loss: 3.311
Epoch 28/50, Validation Loss: 3.4