# Setup Working directory

In [None]:
import os

if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

# Download Dataset and Libraries

In [3]:
!pip install -q cuml-cu12

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build 

# Processing

## Load Exists Dataset

In [5]:
import os
import pandas as pd
import cudf
import numpy as np
import cupy as cp
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.model_selection import train_test_split

In [6]:
converted_df = pd.read_csv('dataset/converted_train_dataset.csv')
converted_test_df = pd.read_csv('dataset/converted_test_dataset.csv')

## Word2Vec

In [7]:
# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data
vectorizer.fit(converted_df['content'])

# Transform both the training and testing data
X_vectors = vectorizer.transform(converted_df['content'])
X_test = vectorizer.transform(converted_test_df['content'])

In [8]:
y = converted_df['label']
# Convert y to numerical labels if it contains strings
if y.dtype == 'object':
    unique_classes = y.unique()
    class_to_number = {class_name: i for i, class_name in enumerate(unique_classes)}
    y = y.map(class_to_number)
    y = cudf.Series(y) # Convert to cudf Series if necessary

y_test = converted_test_df['label']  # Assuming you need this for testing later
# Convert y_test to numerical labels as well if necessary
if y_test.dtype == 'object':
    y_test = y_test.map(class_to_number)
    y_test = cudf.Series(y_test) # Convert to cudf Series if necessary

## Split Dataset

In [9]:
X_vectors = cp.asnumpy(X_vectors.toarray())
y = cp.asnumpy(y.to_cupy())

In [10]:
# Convert train_vectors to dense CuPy array
# X_vectors = X_vectors.toarray()  # Convert to dense NumPy array

# Split the data using train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_vectors, y, test_size=0.2,
                                                  stratify=y, random_state=12345)

In [11]:
# X_train = cp.asnumpy(X_train)
# X_val = cp.asnumpy(X_val)
X_test = cp.asnumpy(X_test.toarray())
# y_train = cp.asnumpy(y_train.to_cupy())
# y_val = cp.asnumpy(y_val.to_cupy())
y_test = cp.asnumpy(y_test.to_cupy())

# All Models

In [None]:
from cuml.ensemble import RandomForestClassifier
import xgboost as xgb
from cuml.linear_model import LogisticRegression
from cuml.svm import SVC
from cuml.neighbors import NearestNeighbors
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [None]:
def show_evaluation(grid_model, model_name='SVM'):
    print(f'Best model of {model_name}', grid_model.best_params_)
    
    val_pred = grid_model.best_estimator_.predict(cp.asnumpy(X_val))
    test_pred = grid_model.best_estimator_.predict(cp.asnumpy(X_test))

    print(f'Accuracy of {model_name} in Val Set is: ', accuracy_score(y_val, val_pred))
    print(f'Accuracy of {model_name} in Test Set is: ', accuracy_score(y_test, test_pred))

    print(f'\nClassification Report of {model_name} in Test Set is:\n',
            classification_report(y_test, test_pred, digits=6, target_names=list(class_to_number.keys())))

In [14]:
ps = PredefinedSplit(test_fold= [-1] * len(X_train) + [0] * len(X_val))

# split = [range(len(X_train)), range(len(X_train), len(X_train) + len(X_val))]

X_merge = np.concatenate((X_train, X_val), axis=0)
y_merge = np.concatenate((y_train, y_val), axis=0)

In [38]:
# All Params Grid

rf_params_grid = { 
                'n_estimators': [100, 200, 300],
                'max_features': ['sqrt', 'log2'],
                'max_depth' : [6,8,10],
                'split_criterion' :[0, 1] # 0 or 'gini' for gini impurity; 1 or 'entropy' for information gain (entropy)
                }

xgb_params_grid = {'learning_rate': [0.001, 0.01, 0.1], #so called `eta` value
              'max_depth': [5, 6, None],
              'n_estimators': [5, 6, 7]}

lr_params_grid = {'C': [0.1, 1, 10],
              'penalty': ['l1', 'l2', None],
              'max_iter': [100, 200, 300]}

svm_params_grid = {'C': [0.1, 1, 10],
              'gamma': [1, 0.1, 0.01],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
knn_params_grid = {"n_neighbors": list(range(1, 10))}

In [28]:
rf_model = RandomForestClassifier()
xgb_model = xgb.XGBClassifier(tree_method= 'gpu_hist')
lr_model = LogisticRegression()
svm_model = SVC()
knn_model = NearestNeighbors()

In [39]:
rf_grid_search = GridSearchCV(rf_model, rf_params_grid, cv=ps, n_jobs=-1, scoring='accuracy')
xgb_grid_search = GridSearchCV(xgb_model, xgb_params_grid, cv=ps, n_jobs=-1, scoring='accuracy')
lr_grid_search = GridSearchCV(lr_model, lr_params_grid, cv=ps, n_jobs=-1, scoring='accuracy')
svm_grid_search = GridSearchCV(svm_model, svm_params_grid, cv=ps, n_jobs=-1, scoring='accuracy')
knn_grid_search = GridSearchCV(knn_model, knn_params_grid, cv=ps, n_jobs=-1, scoring='accuracy')

## Random Forest

In [40]:
rf_grid_search.fit(X_merge, y_merge)
show_evaluation(rf_grid_search, 'Random Forest')

Best model of Random Forest {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 100, 'split_criterion': 1}
Accuracy of Random Forest in Val Set is:  0.886777969679524
Accuracy of Random Forest in Test Set is:  0.6516666666666666

Classification Report of Random Forest in Test Set is:
               precision    recall  f1-score   support

    Doi song   0.447531  0.483333  0.464744       300
   Phap luat   0.714667  0.893333  0.794074       300
     Van hoa   0.489362  0.153333  0.233503       300
    The thao   0.848571  0.990000  0.913846       300
  Kinh doanh   0.660000  0.880000  0.754286       300
    The gioi   0.892857  0.666667  0.763359       300
    Khoa hoc   0.933333  0.186667  0.311111       300
    Suc khoe   0.502618  0.960000  0.659794       300

    accuracy                       0.651667      2400
   macro avg   0.686117  0.651667  0.611839      2400
weighted avg   0.686117  0.651667  0.611839      2400



## XGB

In [26]:
xgb_grid_search.fit(X_merge, y_merge)
show_evaluation(xgb_grid_search, 'XGB')


    E.g. tree_method = "hist", device = "cuda"



Best model of XGB {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 7}



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Accuracy of XGB in Val Set is:  0.8906160046056419
Accuracy of XGB in Test Set is:  0.6779166666666666

Classification Report of XGB in Test Set is:
               precision    recall  f1-score   support

    Doi song   0.456869  0.476667  0.466558       300
   Phap luat   0.845878  0.786667  0.815199       300
     Van hoa   0.426667  0.106667  0.170667       300
    The thao   0.886503  0.963333  0.923323       300
  Kinh doanh   0.663957  0.816667  0.732436       300
    The gioi   0.625698  0.746667  0.680851       300
    Khoa hoc   0.684015  0.613333  0.646749       300
    Suc khoe   0.666667  0.913333  0.770745       300

    accuracy                       0.677917      2400
   macro avg   0.657032  0.677917  0.650816      2400
weighted avg   0.657032  0.677917  0.650816      2400



## Logistic Regression

In [31]:
lr_grid_search.fit(X_merge, y_merge)
show_evaluation(lr_grid_search, 'Logistic Regression')

Exception ignored in: <cyfunction RandomForestClassifier.__del__ at 0x7f5643d4b9f0>
Traceback (most recent call last):
  File "randomforestclassifier.pyx", line 317, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__
  File "randomforestclassifier.pyx", line 321, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data
  File "base.pyx", line 336, in cuml.internals.base.Base.__getattr__
AttributeError: rf_forest


[W] [10:36:58.191192] L-BFGS: max iterations reached
[W] [10:36:58.191690] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
Best model of Logistic Regression {'C': 10, 'max_iter': 100, 'penalty': 'l2'}
Accuracy of Logistic Regression in Val Set is:  0.9967376703127998
Accuracy of Logistic Regression in Test Set is:  0.7691666666666667

Classification Report of Logistic Regression in Test Set is:
               precision    recall  f1-score   support

    Doi song   0.519403  0.580000  0.548031       300
   Phap luat   0.924731  0.860000  0.891192       300
     Van hoa   0.535211  0.126667  0.204852       300
    The thao   0.960784  0.980000  0.970297       300
  Kinh doanh   0.815789  0.930000  0.869159       300
    The gioi   0.786145  0.870000  0.825949       300
    Khoa hoc   0.722222  0.823333  0.769470       300
    Suc khoe   0.750636  0.983333  0.85

## SVM

In [32]:
svm_grid_search.fit(X_merge, y_merge)
show_evaluation(svm_grid_search, 'SVM')



Best model of SVM {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Accuracy of SVM in Val Set is:  0.9990404912684705
Accuracy of SVM in Test Set is:  0.77125

Classification Report of SVM in Test Set is:
               precision    recall  f1-score   support

    Doi song   0.508333  0.610000  0.554545       300
   Phap luat   0.929630  0.836667  0.880702       300
     Van hoa   0.622951  0.126667  0.210526       300
    The thao   0.957929  0.986667  0.972085       300
  Kinh doanh   0.818713  0.933333  0.872274       300
    The gioi   0.781065  0.880000  0.827586       300
    Khoa hoc   0.739521  0.823333  0.779180       300
    Suc khoe   0.756477  0.973333  0.851312       300

    accuracy                       0.771250      2400
   macro avg   0.764327  0.771250  0.743526      2400
weighted avg   0.764327  0.771250  0.743526      2400



## Neural Network

In [45]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelBinarizer

In [50]:
lb = LabelBinarizer()
lb.fit(y_train)

y_train_oh = lb.transform(y_train)
y_val_oh = lb.transform(y_val)
y_test_oh = lb.transform(y_test)

In [129]:
nn_model = keras.Sequential([
            layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)), # input shape has to be specified on the first layer only
            layers.Dropout(0.2),
            layers.Dense(256, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(128, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(64, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(32, activation='relu'),
            layers.Dropout(0.2),
            # Output Layer (Sigmoid for binary classification)
            layers.Dense(len(class_to_number.keys()), activation='softmax')
            ])


# Compile the model
nn_model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4),
          loss='categorical_crossentropy',
          metrics=['accuracy'])


my_callbacks = [
                keras.callbacks.EarlyStopping(monitor="val_loss", patience=100,
                              mode="min", restore_best_weights=True)
                ]
nn_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [130]:
nn_model.fit(X_train, y_train_oh,
             batch_size=64, epochs=500, verbose=2, callbacks=my_callbacks,
             validation_data=(X_val, y_val_oh))

Epoch 1/500
326/326 - 11s - 35ms/step - accuracy: 0.4911 - loss: 1.4801 - val_accuracy: 0.8645 - val_loss: 0.5549
Epoch 2/500
326/326 - 1s - 3ms/step - accuracy: 0.8395 - loss: 0.5303 - val_accuracy: 0.9115 - val_loss: 0.2962
Epoch 3/500
326/326 - 1s - 3ms/step - accuracy: 0.9107 - loss: 0.3128 - val_accuracy: 0.9378 - val_loss: 0.2208
Epoch 4/500
326/326 - 1s - 3ms/step - accuracy: 0.9415 - loss: 0.2197 - val_accuracy: 0.9457 - val_loss: 0.1962
Epoch 5/500
326/326 - 1s - 3ms/step - accuracy: 0.9593 - loss: 0.1524 - val_accuracy: 0.9432 - val_loss: 0.2046
Epoch 6/500
326/326 - 1s - 3ms/step - accuracy: 0.9685 - loss: 0.1182 - val_accuracy: 0.9478 - val_loss: 0.2069
Epoch 7/500
326/326 - 1s - 3ms/step - accuracy: 0.9766 - loss: 0.0944 - val_accuracy: 0.9495 - val_loss: 0.2139
Epoch 8/500
326/326 - 1s - 3ms/step - accuracy: 0.9811 - loss: 0.0760 - val_accuracy: 0.9449 - val_loss: 0.2301
Epoch 9/500
326/326 - 1s - 3ms/step - accuracy: 0.9847 - loss: 0.0596 - val_accuracy: 0.9465 - val_los

<keras.src.callbacks.history.History at 0x7f54a8bd0310>

### Neural Nework Evaluation

In [80]:
def show_evaluation_nn(model, model_name='Neural Network'):
    # print(f'Best model of {model_name}', model.best_params_)
    
    val_pred = model.predict(X_val, verbose=0)
    test_pred = model.predict(X_test, verbose=0)

    print(f'Accuracy of {model_name} in Val Set is: ', accuracy_score(y_val,
                                                                      np.argmax(val_pred, axis=1)))
    print(f'Accuracy of {model_name} in Test Set is: ', accuracy_score(y_test,
                                                                       np.argmax(test_pred, axis=1)))

    print(f'\nClassification Report of {model_name} in Test Set is:\n',
            classification_report(y_test,  np.argmax(test_pred, axis=1), digits=6,
                                  target_names=list(class_to_number.keys())))

In [131]:
# nn_model.evaluate(X_test, y_test_oh)
show_evaluation_nn(nn_model, model_name='Neural Network')

Accuracy of Neural Network in Val Set is:  0.9456918057954328
Accuracy of Neural Network in Test Set is:  0.7820833333333334

Classification Report of Neural Network in Test Set is:
               precision    recall  f1-score   support

    Doi song   0.504249  0.593333  0.545176       300
   Phap luat   0.901754  0.856667  0.878632       300
     Van hoa   0.614679  0.223333  0.327628       300
    The thao   0.945687  0.986667  0.965742       300
  Kinh doanh   0.840625  0.896667  0.867742       300
    The gioi   0.852090  0.883333  0.867430       300
    Khoa hoc   0.811502  0.846667  0.828711       300
    Suc khoe   0.734848  0.970000  0.836207       300

    accuracy                       0.782083      2400
   macro avg   0.775679  0.782083  0.764659      2400
weighted avg   0.775679  0.782083  0.764659      2400

