In [50]:
import pandas as pd

df = pd.read_json('test/new_dataset.json')

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63622 entries, 0 to 63621
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   CategoryV7               63622 non-null  int64 
 1   remarks_text             53566 non-null  object
 2   subject_content_text     63622 non-null  object
 3   root_category_name       63622 non-null  object
 4   root_category_code       63622 non-null  int64 
 5   total_no_of_stages       63622 non-null  int64 
 6   hierarchy_order          63622 non-null  object
 7   category_hierarchy_code  63622 non-null  object
dtypes: int64(3), object(5)
memory usage: 4.4+ MB


In [52]:
def remove_first_line(text):
    lines = text.splitlines()
    new_text = '\r\n'.join(lines[1:])
    return new_text
df['subject_content_text'] = df['subject_content_text'].apply(lambda x: remove_first_line(x))

In [53]:
from sklearn.model_selection import train_test_split

X = df['subject_content_text']
y = df['root_category_code']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [59]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
model = LinearSVC(random_state=42)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))



Accuracy: 0.8485658153241651

Classification Report:
               precision    recall  f1-score   support

           1       0.97      0.96      0.96       777
          61       0.94      0.93      0.93       780
         251       0.92      0.96      0.94      1288
         353       0.91      0.96      0.93       207
         398       0.74      0.71      0.73        86
         616       0.98      0.98      0.98       241
         656       0.62      0.63      0.63       247
        1221       0.94      0.85      0.90       156
        1341       0.90      0.91      0.91       126
        1442       0.88      0.88      0.88        91
        2113       0.88      0.96      0.92       112
        2173       0.96      0.97      0.96      2045
        2426       0.72      0.73      0.73       244
        2565       0.97      0.99      0.98       427
        2570       0.70      0.77      0.74       324
        4414       0.98      0.99      0.98      1194
        4465       0.85    

In [60]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# XGBoost works with DMatrix data structure, so we need to convert our datasets
dtrain = xgb.DMatrix(X_train_tfidf, label=LabelEncoder().fit_transform(y_train))
dtest = xgb.DMatrix(X_test_tfidf, label=LabelEncoder().fit_transform(y_test))

# Set XGBoost parameters
# You might want to tune these parameters, especially 'max_depth' and 'eta'
params = {
    'objective': 'multi:softmax',  # Use softmax for multi-class classification
    'num_class': len(y_train.unique()),  # Number of unique classes
    'max_depth': 6,  # Depth of the trees in the boosting process
    'eta': 0.4,  # Learning rate
    'eval_metric': 'mlogloss',  # Evaluation metrics for validation data
    'verbosity': 1  # Verbosity of printing messages. 1 means it prints all messages
}

# Number of boosting rounds
num_boost_round = 100

# Train the model
bst = xgb.train(params, dtrain, num_boost_round, evals=[(dtest, 'test')], early_stopping_rounds=10)

# Predictions
y_pred = bst.predict(dtest)
y_pred = [round(value) for value in y_pred]  # Round predictions to the nearest integer

# Decode the predicted labels back to original class names
label_decoder = LabelEncoder().fit(y_train)
y_pred_labels = label_decoder.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(LabelEncoder().fit_transform(y_test), y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("\nClassification Report:\n", classification_report(y_test, y_pred_labels))

[0]	test-mlogloss:1.50497
[1]	test-mlogloss:2.78759
[2]	test-mlogloss:3.49065
[3]	test-mlogloss:6.46110
[4]	test-mlogloss:5.95866
[5]	test-mlogloss:7.42129
[6]	test-mlogloss:6.53571
[7]	test-mlogloss:6.40089
[8]	test-mlogloss:8.03281
[9]	test-mlogloss:6.87394
Accuracy: 60.33%

Classification Report:
               precision    recall  f1-score   support

           1       0.96      0.88      0.91       777
          61       0.77      0.73      0.75       780
         251       0.89      0.80      0.84      1288
         353       0.89      0.75      0.81       207
         398       0.33      0.28      0.30        86
         616       0.96      0.91      0.94       241
         656       0.08      0.46      0.14       247
        1221       0.68      0.81      0.74       156
        1341       0.84      0.81      0.82       126
        1442       0.57      0.56      0.57        91
        2113       0.94      0.79      0.85       112
        2173       0.94      0.78      0.85      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
