In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
sns.set()
# Code for hiding seaborn warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [5]:
colab = False
if colab:
  train_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/training.csv').set_index('article_number')
  test_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv').set_index('article_number')
else:
  train_df = pd.read_csv('training.csv').set_index('article_number')
  test_df = pd.read_csv('test.csv').set_index('article_number')

In [6]:
train_df['words_count'] = train_df['article_words'].str.split(',').str.len()
train_df = train_df[train_df['words_count'] < train_df['words_count'].quantile(0.95)]

In [7]:
vectorizer = TfidfVectorizer(strip_accents='ascii')
le = LabelEncoder()
x_train = vectorizer.fit_transform(train_df.iloc[:,0])
y_train = le.fit_transform(train_df.iloc[:,1])
x_test = vectorizer.transform(test_df.iloc[:,0])
y_test = le.transform(test_df.iloc[:,1])
print(list(le.classes_))

['ARTS CULTURE ENTERTAINMENT', 'BIOGRAPHIES PERSONALITIES PEOPLE', 'DEFENCE', 'DOMESTIC MARKETS', 'FOREX MARKETS', 'HEALTH', 'IRRELEVANT', 'MONEY MARKETS', 'SCIENCE AND TECHNOLOGY', 'SHARE LISTINGS', 'SPORTS']


In [8]:
gbm = XGBClassifier().fit(x_train, y_train)
y_pred = gbm.predict(x_test)
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))

In [10]:
sgd = SGDClassifier()
sgd.fit(x_train,y_train)
y_pred = sgd.predict(x_test)
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.33      0.67      0.44         3
BIOGRAPHIES PERSONALITIES PEOPLE       1.00      0.20      0.33        15
                         DEFENCE       0.89      0.62      0.73        13
                DOMESTIC MARKETS       1.00      0.50      0.67         2
                   FOREX MARKETS       0.41      0.25      0.31        48
                          HEALTH       0.75      0.64      0.69        14
                      IRRELEVANT       0.86      0.90      0.88       266
                   MONEY MARKETS       0.52      0.70      0.60        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.40      0.29      0.33         7
                          SPORTS       0.95      0.97      0.96        60

                        accuracy                           0.77       500
                       macro avg    

In [11]:
xgb_model = XGBClassifier()
parameters = {
              'objective':['binary:logistic'],
              'learning_rate': [0.05, 0.1, 0.15, 0.2], #so called `eta` value
              'max_depth': [6,7,8,9,10],
              'min_child_weight': [11,12,13,15],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5,10,15], #number of trees, change it to 1000 for better results
              'random_state': [1337],
              'n_jobs': [2]
              }

In [12]:
clf = GridSearchCV(xgb_model, parameters, n_jobs=-1, cv=5, 
                   scoring='accuracy',
                   verbose=5, refit=True)
clf.fit(x_train,y_train)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed: 27.9min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed: 36.2min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 38.6min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.7],
                         'learning_rate': [0.05, 0.1, 0.15, 0.2],
                         'max_depth': [6, 7, 8, 9, 10],
                         'min_child_weight': [11, 12, 13, 15],
    

In [13]:
clf.best_estimator_

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=13, missing=nan, monotone_constraints=None,
              n_estimators=15, n_jobs=2, num_parallel_tree=1,
              objective='multi:softprob', random_state=1337, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, silent=1, subsample=0.8,
              tree_method=None, validate_parameters=False, verbosity=None)

In [20]:
gbm = XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=13, monotone_constraints=None,
              n_estimators=1000, num_parallel_tree=1,
              objective='multi:softprob', random_state=1337, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, n_jobs=12, silent=1, subsample=0.8,
              validate_parameters=False, verbosity=None).fit(x_train, y_train)
y_pred = gbm.predict(x_test)
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.50      0.67      0.57         3
BIOGRAPHIES PERSONALITIES PEOPLE       0.40      0.13      0.20        15
                         DEFENCE       0.71      0.38      0.50        13
                DOMESTIC MARKETS       0.50      0.50      0.50         2
                   FOREX MARKETS       0.54      0.31      0.39        48
                          HEALTH       0.70      0.50      0.58        14
                      IRRELEVANT       0.84      0.89      0.86       266
                   MONEY MARKETS       0.50      0.67      0.57        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.50      0.43      0.46         7
                          SPORTS       0.92      0.98      0.95        60

                        accuracy                           0.75       500
                       macro avg    

In [23]:
steps = [('over', SMOTE()), ('model', gbm)]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)