# Multilabel XGBClassifier Analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Rebinds the name `stopwords` from the imported NLTK corpus reader to a plain
# set of English stop words; after this line the corpus reader is no longer
# reachable under that name. The TF-IDF vectorizer cell reads this set.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand -- confirm for fresh environments.
stopwords = set(stopwords.words('english'))

In [2]:
# Read the raw superheroes dataset into a DataFrame; the relative path is
# resolved against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='superheroes_nlp_dataset.csv')

In [3]:
# Build the modelling frame in one chain, without mutating anything in place:
#   1. keep only the identity, text and label columns,
#   2. drop every row that has a missing value in any of them
#      (original note: reduces df by about 46%; 667 rows),
#   3. add `hist`: the history text lower-cased with all digit runs removed.
# `hist` is derived from the already-filtered rows rather than from `df`, so the
# result no longer depends on implicit index alignment between two frames.
data = (
    df.loc[:, ['name', 'real_name', 'full_name', 'history_text', 'powers_text', 'creator']]
    .dropna()
    .assign(
        hist=lambda kept: kept['history_text'].str.lower().replace(r'\d+', '', regex=True)
    )
)

# get features and labels
X = data['hist'].to_numpy()     # cleaned history text, one entry per kept row
y = data['creator'].to_numpy()  # creator label for the same rows, same order

In [5]:
# Turn every history text into a TF-IDF weighted bag of unigrams and bigrams.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so its vocabulary and IDF weights also see the test documents. Fit on
# the training rows only if a strictly held-out estimate is required.
vectorizer = TfidfVectorizer(
    analyzer='word',
    # scikit-learn >= 1.2 validates this parameter and rejects a set, so pass a
    # list; sorting keeps the argument identical from run to run.
    stop_words=sorted(stopwords),
    # NOTE(review): scikit-learn calls the preprocessor once per document, so
    # `lemmatize` receives the whole history text as if it were a single word
    # and most likely returns it unchanged -- confirm. Per-token lemmatization
    # would need a custom tokenizer instead. Kept as-is to preserve the features.
    preprocessor=WordNetLemmatizer().lemmatize,
    ngram_range=(1, 2)
)
history_tfidf = vectorizer.fit_transform(X)

# Encode the creator labels: string -> integer id -> one-hot row.
le = LabelEncoder()
# The default encoder returns a sparse matrix in every scikit-learn release;
# densifying it below avoids the `sparse=` keyword (removed in 1.4) and its
# replacement `sparse_output=` (added in 1.2), so this runs on old and new
# versions alike. Note that `oe.transform` itself therefore returns sparse output.
oe = OneHotEncoder()

integer_y = le.fit_transform(y).reshape(-1, 1)    # column vector, one id per row
onehot_y = oe.fit_transform(integer_y).toarray()  # dense (n_rows, n_creators) array

# To map a one-hot row back to its creator name:
#     le.inverse_transform([np.argmax(onehot_y[0])])

# Create the train and test sets. Splitting features and labels in a single
# call guarantees that both are cut on exactly the same rows, instead of
# relying on two separate calls that happen to share a seed.
X_train, X_test, y_train, y_test = train_test_split(
    history_tfidf,
    onehot_y,
    test_size=0.20,
    random_state=42069
)



In [None]:
# Wrap an XGBoost classifier (200 boosting rounds) in a multi-output wrapper so
# that the 2-D target can be fitted.
multilabel_xgbc = MultiOutputClassifier(
    estimator=XGBClassifier(
        n_jobs=-1,
        n_estimators=200,
    )
)

# Train on the training split; the value returned by `fit` is kept under its
# own name and is what the evaluation cell uses.
multilabel_xgbc_fitted = multilabel_xgbc.fit(X_train, y_train)

In [7]:
# Score the fitted model on the held-out split and report it as a percentage.
# NOTE(review): y_test is the 2-D one-hot target, so accuracy_score presumably
# counts a row as correct only when every column matches -- confirm.
test_predictions = multilabel_xgbc_fitted.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print('Accuracy on test data: {:.1f}%'.format(test_accuracy * 100))

Accuracy on test data: 73.9%
