In [1]:
import xgboost
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Requires azureml-core >= 1.0.72 and azureml-dataprep[pandas] >= 1.1.34.
from azureml.core import Workspace, Dataset

# Coordinates of the Azure ML workspace that hosts the registered dataset.
# NOTE(review): SUBSCRIPTION_ID is not defined in any visible cell; it has to
# exist in the kernel namespace before this cell runs — confirm where it is set.
subscription_id = SUBSCRIPTION_ID
resource_group = 'projekt2_mbti'
workspace_name = 'AML2'

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Look up the dataset registered as 'mbti_short' and load it into pandas.
dataset = Dataset.get_by_name(workspace, name='mbti_short')
df = dataset.to_pandas_dataframe()

In [3]:
# Keep only the text column and the label column, then preview the first rows.
df = df.loc[:, ['post', 'type']]
df.head(5)

Unnamed: 0,post,type
0,welcome welcome welcome!! ^^ your type rocks.....,ENFJ
1,Just realized I completely missed a letter a t...,ENFJ
2,Enough to fit a suitcase full of drugs.,ENFJ
3,Thanks! I've been working on opening up a bit ...,ENFJ
4,Ok...so im fully aware of my power of persuasi...,ENFJ


In [4]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
posts = df['post']
labels = df['type']
X_train, X_test, y_train, y_test = train_test_split(
    posts,
    labels,
    test_size=0.2,
    random_state=77,
)


In [5]:
# TF-IDF settings: unigrams only, English stop words removed, and terms that
# occur in fewer than 3 documents discarded.
tfidf_settings = {
    'stop_words': 'english',
    'ngram_range': (1, 1),
    'min_df': 3,
}
vectorizer = TfidfVectorizer(**tfidf_settings)


In [6]:
# Fit the TF-IDF vocabulary on the training posts only, then reuse that fitted
# vocabulary to transform the held-out posts (no refitting on test data).
# Every entry is coerced to str first so non-string values (e.g. NaN) do not
# break the tokenizer.
X_train_vectorized = vectorizer.fit_transform(X_train.map(str))
X_test_vectorized = vectorizer.transform(X_test.map(str))

# Number of terms kept in the vocabulary (= number of feature columns).
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated
# in scikit-learn 1.0 and removed in 1.2; `vocabulary_` exists in all versions.
len(vectorizer.vocabulary_)

11317

In [7]:
# Train a gradient-boosted classifier on the TF-IDF features and predict the
# labels of the held-out posts.
# NOTE(review): y_train holds the raw string labels from df['type']; some
# xgboost releases require integer-encoded class labels — confirm against the
# installed version.
# NOTE(review): eval_metric="logloss" is the binary metric name; presumably
# "mlogloss" is intended for a multi-class target — verify.
clf = xgboost.XGBClassifier(eval_metric="logloss")
clf.fit(X_train_vectorized, y_train)
y_pred = clf.predict(X_test_vectorized)

# Persist the fitted vectorizer and model. The `with` blocks guarantee the
# files are flushed and closed even if pickling raises; the original passed a
# bare open(...) to pickle.dump and never closed the handles.
with open("vectorizer_xgb_mbti.sav", 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

with open("model_XGB.sav", 'wb') as model_file:
    pickle.dump(clf, model_file)