In [1]:
import pandas as pd
import numpy as np
from lazypredict.Supervised import LazyClassifier
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import scipy.stats as stats
import xgboost as xgb
from xgboost import XGBClassifier
import pickle

# Load the synthetic user-story corpus from the workbook next to the notebook.
dataset = pd.read_excel("Synthetic User Stories.xlsx")

# Encode every Domain label as an integer class id in a single vectorized pass.
# pd.factorize numbers labels in order of first appearance, which is the same
# ordering as dataset["Domain"].unique(), so a predicted id can be mapped back
# to its name with dataset["Domain"].unique()[id].
# Note: a missing Domain value is encoded as -1.
target = pd.factorize(dataset["Domain"])[0]
dataset["Target"] = target
dataset

Unnamed: 0,Domain Cluster,Topic,Domain,Machine Learning Task,User Story,Target
0,Biology & Botanic,1,Biology,abstractive summarization,A group of researchers is using abstractive su...,0
1,Biology & Botanic,1,Plant Science,abstractive summarization,"As a plant scientist, I want to use abstractiv...",1
2,Biology & Botanic,1,Biology,action model learning,"As a molecular biologist, I want to use action...",0
3,Biology & Botanic,1,Plant Science,action model learning,"As a plant scientist, I want to use action mod...",1
4,Biology & Botanic,1,Biology,activation function,"As a bioinformatics researcher, I want to use ...",0
...,...,...,...,...,...,...
12396,Technical Domains,9,Computer Vision,word-sense disambiguation,"As a computer vision researcher, I want to use...",37
12397,Technical Domains,9,Computer Networks,word2vec,"As a network engineer, I want to use word2vec ...",36
12398,Technical Domains,9,Computer Vision,word2vec,"As a computer vision researcher, I want to use...",37
12399,Technical Domains,9,Computer Networks,wordnet,"As a network engineer, I want to use WordNet t...",36


In [2]:
def getTrainSetBERT(texts=None, max_length=100):
    """Encode texts as fixed-width rows of bert-base-uncased token ids.

    Args:
        texts: Iterable of strings to encode. When omitted, every entry of the
            module-level ``dataset['User Story']`` column is used.
        max_length: Number of token ids per row. Longer inputs are truncated
            and shorter ones are padded up to this width.

    Returns:
        pandas.DataFrame with one row per text and ``max_length`` columns,
        labelled "0", "1", ... as strings, holding the tokenizer's
        ``input_ids``.
    """
    if texts is None:
        texts = dataset['User Story'].tolist()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Pad to a fixed width instead of "longest in batch": the feature count
    # then never depends on which stories happen to be in the batch and always
    # matches the padding='max_length' encoding used for single-story inference.
    tokenized_data = tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # input_ids is already a list of equal-length id lists, so it can be turned
    # into a DataFrame directly.
    traindata = pd.DataFrame(tokenized_data['input_ids'])
    # Use string column labels so feature names are plain strings.
    traindata.columns = traindata.columns.astype(str)
    return traindata

In [3]:
# Labels: the integer domain id stored in the Target column.
y = dataset['Target']
# Features: swap this call to build the training set with another
# word-embedding method.
X = getTrainSetBERT()
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,101,1037,2177,1997,6950,2003,2478,10061,3512,7680,...,0,0,0,0,0,0,0,0,0,0
1,101,2004,1037,3269,7155,1010,1045,2215,2000,2224,...,0,0,0,0,0,0,0,0,0,0
2,101,2004,1037,8382,21477,1010,1045,2215,2000,2224,...,0,0,0,0,0,0,0,0,0,0
3,101,2004,1037,3269,7155,1010,1045,2215,2000,2224,...,0,0,0,0,0,0,0,0,0,0
4,101,2004,1037,16012,2378,14192,17592,10753,1010,1045,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12396,101,2004,1037,3274,4432,10753,1010,1045,2215,2000,...,0,0,0,0,0,0,0,0,0,0
12397,101,2004,1037,2897,3992,1010,1045,2215,2000,2224,...,0,0,0,0,0,0,0,0,0,0
12398,101,2004,1037,3274,4432,10753,1010,1045,2215,2000,...,0,0,0,0,0,0,0,0,0,0
12399,101,2004,1037,2897,3992,1010,1045,2215,2000,2224,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Hold out half of the stories for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=123,
)

# Fit LazyClassifier's catalogue of models on the training half and score each
# on the held-out half; `models` is the score table displayed below.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models

100%|██████████| 29/29 [01:23<00:00,  2.89s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.98,0.88,,0.98,7.1
BaggingClassifier,0.98,0.87,,0.98,0.71
DecisionTreeClassifier,0.97,0.87,,0.97,0.16
ExtraTreesClassifier,0.85,0.76,,0.85,0.79
RandomForestClassifier,0.82,0.74,,0.81,1.43
LogisticRegression,0.54,0.48,,0.53,0.62
LinearSVC,0.49,0.44,,0.46,11.24
ExtraTreeClassifier,0.47,0.42,,0.47,0.04
LinearDiscriminantAnalysis,0.46,0.41,,0.45,0.4
CalibratedClassifierCV,0.44,0.39,,0.38,42.95


In [5]:
# Distributions sampled by the randomized search. scipy's uniform(loc, scale)
# spans [loc, loc + scale] and randint(low, high) draws integers in [low, high).
param_grid = {
    'max_depth': stats.randint(3, 10),          # integers 3..9
    'learning_rate': stats.uniform(0.01, 0.1),  # 0.01..0.11
    'subsample': stats.uniform(0.5, 0.5),       # 0.5..1.0
    'n_estimators': stats.randint(50, 200)      # integers 50..199
}

# Create the XGBoost model object whose hyperparameters are tuned
xgb_model = XGBClassifier()

# Create the RandomizedSearchCV object (default n_iter: 10 sampled candidates,
# each scored by 5-fold cross-validated accuracy). The variable keeps the name
# grid_search because later cells read it. random_state pins the sampled
# candidates so re-running the notebook reproduces the same search.
grid_search = RandomizedSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    verbose=3,
    scoring='accuracy',
    random_state=123,
)

# Fit the RandomizedSearchCV object to the full data set
grid_search.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END learning_rate=0.08743744901533687, max_depth=3, n_estimators=80, subsample=0.9241937829171241;, score=0.908 total time=   9.6s
[CV 2/5] END learning_rate=0.08743744901533687, max_depth=3, n_estimators=80, subsample=0.9241937829171241;, score=0.983 total time=  10.2s
[CV 3/5] END learning_rate=0.08743744901533687, max_depth=3, n_estimators=80, subsample=0.9241937829171241;, score=0.991 total time=  10.3s
[CV 4/5] END learning_rate=0.08743744901533687, max_depth=3, n_estimators=80, subsample=0.9241937829171241;, score=0.990 total time=  10.1s
[CV 5/5] END learning_rate=0.08743744901533687, max_depth=3, n_estimators=80, subsample=0.9241937829171241;, score=0.989 total time=  10.1s
[CV 1/5] END learning_rate=0.05726722015902535, max_depth=5, n_estimators=168, subsample=0.6473869884405905;, score=0.907 total time=  24.9s
[CV 2/5] END learning_rate=0.05726722015902535, max_depth=5, n_estimators=168, subsample=0.6473869

In [11]:
# Report the winning hyperparameter combination and its mean cross-validated score
for label, value in (
    ("Best set of hyperparameters: ", grid_search.best_params_),
    ("Best score: ", grid_search.best_score_),
):
    print(label, value)

Best set of hyperparameters:  {'learning_rate': 0.08743744901533687, 'max_depth': 3, 'n_estimators': 80, 'subsample': 0.9241937829171241}
Best score:  0.9722654756796818


In [12]:
# Train the final classifier on all labelled stories with the hyperparameters
# chosen by the search. Reading them from grid_search.best_params_ (instead of
# pasting literal values from an earlier output) keeps this cell in sync with
# whatever the search selects when the notebook is re-run.
xgb_model = XGBClassifier(**grid_search.best_params_)
xgb_model.fit(X, y)

In [13]:
# Example story to classify
user_story = "As an educator, I want to use fasttext to analyze student writing and provide personalized feedback on grammar and syntax, so that students can improve their writing skills."

# Encode the story as a single row of 100 token ids (padded / truncated)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_data = tokenizer(
    [user_story],
    padding='max_length',
    max_length=100,
    truncation=True,
)

# One row per encoded story, with string column labels.
# `traindata` is reused by the reload check further down.
traindata = pd.DataFrame(list(tokenized_data['input_ids'])).rename(columns=str)

# Predict the class id and translate it back to the domain name
predict = xgb_model.predict(traindata)
dataset["Domain"].unique()[predict[0]]

'Education'

In [15]:
# Persist the trained classifier to disk as a pickle file
with open('XGBClassifier.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

In [16]:
# Round-trip check: reload the pickled classifier and classify the same story.
# NOTE: only unpickle files you created or trust; pickle.load can execute code.
with open('XGBClassifier.pkl', 'rb') as model_file:
    xgb_loaded = pickle.load(model_file)

predict = xgb_loaded.predict(traindata)
dataset["Domain"].unique()[predict[0]]

'Education'