In [1]:
import pandas as pd
import numpy as np
from skmultilearn.problem_transform import LabelPowerset
from sklearn.preprocessing import MultiLabelBinarizer
import gensim
from sklearn.svm import LinearSVC
import pickle

# Pre-trained GloVe word vectors in plain-text format. The file has no
# word2vec header line (vocabulary size / dimension), hence no_header=True.
glove_vectors = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.100d.txt',binary=False, no_header=True)

# User stories; later cells read its "User Story" and "Machine Learning Task" columns.
dataset = pd.read_excel("Synthetic User Stories.xlsx")

# Keyword sheet, read without a header row so its columns are numbered.
# Later cells match on column 2 (the keyword) and read the categories from
# column 3 onwards.
labels = pd.read_excel("Keyword labelled.xlsx", header=None)
# .str.lower() leaves blank (NaN) cells as NaN, whereas calling x.lower() on
# each cell raises AttributeError as soon as one keyword cell is empty.
labels[2] = labels[2].str.lower()
labels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,classification,,abstractive summarization,Data summarization,,,,,,,...,,,,,,,,,,
1,regression,,action model learning,Other,,,,,,,...,,,,,,,,,,
2,ranking,,activation function,Representation learning,Classification,Regression,Anomaly detection,Clustering,Spatio-temporal process learning,Graph diffusion,...,,,,,,,,,,
3,matching,,active learning setting,Classification,Regression,Anomaly detection,Entity resolution,Sentiment analysis,Bias detection in word embeddings,Bias detection in language models,...,,,,,,,,,,
4,risk assessment,,adaboost,Classification,Regression,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,,,word segmentation,Machine translation,Speech recognition,,,,,,...,,,,,,,,,,
363,,,word similarity,Classification,Ranking,Matching,Clustering,Anomaly detection,Sentiment analysis,Entity resolution,...,,,,,,,,,,
364,,,word-sense disambiguation,Classification,Ranking,Matching,Clustering,Anomaly detection,Sentiment analysis,Entity resolution,...,,,,,,,,,,
365,,,word2vec,Classification,Ranking,Matching,Clustering,Anomaly detection,Sentiment analysis,Entity resolution,...,,,,,,,,,,


In [2]:
# Collect, for every keyword row, the lower-cased category names stored from
# column 3 onwards. Blank cells are read as NaN (not str) and are skipped.
# A "Categories array" column left over from an earlier run of this cell is
# dropped first so the cell can be re-run safely.
category_cells = labels.drop(columns="Categories array", errors="ignore").iloc[:, 3:]
categories_column = [
    [cell.lower() for cell in row_cells if isinstance(cell, str)]
    for row_cells in category_cells.itertuples(index=False, name=None)
]
labels["Categories array"] = categories_column

# keyword -> categories lookup, built once instead of scanning `labels` for
# every user story. setdefault keeps the FIRST row of a duplicated keyword,
# which is what the previous `.values[0]` lookup returned.
keyword_to_categories = {}
for keyword, categories in zip(labels[2], labels["Categories array"]):
    if isinstance(keyword, str):
        keyword_to_categories.setdefault(keyword, categories)

# Match every story's task (case-insensitively) against the keyword column.
tasks = dataset["Machine Learning Task"].str.lower()
unmatched = tasks[~tasks.isin(list(keyword_to_categories))]
if not unmatched.empty:
    # Fail with the offending values rather than an anonymous IndexError.
    raise ValueError(
        "No keyword row matches these 'Machine Learning Task' values: "
        f"{unmatched.unique().tolist()}"
    )

target = [keyword_to_categories[task] for task in tasks]
dataset["Target"] = target
dataset[["User Story","Target"]]

Unnamed: 0,User Story,Target
0,A group of researchers is using abstractive su...,[data summarization]
1,"As a plant scientist, I want to use abstractiv...",[data summarization]
2,"As a molecular biologist, I want to use action...",[other]
3,"As a plant scientist, I want to use action mod...",[other]
4,"As a bioinformatics researcher, I want to use ...","[representation learning, classification, regr..."
...,...,...
12396,"As a computer vision researcher, I want to use...","[classification, ranking, matching, clustering..."
12397,"As a network engineer, I want to use word2vec ...","[classification, ranking, matching, clustering..."
12398,"As a computer vision researcher, I want to use...","[classification, ranking, matching, clustering..."
12399,"As a network engineer, I want to use WordNet t...","[representation learning, clustering, matching..."


In [3]:
# Turn each story's list of categories into a binary indicator row; the
# fitted binarizer is kept so predictions can be mapped back to names.
multilabel = MultiLabelBinarizer()
target_lists = dataset['Target']
y = multilabel.fit_transform(target_lists)

# Show the indicator matrix with one named column per category.
pd.DataFrame(data=y, columns=multilabel.classes_)

Unnamed: 0,advertising,anomaly detection,bias detection in language models,bias detection in word embeddings,classification,clustering,data summarization,districting,entity resolution,graph augmentation,...,ranking,regression,representation learning,resource allocation,risk assessment,sentiment analysis,spatio-temporal process learning,speech recognition,subset selection,task assignment
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,1,1,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12396,0,1,0,0,1,1,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
12397,0,1,0,0,1,1,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
12398,0,1,0,0,1,1,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
12399,0,1,1,1,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0


In [4]:
def getTrainSetGlove(stories=None, vectors=None):
    """Embed each story as the average of the word vectors of its tokens.

    Every story is split on whitespace and each token is looked up verbatim
    (no lower-casing, no punctuation stripping); tokens missing from the
    vocabulary are skipped. A story with no known token becomes an all-zero
    vector.

    NOTE(review): if the loaded vocabulary is lower-case only (the glove.6B
    files are described as uncased -- confirm), capitalised or punctuated
    tokens are silently skipped. Any change to the tokenisation here must be
    mirrored wherever single stories are embedded for prediction.

    Parameters
    ----------
    stories : iterable of str, optional
        Texts to embed. Defaults to ``dataset['User Story']``.
    vectors : optional
        Word-vector lookup supporting ``word in vectors``, ``vectors[word]``
        and a ``vector_size`` attribute. Defaults to the notebook-level
        ``glove_vectors``.

    Returns
    -------
    pandas.DataFrame
        One row per story and one column per embedding dimension, with the
        column labels converted to strings ("0", "1", ...).
    """
    if stories is None:
        stories = dataset['User Story']
    if vectors is None:
        vectors = glove_vectors

    # Use the model's own dimensionality instead of a hard-coded 100 so the
    # zero-vector fallback stays correct if another embedding file is loaded.
    dim = vectors.vector_size

    traindata = []
    for msg in stories:
        vecs = [vectors[word] for word in msg.split() if word in vectors]
        if vecs:
            vec_avg = sum(vecs) / len(vecs)
        else:
            vec_avg = np.zeros(dim, dtype=np.float32)
        traindata.append(vec_avg)

    # Passing the columns explicitly keeps the frame `dim` columns wide even
    # when `stories` is empty.
    traindata = pd.DataFrame(traindata, columns=range(dim))
    traindata.columns = traindata.columns.astype(str)
    return traindata

In [5]:
# Averaged-GloVe feature matrix, one row per user story.
X = getTrainSetGlove()

# LinearSVC uses a pseudo-random number generator while fitting, so the seed
# is fixed to make the trained (and later pickled) model reproducible across
# Restart & Run All.
RANDOM_STATE = 42

# LabelPowerset trains the wrapped classifier on label combinations, turning
# the multi-label problem into a single multi-class one.
clf=LabelPowerset(LinearSVC(random_state=RANDOM_STATE))
clf.fit(X=X.values, y=y)


In [6]:
user_story = "A group of researchers is using abstractive summarization to identify key trends and insights in large sets of biological data, enabling more efficient analysis and interpretation."

# Embed the story the same way getTrainSetGlove embeds the training stories
# (whitespace split, verbatim lookup, plain average). The two must stay in
# sync, otherwise the classifier sees features unlike those it was fitted on.
vecs = [glove_vectors[word] for word in user_story.split() if word in glove_vectors]
if vecs:
    vec_avg = sum(vecs) / len(vecs)
else:
    # No known token: fall back to a zero vector sized from the model rather
    # than a hard-coded 100.
    vec_avg = np.zeros(glove_vectors.vector_size, dtype=np.float32)

# Single-row feature frame, with string column labels as in training.
traindata = pd.DataFrame([vec_avg])
traindata.columns = traindata.columns.astype(str)

# inverse_transform yields one tuple of label names per input row; there is
# exactly one row here.
output = list(multilabel.inverse_transform(clf.predict(traindata.values))[0])
output

['data summarization']

In [7]:
# Serialize the fitted multilabel transformer first, then the classifier,
# each to its own pickle file in the working directory.
artifacts = (
    ('multilabel.pkl', multilabel),
    ('LinearSVC_LabelPowerset.pkl', clf),
)
for filename, fitted_object in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(fitted_object, f)