In [1]:
from pathlib import Path
from mare.analysis import RequirementsPreprocessor, PreTrainedWord2VecAnalyser

In [2]:
# Requirements CSV, addressed relative to the notebook's working directory.
path_to_requirements = Path('..') / 'crowdre_cleaned-csv' / 'requirements.csv'

# NOTE(review): this invokes an underscore-prefixed method of the preprocessor;
# presumably it fills `preprocessor.requirements`, which later cells read -
# confirm against the `mare.analysis` API.
preprocessor = RequirementsPreprocessor(path_to_requirements)
preprocessor._preprocess_requirements()

In [3]:
# Pre-trained word2vec binary, expected one directory above the notebook.
google_data = Path('..') / 'GoogleNews-vectors-negative300.bin'

# Wrap the preprocessed requirements in the analyser, load the pre-trained
# vectors from disk, then build the analyser's vocabulary.
pt_word_2_vec = PreTrainedWord2VecAnalyser(preprocessor.requirements)
pt_word_2_vec.load(google_data)
pt_word_2_vec.build_vocabulary()

In [4]:
def build_requirement_vectors(re_list, threshold=100, analyser=None):
    """Replace each word of the given requirements with its vector representation.

    Args:
        re_list: Iterable of requirement objects exposing `lexical_words`,
            `cleaned_text` and `domain`.
        threshold: Minimum number of word vectors a requirement must yield to
            be kept.
        analyser: Object providing `_token_not_redundant`,
            `_token_in_training_data` and `vectors`. Defaults to the
            notebook-level `pt_word_2_vec`, looked up at call time.

    Returns:
        A tuple `(requirements, sentences, domains, shortest_req, longest_req)`:
        the transposed vector matrix, cleaned text and domain of every kept
        requirement, plus the smallest and largest vector count among them.
        If nothing is kept, `shortest_req` stays `float("inf")` and
        `longest_req` stays 0.
    """
    if analyser is None:
        analyser = pt_word_2_vec

    shortest_req = float("inf")
    longest_req = 0
    requirements = []
    sentences = []
    domains = []
    for requirement in re_list:
        # the lexical words are the tokenized sentences which were freed from stopwords
        tokens = [
            token
            for token in requirement.lexical_words
            if analyser._token_not_redundant(token) and analyser._token_in_training_data(token)
        ]
        if not tokens:
            # Nothing survived filtering. Skip instead of asking the vector
            # store for an empty set of keys, which stacking-based lookups
            # reject with an error.
            continue
        sentence = analyser.vectors[tokens]
        n_vectors = len(sentence)
        if n_vectors < threshold:
            continue
        shortest_req = min(n_vectors, shortest_req)
        longest_req = max(n_vectors, longest_req)
        requirements.append(sentence.transpose())
        sentences.append(requirement.cleaned_text)
        domains.append(requirement.domain)
    return requirements, sentences, domains, shortest_req, longest_req

# Keep only requirements that yield at least six word vectors.
(req_vecs, sentences, domains, shortest, longest) = build_requirement_vectors(
    pt_word_2_vec.requirements_list,
    threshold=6,
)

In [5]:
import pprint
from collections import defaultdict

def analyze_clusters(sentences, domains, clusters):
    """Pretty-print, per cluster, its size and how many members each domain has.

    The three arguments are iterated in parallel: the i-th sentence has the
    i-th domain and was assigned the i-th cluster label. Nothing is returned;
    the summary is written with `pprint`.
    """
    members_by_cluster = defaultdict(list)
    for text, domain, label in zip(sentences, domains, clusters):
        members_by_cluster[label].append((text, domain))

    summary = {}
    for label, members in members_by_cluster.items():
        # Count domains in order of first appearance within the cluster.
        per_domain = defaultdict(int)
        for _text, domain in members:
            per_domain[domain] += 1
        summary[label] = ("Assigned reqs: {}".format(len(members)), per_domain)

    pprint.pprint(summary)

In [6]:
from sklearn.decomposition import PCA
import numpy as np

# The same PCA object is re-fitted on every requirement matrix, reducing each
# one to `shortest` components.
pca = PCA(n_components=shortest)
pcaed = [pca.fit_transform(vec) for vec in req_vecs]

# Stack the per-requirement results; the array is unpacked as three-dimensional.
pcaed_arr = np.array(pcaed)
x, y, z = pcaed_arr.shape

In [7]:
from collections import defaultdict
from sklearn.cluster import KMeans

# Cluster the requirements into three groups, one flattened (y*z) row each.
# `precompute_distances` and `n_jobs` are deliberately not passed: scikit-learn
# deprecated both KMeans arguments in 0.23 and removed them in 1.0, where
# passing them raises a TypeError. They only affected how the computation was
# carried out, not the clustering itself.
kmeans = KMeans(
    n_clusters=3,
    n_init=40,
    max_iter=2000,
    random_state=0,
).fit(pcaed_arr.reshape(x, y*z))

In [8]:
# Cluster sizes and per-domain counts for the k-means labelling.
analyze_clusters(
    sentences=sentences,
    domains=domains,
    clusters=kmeans.labels_,
)

{0: ('Assigned reqs: 789',
     defaultdict(<class 'int'>,
                 {'Energy': 206,
                  'Entertainment': 89,
                  'Health': 168,
                  'Other': 101,
                  'Safety': 225})),
 1: ('Assigned reqs: 1397',
     defaultdict(<class 'int'>,
                 {'Energy': 297,
                  'Entertainment': 263,
                  'Health': 267,
                  'Other': 185,
                  'Safety': 385})),
 2: ('Assigned reqs: 673',
     defaultdict(<class 'int'>,
                 {'Energy': 105,
                  'Entertainment': 106,
                  'Health': 132,
                  'Other': 87,
                  'Safety': 243}))}


In [9]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into five groups on the same flattened matrix
# shape that the k-means cell uses.
aggc = AgglomerativeClustering(n_clusters=5).fit(
    pcaed_arr.reshape(x, y*z)
)

In [10]:
# Cluster sizes and per-domain counts for the agglomerative labelling.
analyze_clusters(
    sentences=sentences,
    domains=domains,
    clusters=aggc.labels_,
)

{0: ('Assigned reqs: 934',
     defaultdict(<class 'int'>,
                 {'Energy': 123,
                  'Entertainment': 227,
                  'Health': 151,
                  'Other': 99,
                  'Safety': 334})),
 1: ('Assigned reqs: 609',
     defaultdict(<class 'int'>,
                 {'Energy': 206,
                  'Entertainment': 56,
                  'Health': 168,
                  'Other': 77,
                  'Safety': 102})),
 2: ('Assigned reqs: 85',
     defaultdict(<class 'int'>,
                 {'Entertainment': 5,
                  'Health': 32,
                  'Other': 14,
                  'Safety': 34})),
 3: ('Assigned reqs: 768',
     defaultdict(<class 'int'>,
                 {'Energy': 105,
                  'Entertainment': 134,
                  'Health': 100,
                  'Other': 97,
                  'Safety': 332})),
 4: ('Assigned reqs: 463',
     defaultdict(<class 'int'>,
                 {'Energy': 174,
                  '

In [42]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# Plot palette (CSS rgba strings).
CHERRY = "rgba(137,28,86,.9)"
TEAL = "rgba(57,117,121,.9)"
ORANGE = "rgba(212,129,59,.9)"
PURPLE = "rgba(136,104,156,.9)"
SAND = "rgba(186,171,155,.9)"

# Marker colour used for each requirement domain.
DOMAIN_COLORS = {
    'Energy': TEAL,
    'Entertainment': SAND,
    'Health': PURPLE,
    'Safety': CHERRY,
    'Other': ORANGE,
}

# One colour per entry of `domains`.
colors = [DOMAIN_COLORS[dom] for dom in domains]

# NOTE(review): this rebinds `sentences` to the text of every entry in
# `preprocessor.requirements`, while `colors` follows `domains`. If `domains`
# is the thresholded output of build_requirement_vectors, the two lists can
# differ in length and hover texts would not line up with points - confirm.
sentences = list(map(lambda requirement: requirement.cleaned_text, preprocessor.requirements))

def plot_tsne(data, perplexity=30.0, learning_rate=200.0, random_state=None):
    """Embed `data` in two dimensions with t-SNE, show the scatter plot, return it.

    Args:
        data: Samples to embed, one row per sample.
        perplexity: Forwarded to `TSNE`.
        learning_rate: Forwarded to `TSNE`.
        random_state: Forwarded to `TSNE`; pass an integer for a reproducible
            embedding. The default `None` keeps the previous unseeded behaviour.

    Returns:
        The plotly figure, which has already been displayed.

    Marker colours and hover texts come from the notebook-level `colors` and
    `sentences`, which are expected to be ordered like the rows of `data`.
    """
    embedding = TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=learning_rate,
        n_jobs=-1,
        random_state=random_state,
    ).fit_transform(data)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=embedding[:, 0],
            y=embedding[:, 1],
            mode='markers',
            marker_color=colors,
            text=sentences,
        )
    )
    fig.show()
    return fig

In [56]:
# Plot PCA: reduce the flattened requirement matrices to two components and
# scatter them, coloured by `colors` and labelled with `sentences`.
traces = PCA(n_components=2).fit_transform(pcaed_arr.reshape(x, y*z))

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=traces[:, 0],
        y=traces[:, 1],
        mode='markers',
        marker_color=colors,
        text=sentences,
    )
)
fig.show()

In [43]:
# t-SNE view of the same flattened matrices, with non-default settings.
fig = plot_tsne(
    data=pcaed_arr.reshape(x, y*z),
    perplexity=35,
    learning_rate=400,
)

In [55]:
# Display the coordinates currently bound to `traces`.
# NOTE(review): which cell last assigned `traces` depends on execution order;
# the recorded execution counts are not sequential - confirm on a fresh run.
traces

array([[ 2.4780793 , -0.1628017 ],
       [ 2.479658  , -2.1620262 ],
       [-2.180948  , -0.2915679 ],
       ...,
       [-2.771173  , -0.0055225 ],
       [-3.2941058 ,  0.48186374],
       [-2.5209076 , -0.67932594]], dtype=float32)

In [58]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go


# Plot palette (CSS rgba strings). These rebind the same names and values
# assigned by the earlier t-SNE plotting cell.
CHERRY = "rgba(137,28,86,.9)"
TEAL = "rgba(57,117,121,.9)"
ORANGE = "rgba(212,129,59,.9)"
PURPLE = "rgba(136,104,156,.9)"
SAND = "rgba(186,171,155,.9)"

# Marker colour per requirement domain.
# NOTE(review): the keys '6' and '7' match none of the five domain names that
# appear in the clustering output - presumably spare colours; confirm usage.
DOMAIN_COLORS = {
    'Energy': TEAL,
    'Entertainment': SAND,
    'Health': PURPLE,
    'Safety': CHERRY,
    'Other': ORANGE,
    '6': "rgb(74,208,74)",
    '7': "rgb(155,216,153)",
}

# DOMAIN_COLORS = {
#     'Energy': "rgb(236,231,121)",
#     'Entertainment': "rgb(234,153,153)",
#     'Health': "rgb(34,92,123)",
#     'Safety': "rgb(133,99,153)",
#     'Other': "rgb(129,206,168)",
# }

# DOMAIN_COLORS = {
#     'Energy': "rgb(218,135,52)",
#     'Entertainment': "rgb(234,153,153)",
#     'Health': "rgb(34,92,123)",
#     'Safety': "rgb(133,99,153)",
#     'Other': "rgb(155,216,153)",
# }
# DOMAIN_COLORS = {
#     'Energy': "rgb(209,50,69)",
#     'Entertainment': "rgb(218,135,52)",
#     'Health': "rgb(4, 76, 93)",
#     'Safety': "rgb(47, 155, 19)",
#     'Other': "rgb(255,255,255)",
# }
# Text, domain and marker colour for every entry of preprocessor.requirements,
# all three in the same order. Note that `colors` is rebound here.
re_sentences = [requirement.cleaned_text for requirement in preprocessor.requirements]
re_domains = [requirement.domain for requirement in preprocessor.requirements]
colors = [DOMAIN_COLORS[domain_name] for domain_name in re_domains]

def tsne(data, perplexity=30.0, learning_rate=200.0, random_state=None):
    """Return the two-dimensional t-SNE embedding of `data`.

    Args:
        data: Samples to embed, one row per sample.
        perplexity: Forwarded to `TSNE`.
        learning_rate: Forwarded to `TSNE`.
        random_state: Forwarded to `TSNE`; pass an integer for a reproducible
            embedding. The default `None` keeps the previous unseeded behaviour.

    Returns:
        Whatever `TSNE.fit_transform` returns for `data`.
    """
    return TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=learning_rate,
        n_jobs=-1,
        random_state=random_state,
    ).fit_transform(data)
    
    
def plot(traces):
    """Show a scatter plot of the first two columns of `traces`.

    Marker colours come from the notebook-level `colors`; the hover text is
    the `preprocessor.requirements` sequence itself.
    """
    # NOTE(review): the text is the requirement objects, not `re_sentences` -
    # confirm that this is intended.
    scatter = go.Scatter(
        x=traces[:, 0],
        y=traces[:, 1],
        mode='markers',
        marker_color=colors,
        text=preprocessor.requirements,
    )
    fig = go.Figure()
    fig.add_trace(scatter)
    fig.show()
    
class Plotter:
    """Collects marker scatter traces on a single plotly figure."""

    def __init__(self):
        self.figure = go.Figure()

    def add_trace(self, traces, texts, colors, legend):
        """Add one marker trace to the figure.

        Args:
            traces: Two-column array; column 0 is plotted as x, column 1 as y.
            texts: Hover text, one entry per point.
            colors: Marker colour, one entry per point.
            legend: Name of the trace.
        """
        self.figure.add_trace(
            go.Scatter(
                x=traces[:, 0],
                y=traces[:, 1],
                mode='markers',
                marker_color=colors,
                text=texts,
                name=legend,
            )
        )

    def add_traces_by_domain(self, traces, sentences, colors, domains, filter_domain):
        """Add a trace holding only the points whose domain equals `filter_domain`.

        The four sequences are iterated in parallel and must be ordered alike.
        If no entry matches, nothing is added; previously this case raised an
        IndexError while unpacking the empty selection.
        """
        matching = [
            (point, sentence, color)
            for point, sentence, color, domain in zip(traces, sentences, colors, domains)
            if domain == filter_domain
        ]
        if not matching:
            return
        points, texts, marker_colors = zip(*matching)
        self.add_trace(np.array(points), texts, marker_colors, filter_domain)

    def show(self):
        """Display the figure."""
        self.figure.show()

In [59]:
#traces = tsne(distance_matrix)
# Colour traces by domains
# One trace per domain, added in this fixed order.
# NOTE(review): assumes `traces` has one row per entry of
# preprocessor.requirements, in the same order as `colors` and `re_domains` -
# confirm.
p = Plotter()
for domain_name in ("Health", "Entertainment", "Energy", "Safety", "Other"):
    p.add_traces_by_domain(traces, preprocessor.requirements, colors, re_domains, domain_name)
p.show()

In [61]:
# Substrings identifying the two example requirements to locate.
needles = (
    "I want music to play whenever I am in the kitchen so that I can be entertained",
    "when I get home so that it will help me relax",
)
for i, s in enumerate(sentences):
    # A sentence containing both substrings is printed once per match.
    for needle in needles:
        if needle in s:
            print(i, s)

1181 As a home occupant I want music to be played when I get home so that it will help me relax
1213 As a home owner I want music to play whenever I am in the kitchen so that I can be entertained while cooking or cleaning


In [62]:
# Coordinates at the two indices reported by the sentence search.
print(*(traces[row] for row in (1181, 1213)))

[3.894778  1.4844187] [-2.5041924 -1.3534437]


In [126]:
def color_word(w):
    """Pick the plot colour for word `w` and print the choice.

    SAND if `w` occurs in both example sentences `d1` and `d2` (split on
    single spaces), CHERRY if it occurs in `d1` only, TEAL otherwise.
    """
    in_first = w in d1.split(" ")
    if in_first and w in d2.split(" "):
        label, color = "SAND", SAND
    elif in_first:
        label, color = "CHERRY", CHERRY
    else:
        label, color = "TEAL", TEAL
    print(w, label)
    return color

# The two example requirements, addressed by the indices that the sentence
# search printed.
d1 = sentences[1181]
d2 = sentences[1213]

# Words of each sentence that are present in the analyser's vocabulary.
df1 = [w for w in d1.split(" ") if w in pt_word_2_vec.vocabulary]
df2 = [w for w in d2.split(" ") if w in pt_word_2_vec.vocabulary]

# Vector of every kept word.
d1_vec = [pt_word_2_vec.vectors[w] for w in df1]
d2_vec = [pt_word_2_vec.vectors[w] for w in df2]

# Colour of every kept word; color_word prints each decision as it is made.
d1_colors = list(map(color_word, df1))
d2_colors = list(map(color_word, df2))

occupant CHERRY
music SAND
played CHERRY
get CHERRY
help CHERRY
relax CHERRY
music SAND
play TEAL
whenever TEAL
kitchen TEAL
entertained TEAL
cooking TEAL
cleaning TEAL


In [127]:
# Embed the words of both example sentences together so they share one space.
# The vectors are stacked into an array first, and the perplexity is capped
# below the number of words: only a handful of vectors are embedded here, and
# scikit-learn requires perplexity < n_samples (recent releases raise
# otherwise), which the default of 30 does not satisfy.
word_matrix = np.array(d1_vec + d2_vec)
t12 = tsne(word_matrix, perplexity=min(30.0, len(word_matrix) - 1))

# Split the joint embedding back into the two sentences.
t1 = list(t12[:len(df1)])
t2 = list(t12[len(df1):])

In [130]:
# One trace per example sentence; each sentence's text is the legend entry.
pl = Plotter()
pl.add_trace(np.array(t1), df1, d1_colors, d1)
# Pass the full colour list: `d2_colors` has one entry per word of `df2`,
# whereas the previous `d2_colors[1:]` was one entry short and shifted every
# colour onto the wrong word.
pl.add_trace(np.array(t2), df2, d2_colors, d2)

In [131]:
# Render the figure holding the two example-sentence traces.
pl.show()

In [132]:
# Export the figure held by `pl` to an SVG file; the path is relative to the
# current working directory.
# NOTE(review): presumably the `../figures` directory must already exist -
# confirm.
pl.figure.write_image("../figures/example_sentences.svg")