In [33]:
from pathlib import Path
from mare.analysis import RequirementsPreprocessor, PreTrainedWord2VecAnalyser

In [2]:
# Locate the requirements CSV one directory above the notebook and hand it to the preprocessor.
path_to_requirements = Path('..') / 'crowdre_cleaned-csv' / 'requirements.csv'
preprocessor = RequirementsPreprocessor(path_to_requirements)
# NOTE(review): this calls an underscore-prefixed (private) method of the preprocessor;
# `preprocessor.requirements` is read by the next cell, presumably populated here - confirm.
preprocessor._preprocess_requirements()

In [3]:
# Path to the pre-trained GoogleNews word2vec binary, expected one directory above the notebook.
google_data = Path('..') / 'GoogleNews-vectors-negative300.bin'
# Wrap the preprocessed requirements in the analyser, load the pre-trained vectors,
# then build the vocabulary.
pt_word_2_vec = PreTrainedWord2VecAnalyser(preprocessor.requirements)
pt_word_2_vec.load(google_data)
pt_word_2_vec.build_vocabulary()

In [6]:
def build_requirement_vectors(re_list, threshold=100, analyser=None):
    """Replace each word of every requirement with its vector representation.

    For each requirement, its lexical words (the tokenized sentence freed from
    stopwords) are filtered down to tokens the analyser accepts, and the
    surviving tokens are looked up in the analyser's vectors. Requirements
    whose vector matrix has fewer than `threshold` rows are dropped, as are
    requirements with no usable token at all.

    Args:
        re_list: iterable of requirement objects exposing `lexical_words`,
            `cleaned_text` and `domain`.
        threshold: minimum number of rows the looked-up matrix must have for
            the requirement to be kept.
        analyser: object providing `_token_not_redundant(token)`,
            `_token_in_training_data(token)` and an indexable `vectors`.
            Defaults to the notebook-level `pt_word_2_vec`.

    Returns:
        A tuple `(requirements, sentences, domains, shortest_req, longest_req)`:
        the transposed vector matrices of the kept requirements, their cleaned
        texts, their domains, and the smallest / largest row count among them.
        When nothing is kept, `shortest_req` is `float("inf")` and
        `longest_req` is 0, so callers should check `requirements` is non-empty
        before using `shortest_req` as a size.
    """
    if analyser is None:
        # Fall back to the analyser created earlier in the notebook.
        analyser = pt_word_2_vec

    shortest_req = float("inf")
    longest_req = 0
    requirements = []
    sentences = []
    domains = []
    for requirement in re_list:
        # Materialize the tokens: a lazy `filter` object can only be consumed
        # once and cannot be checked for emptiness before the lookup.
        tokens = [
            token
            for token in requirement.lexical_words
            if analyser._token_not_redundant(token) and analyser._token_in_training_data(token)
        ]
        if not tokens:
            # No usable token: there is nothing to stack into a matrix
            # (a gensim-style lookup would fail on an empty key list).
            continue
        sentence = analyser.vectors[tokens]
        if len(sentence) >= threshold:
            shortest_req = min(len(sentence), shortest_req)
            longest_req = max(len(sentence), longest_req)
            requirements.append(sentence.transpose())
            sentences.append(requirement.cleaned_text)
            domains.append(requirement.domain)
    return requirements, sentences, domains, shortest_req, longest_req

# Keep only requirements whose vector matrix has at least 6 rows; `shortest` is used
# below as the number of PCA components.
req_vecs, sentences, domains, shortest, longest = build_requirement_vectors(pt_word_2_vec.requirements_list, threshold=6)

In [7]:
import pprint
from collections import defaultdict

def analyze_clusters(sentences, domains, clusters):
    """Pretty-print, per cluster, its size and how its members spread over domains.

    `sentences`, `domains` and `clusters` are walked in parallel (via `zip`), so
    the i-th entries of all three describe the same requirement. Nothing is
    returned; the summary is written with `pprint`.
    """
    members_by_cluster = defaultdict(list)
    for text, dom, label in zip(sentences, domains, clusters):
        members_by_cluster[label].append((text, dom))

    summary = {}
    for label, members in members_by_cluster.items():
        # Tally the domains of the requirements assigned to this cluster.
        tally = defaultdict(int)
        for _, dom in members:
            tally[dom] += 1
        summary[label] = ("Assigned reqs: {}".format(len(members)), tally)

    pprint.pprint(summary)

In [9]:
from sklearn.decomposition import PCA
import numpy as np

# Reduce every requirement matrix to `shortest` principal components. The PCA is
# re-fitted for each requirement individually (`fit_transform` per matrix).
pca = PCA(n_components=shortest)
pcaed = [pca.fit_transform(vec) for vec in req_vecs]

# Stack the reduced matrices; the unpacking below expects a 3-D array.
pcaed_arr = np.array(pcaed)
x, y, z = pcaed_arr.shape

In [12]:
from collections import defaultdict
from sklearn.cluster import KMeans

# Cluster the requirements with k-means; each requirement's reduced matrix is
# flattened into one row of length y*z.
# `precompute_distances` and `n_jobs` are intentionally not passed: both were
# deprecated in scikit-learn 0.23 and removed in 1.0, where passing them raises
# a TypeError.
kmeans = KMeans(
    n_clusters=3,
    n_init=40,
    max_iter=2000,
    random_state=0,
).fit(pcaed_arr.reshape(x, y*z))

In [13]:
# Per k-means cluster: number of assigned requirements and their domain counts.
analyze_clusters(sentences, domains, kmeans.labels_)

{0: ('Assigned reqs: 789',
     defaultdict(<class 'int'>,
                 {'Energy': 206,
                  'Entertainment': 89,
                  'Health': 168,
                  'Other': 101,
                  'Safety': 225})),
 1: ('Assigned reqs: 1397',
     defaultdict(<class 'int'>,
                 {'Energy': 297,
                  'Entertainment': 263,
                  'Health': 267,
                  'Other': 185,
                  'Safety': 385})),
 2: ('Assigned reqs: 673',
     defaultdict(<class 'int'>,
                 {'Energy': 105,
                  'Entertainment': 106,
                  'Health': 132,
                  'Other': 87,
                  'Safety': 243}))}


In [14]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into 5 clusters on the flattened (x, y*z) requirement matrices.
aggc = AgglomerativeClustering(n_clusters=5).fit(pcaed_arr.reshape(x, y*z))

In [15]:
# Per agglomerative cluster: number of assigned requirements and their domain counts.
analyze_clusters(sentences, domains, aggc.labels_)

{0: ('Assigned reqs: 934',
     defaultdict(<class 'int'>,
                 {'Energy': 123,
                  'Entertainment': 227,
                  'Health': 151,
                  'Other': 99,
                  'Safety': 334})),
 1: ('Assigned reqs: 609',
     defaultdict(<class 'int'>,
                 {'Energy': 206,
                  'Entertainment': 56,
                  'Health': 168,
                  'Other': 77,
                  'Safety': 102})),
 2: ('Assigned reqs: 85',
     defaultdict(<class 'int'>,
                 {'Entertainment': 5,
                  'Health': 32,
                  'Other': 14,
                  'Safety': 34})),
 3: ('Assigned reqs: 768',
     defaultdict(<class 'int'>,
                 {'Energy': 105,
                  'Entertainment': 134,
                  'Health': 100,
                  'Other': 97,
                  'Safety': 332})),
 4: ('Assigned reqs: 463',
     defaultdict(<class 'int'>,
                 {'Energy': 174,
                  '

In [23]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# Marker colour for each requirement domain.
DOMAIN_COLORS = {
    'Energy': "rgb(209,50,69)",
    'Entertainment': "rgb(218,135,52)",
    'Health': "rgb(4, 76, 93)",
    'Safety': "rgb(47, 155, 19)",
    'Other': "rgb(255,255,255)",
}
# One colour per kept requirement, in the same order as `domains`.
colors = [DOMAIN_COLORS[dom] for dom in domains]

def plot_tsne(data, perplexity=30.0, learning_rate=200.0, random_state=0):
    """Embed `data` in 2-D with t-SNE and show it as a scatter plot.

    Args:
        data: 2-D array-like with one row per requirement; rows must be in the
            same order as the notebook-level `colors` list used for the markers.
        perplexity: t-SNE perplexity.
        learning_rate: t-SNE learning rate.
        random_state: seed handed to t-SNE so that re-running a cell reproduces
            the same embedding; pass None for an unseeded run.

    The figure is displayed via `fig.show()`; nothing is returned.
    """
    embedding = TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=learning_rate,
        random_state=random_state,
        n_jobs=-1
    ).fit_transform(data)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=embedding[:, 0], y=embedding[:, 1], mode='markers', marker_color=colors))
    # Label the plot with its hyper-parameters so the different runs can be told apart.
    fig.update_layout(
        title="t-SNE (perplexity={}, learning_rate={})".format(perplexity, learning_rate)
    )
    fig.show()

In [22]:
# PLOT PCA
# Scatter plot of the flattened requirement matrices projected onto two
# principal components, coloured by domain.
traces = PCA(n_components=2).fit_transform(pcaed_arr.reshape(x, y*z))
fig = go.Figure(
    data=[go.Scatter(x=traces[:,0], y=traces[:,1], mode='markers', marker_color=colors)]
)
fig.show()

In [24]:
# Baseline: plot_tsne defaults (perplexity=30.0, learning_rate=200.0).
plot_tsne(pcaed_arr.reshape(x, y*z))

In [26]:
# Higher learning rate (1000), default perplexity.
plot_tsne(pcaed_arr.reshape(x, y*z), learning_rate=1000)

In [27]:
# Lower learning rate (10), default perplexity.
plot_tsne(pcaed_arr.reshape(x, y*z), learning_rate=10)

In [28]:
# Lower perplexity (5), default learning rate.
plot_tsne(pcaed_arr.reshape(x, y*z), perplexity=5)

In [29]:
# Higher perplexity (50), default learning rate.
plot_tsne(pcaed_arr.reshape(x, y*z), perplexity=50)

In [30]:
# Low perplexity (5) combined with a low learning rate (10).
plot_tsne(pcaed_arr.reshape(x, y*z), perplexity=5, learning_rate=10)

In [31]:
# High perplexity (50) combined with a high learning rate (1000).
plot_tsne(pcaed_arr.reshape(x, y*z), perplexity=50, learning_rate=1000)

In [32]:
# Intermediate setting: perplexity=35, learning_rate=400.
plot_tsne(pcaed_arr.reshape(x, y*z), perplexity=35, learning_rate=400)