In [1]:
from pathlib import Path
from mare.analysis import RequirementsPreprocessor, PreTrainedWord2VecAnalyser

# Build the relative path to the requirements CSV.
path_to_requirements = Path('..') / 'crowdre_cleaned-csv' / 'requirements.csv'

# Create the preprocessor for that file and run its preprocessing step.
# Later cells read the processed entries from `preprocessor.requirements`.
# NOTE(review): `_preprocess_requirements` is underscore-prefixed; confirm
# whether the library offers a public entry point for this.
preprocessor = RequirementsPreprocessor(path_to_requirements)
preprocessor._preprocess_requirements()

In [2]:
import gensim.downloader as api

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

# "text8" corpus from gensim's downloader; used below as the training
# sentences for the phrase detector.
dataset = api.load("text8")

# Pre-trained Google News word2vec vectors; later cells call
# `model.wmdistance` on them.
model = api.load("word2vec-google-news-300")

# Phrase detector learned from the corpus.
phrases = Phrases(
    dataset,
    min_count=1,
    threshold=1,
)

# consider smart home a phrase
phrases.add_vocab([['smart', 'home']])

In [3]:

# Prepare reqs for word2vec

# Words that are dropped from every requirement before phrases are built.
# Stored as a frozenset so the collection is created once instead of on
# every call, and membership tests are constant-time.
REDUNDANT_WORDS = frozenset(['as', 'smart', 'home', 'owner', 'i', 'want', 'be', 'able'])


def redundancy_filter(word):
    """Return True when `word` should be kept.

    A word is kept when its lower-cased form is not one of REDUNDANT_WORDS,
    so the check is case-insensitive.
    """
    return word.lower() not in REDUNDANT_WORDS


def lex_to_filter(re):
    """Return the phrase-transformed, filtered lexical words of a requirement.

    Args:
        re: object exposing a `lexical_words` iterable of strings. The
            parameter name is kept from the original lambda for
            compatibility; it does not refer to the stdlib `re` module.

    Returns:
        The result of indexing the global `phrases` model with the list of
        words that pass `redundancy_filter`.
    """
    kept_words = list(filter(redundancy_filter, re.lexical_words))
    return phrases[kept_words]


# One filtered, phrase-transformed token sequence per requirement, in the
# same order as `preprocessor.requirements`.
lexical_phrases = [lex_to_filter(requirement) for requirement in preprocessor.requirements]

In [4]:
# Sanity check: Word Mover's Distance between the first two requirement phrases.
first_phrase = lexical_phrases[0]
second_phrase = lexical_phrases[1]
model.wmdistance(first_phrase, second_phrase)

3.2339909965537417

In [68]:
# Manual override for the phrase at index 504, which would otherwise
# result in 'inf' in the distance matrix.
# NOTE(review): the index is hard-coded; confirm it still points at the
# intended requirement whenever the input data changes.
working_phrase = [
    'keep', 'track', 'food_consumption',
    'keep', 'track', 'grocery_shopping',
]
lexical_phrases[504] = working_phrase

In [5]:
import numpy as np
from tqdm.notebook import tqdm

from multiprocessing import Pool

import os

# Number of requirement phrases; also the length of every distance row.
DIMENSION = len(lexical_phrases)
# Size of the worker pool.
POOL_SIZE = 12


def calc_distance(sentence_from):
    """Compute the Word Mover's Distance from one phrase to every phrase.

    Args:
        sentence_from: the phrase that is compared against each entry of the
            global `lexical_phrases`.

    Returns:
        A 1-D numpy array of length DIMENSION whose element i is
        `model.wmdistance(sentence_from, lexical_phrases[i])`.
    """
    row = np.zeros(DIMENSION)
    for column, sentence_to in enumerate(lexical_phrases):
        row[column] = model.wmdistance(sentence_from, sentence_to)
    return row

if __name__ == '__main__':
    # Compute the distance matrix for the requirement sentences using Word
    # Mover's Distance: row i holds the distances from phrase i to every phrase.
    distance_matrix = np.zeros((DIMENSION, DIMENSION))
    with Pool(POOL_SIZE) as p:
        # imap hands out one phrase at a time and yields the finished rows in
        # input order. Unlike mapping fixed-size batches, no worker has to wait
        # for the slowest row of a batch before it receives new work.
        rows = p.imap(calc_distance, lexical_phrases)
        # The progress bar now advances once per row instead of once per batch.
        progress = tqdm(rows, total=len(lexical_phrases), desc="Calculating distances", leave=False)
        for i, row in enumerate(progress):
            distance_matrix[i] = row

HBox(children=(FloatProgress(value=0.0, description='Calculating distances', max=248.0, style=ProgressStyle(de…



In [6]:
# Write the distance matrix to disk as ';'-separated text.
np.savetxt(
    fname="pre_trained_distance_matrix.numpy",
    X=distance_matrix,
    delimiter=';',
)

# To load a previously saved matrix instead, uncomment:
# distance_matrix = np.loadtxt("pre_trained_distance_matrix.numpy", delimiter=';')

In [74]:
from collections import defaultdict
from sklearn.cluster import KMeans

# Fit k-means with 5 clusters on the distance matrix; each row of the matrix
# is treated as the feature vector of one requirement.
# `precompute_distances` and `n_jobs` were deprecated in scikit-learn 0.23 and
# removed in 1.0, where passing them raises a TypeError. They only controlled
# how the computation was carried out, so they are omitted here.
kmeans = KMeans(
    n_clusters=5,
    n_init=40,
    max_iter=2000,
    random_state=0,  # fixed seed so the clustering is reproducible
).fit(distance_matrix)

In [75]:
import pprint
from collections import defaultdict

def analyze_clusters(sentences, domains, clusters):
    """Pretty-print the size and domain breakdown of every cluster.

    The three arguments are parallel iterables: element i of each describes
    the same requirement. Iteration stops at the shortest of them.

    Args:
        sentences: requirement texts.
        domains: domain label of each requirement.
        clusters: cluster label assigned to each requirement.

    Returns:
        None. The summary is printed as a dict mapping each cluster label to
        a tuple of ("Assigned reqs: <count>", per-domain counts).
    """
    # Collect the (sentence, domain) pairs that belong to each cluster label.
    members_by_cluster = defaultdict(list)
    for text, domain, label in zip(sentences, domains, clusters):
        members_by_cluster[label].append((text, domain))

    # For every cluster, tally how many of its members fall into each domain.
    summary = {}
    for label, members in members_by_cluster.items():
        per_domain = defaultdict(int)
        for _, domain in members:
            per_domain[domain] += 1
        summary[label] = ("Assigned reqs: {}".format(len(members)), per_domain)

    pprint.pprint(summary)

In [76]:
# Parallel lists over the requirements: cleaned text and domain label.
re_sentences = [requirement.cleaned_text for requirement in preprocessor.requirements]
re_domains = [requirement.domain for requirement in preprocessor.requirements]

In [77]:
# Print how the requirement domains are distributed over the k-means clusters.
analyze_clusters(
    sentences=re_sentences,
    domains=re_domains,
    clusters=kmeans.labels_,
)

{0: ('Assigned reqs: 840',
     defaultdict(<class 'int'>,
                 {'Energy': 157,
                  'Entertainment': 127,
                  'Health': 210,
                  'Other': 91,
                  'Safety': 255})),
 1: ('Assigned reqs: 617',
     defaultdict(<class 'int'>,
                 {'Energy': 179,
                  'Entertainment': 93,
                  'Health': 99,
                  'Other': 89,
                  'Safety': 157})),
 2: ('Assigned reqs: 475',
     defaultdict(<class 'int'>,
                 {'Energy': 128,
                  'Entertainment': 62,
                  'Health': 94,
                  'Other': 70,
                  'Safety': 121})),
 3: ('Assigned reqs: 773',
     defaultdict(<class 'int'>,
                 {'Energy': 109,
                  'Entertainment': 149,
                  'Health': 130,
                  'Other': 100,
                  'Safety': 285})),
 4: ('Assigned reqs: 261',
     defaultdict(<class 'int'>,
                

In [81]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# Marker colour used for each requirement domain in the scatter plot.
DOMAIN_COLORS = {
    'Energy': "rgb(209,50,69)",
    'Entertainment': "rgb(218,135,52)",
    'Health': "rgb(4, 76, 93)",
    'Safety': "rgb(47, 155, 19)",
    'Other': "rgb(255,255,255)",
}

# One colour per requirement, in the same order as `re_domains`.
colors = [DOMAIN_COLORS[domain] for domain in re_domains]

def plot_tsne(data, perplexity=30.0, learning_rate=200.0, random_state=0):
    """Embed `data` in 2-D with t-SNE and show it as a scatter plot.

    Points are coloured with the global `colors` list, so the rows of `data`
    must be in the same order as that list.

    Args:
        data: array passed to `TSNE.fit_transform`.
        perplexity: forwarded to `TSNE`.
        learning_rate: forwarded to `TSNE`.
        random_state: seed forwarded to `TSNE` so repeated runs draw the same
            embedding. Pass None to get a different embedding on every run,
            which was the behaviour before this parameter existed.

    Returns:
        None. The figure is displayed via `fig.show()`.
    """
    # NOTE(review): rows of `data` are treated as feature vectors. If `data` is
    # meant to be consumed as pairwise distances, TSNE's metric="precomputed"
    # may be the intended setting; confirm before changing.
    embedding = TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=learning_rate,
        random_state=random_state,
        n_jobs=-1,
    ).fit_transform(data)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=embedding[:, 0],
            y=embedding[:, 1],
            mode='markers',
            marker_color=colors,
        )
    )
    fig.show()

In [89]:
# Plot the t-SNE embedding of the distance matrix with a non-default
# perplexity and learning rate.
plot_tsne(
    distance_matrix,
    perplexity=35,
    learning_rate=350,
)