In [2]:
from pathlib import Path
from mare.analysis import RequirementsPreprocessor, PreTrainedWord2VecAnalyser

# Requirements CSV, addressed relative to the current working directory.
path_to_requirements = Path('..') / 'crowdre_cleaned-csv' / 'requirements.csv'

preprocessor = RequirementsPreprocessor(path_to_requirements)
# NOTE(review): this invokes an underscore-prefixed (private) method of the
# project class; presumably it populates `preprocessor.requirements`, which
# later cells read -- confirm whether a public entry point exists.
preprocessor._preprocess_requirements()

In [3]:
# Prepare reqs for word2vec

# Lower-cased stems that are removed from every requirement
# (presumably the shared user-story preamble -- confirm against the data).
REDUNDANT_STEMS = frozenset({'as', 'smart', 'home', 'owner', 'i', 'want', 'be', 'able'})


def redundancy_filter(stem):
    """Return True when `stem` should be kept.

    A stem is kept when its lower-cased form is not in REDUNDANT_STEMS,
    so the comparison is case-insensitive.
    """
    return stem.lower() not in REDUNDANT_STEMS


def stem_to_filter(re):
    """Return the stems of one requirement with the redundant ones removed.

    `re` is any object exposing an iterable `stems` attribute of strings.
    Order and duplicates of the remaining stems are preserved. The parameter
    keeps its original name for backward compatibility; it shadows the
    standard `re` module only inside this function.
    """
    return [stem for stem in re.stems if redundancy_filter(stem)]

# One filtered stem list per requirement, in the order of preprocessor.requirements.
stemmed_sentences = [
    stem_to_filter(requirement)
    for requirement in preprocessor.requirements
]

In [4]:
# Train the Word2Vec embedding on the filtered requirement stems; its word
# vectors are what the word mover's distance calls in later cells rely on.

import gensim

# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size`, so this call is tied to the installed gensim version.
model = gensim.models.Word2Vec(
    stemmed_sentences,
    min_count=5,  # per gensim docs: ignore words occurring fewer than 5 times
    size=50,      # per gensim docs: dimensionality of the word vectors
    sg=1,         # per gensim docs: 1 selects skip-gram (otherwise CBOW)
)

In [5]:
# Spot check: word mover's distance between the requirements at index 1 and 2.
sentence_a, sentence_b = stemmed_sentences[1], stemmed_sentences[2]
model.wv.wmdistance(sentence_a, sentence_b)

0.24543347724184175

In [None]:
import numpy as np
from tqdm.notebook import tqdm

from multiprocessing import Pool

import os

# Number of requirement sentences; used as the length of each distance row
# and as the side length of the square distance matrix.
DIMENSION = len(stemmed_sentences)
# Number of worker processes handed to multiprocessing.Pool.
POOL_SIZE = 12

def calc_distance(sentence_from):
    """Return one row of the distance matrix for `sentence_from`.

    The result is a float array of length DIMENSION whose k-th entry is the
    word mover's distance between `sentence_from` and `stemmed_sentences[k]`.
    Reads the notebook-level `model`, `stemmed_sentences` and `DIMENSION`.
    """
    distances = (
        model.wv.wmdistance(sentence_from, sentence_to)
        for sentence_to in stemmed_sentences
    )
    return np.fromiter(distances, dtype=np.float64, count=DIMENSION)

if __name__ == '__main__':
    # Row r of the matrix holds the distances from sentence r to every sentence.
    distance_matrix = np.zeros((DIMENSION, DIMENSION))
    # NOTE(review): under the 'spawn' start method (default on Windows and
    # macOS) worker processes must be able to import the mapped function;
    # functions defined in a notebook cell may not qualify -- verify on the
    # target platform, or move calc_distance into an importable module.
    with Pool(POOL_SIZE) as p:
        # A single imap call keeps every worker busy: the next sentence is
        # dispatched as soon as a worker is free, instead of waiting for the
        # slowest sentence of a fixed-size batch. Rows are yielded in input
        # order, so the enumerate index is the matrix row.
        rows = p.imap(calc_distance, stemmed_sentences)
        progress = tqdm(rows, total=len(stemmed_sentences), desc="Calculating distances", leave=False)
        for row_index, row in enumerate(progress):
            distance_matrix[row_index] = row


HBox(children=(FloatProgress(value=0.0, description='Calculating distances', max=248.0, style=ProgressStyle(de…