In [1]:
%load_ext autoreload
%autoreload 2

# Word2Vec Clustering (self- and pre-trained)

## RE Preparation

In [2]:
from pathlib import Path
from mare.analysis import RequirementsPreprocessor, PreTrainedWord2VecAnalyzer, SelfTrainedWord2VecAnalyzer

In [3]:
# Location of the cleaned requirements CSV, relative to the notebook's working directory.
path_to_requirements = Path('..') / 'crowdre_cleaned-csv' / 'requirements.csv'

preprocessor = RequirementsPreprocessor(path_to_requirements)
# NOTE(review): this calls a private (underscore-prefixed) method directly;
# switch to a public entry point if RequirementsPreprocessor offers one.
preprocessor._preprocess_requirements()

## Word2Vec

### Pre-Trained

In [96]:
# Pre-trained embedding archive, expected one directory above the notebook.
google_data = Path('..') / 'word2vec-google-news-300.gz'

# Analyzer over the preprocessed requirements, backed by the pre-trained vectors.
pt_word_2_vec = PreTrainedWord2VecAnalyzer(preprocessor.requirements)
pt_word_2_vec.load(google_data)

In [97]:
# Build the requirement vectors from the loaded model and keep the reduced
# array; it feeds the t-SNE and PCA-comparison cells below.
# NOTE(review): the `pcaed_` prefix suggests reduce_dimensions() applies PCA — confirm in mare.analysis.
pt_word_2_vec.build_requirement_vectors()
pcaed_arr_pt = pt_word_2_vec.reduce_dimensions()

In [102]:
# The reduced array is 3-D; collapse its last two axes so every entry along
# the first axis becomes a single flat row before computing the t-SNE traces.
# x, y, z stay bound at notebook level and are read again by later cells.
x, y, z = pcaed_arr_pt.shape
pt_word_2_vec.tsne_traces(pcaed_arr_pt.reshape(x, y*z))

In [None]:
# pt_word_2_vec.save_traces("numpy_results/tsne_traces_pretrained_word2vec_2020-05-29.numpy")

### Self-trained

In [27]:
# Analyzer whose embedding is trained on the requirements themselves.
# NOTE(review): min_occurrences=5 is presumably the minimum token frequency
# for a word to enter the vocabulary — confirm in mare.analysis.
st_word_2_vec = SelfTrainedWord2VecAnalyzer(preprocessor.requirements)
st_word_2_vec.train(min_occurrences=5)

In [28]:
# Build the requirement vectors with the self-trained model and keep the
# reduced array for the t-SNE and PCA-comparison cells below.
# The recorded output reports one sentence that could not be embedded.
# NOTE(review): force_overwrite=True presumably rebuilds vectors that already exist — confirm.
st_word_2_vec.build_requirement_vectors(force_overwrite=True)
pcaed_arr_st = st_word_2_vec.reduce_dimensions()

Sentence cannot be embedded:
	 As a aspiring baker I want to bake so that my baked goods are considered delicious ['As', 'aspiring', 'baker', 'I', 'want', 'bake', 'baked', 'goods', 'considered', 'delicious']


In [29]:
# Same flattening as for the pre-trained model: collapse the last two axes of
# the 3-D reduced array, then compute the t-SNE traces.
# This rebinds the notebook-level x, y, z to the self-trained array's shape.
x, y, z = pcaed_arr_st.shape
st_word_2_vec.tsne_traces(pcaed_arr_st.reshape(x, y*z))

In [31]:
# Persist the self-trained t-SNE traces; the path is relative to the
# notebook's working directory.
st_word_2_vec.save_traces("numpy_results/tsne_traces_selftrained_word2vec_2020-05-29_minocc_5.numpy")

## Plotting

In [9]:
import plotly.graph_objects as go

# Shared colour palette. It lives at notebook level rather than inside
# simple_plot() because color_word() in the "Presentation" section reads
# CHERRY / TEAL / SAND as globals and would otherwise raise NameError.
CHERRY = "rgba(137,28,86,.9)"
TEAL = "rgba(57,117,121,.9)"
ORANGE = "rgba(212,129,59,.9)"
PURPLE = "rgba(136,104,156,.9)"
SAND = "rgba(186,171,155,.9)"

# Marker colour for each requirement domain.
DOMAIN_COLORS = {
    'Energy': TEAL,
    'Entertainment': SAND,
    'Health': PURPLE,
    'Safety': CHERRY,
    'Other': ORANGE,
}


def simple_plot(traces, domains, sentences):
    """Show a 2-D scatter plot with one marker per requirement, coloured by domain.

    Args:
        traces: array indexed as ``traces[:, 0]`` (x) and ``traces[:, 1]`` (y).
        domains: iterable of domain names, one per row of ``traces``; every
            name must be a key of ``DOMAIN_COLORS``.
        sentences: per-marker text passed to plotly as the ``text`` of the trace.

    Raises:
        KeyError: if a domain is not present in ``DOMAIN_COLORS``.
    """
    colors = [DOMAIN_COLORS[dom] for dom in domains]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=traces[:,0], y=traces[:,1], mode='markers', marker_color=colors, text=sentences))
    fig.show()

In [106]:
# t-SNE traces of the pre-trained model, coloured by requirement domain.
simple_plot(pt_word_2_vec.traces, pt_word_2_vec.domains, pt_word_2_vec.sentences)

In [107]:
# plot pca for comparison
from sklearn.decomposition import PCA

# Flatten every entry along the first axis into one row. The row count comes
# from the array itself rather than from the notebook-level x, y, z, which
# the self-trained cell rebinds to a different array's shape.
flat_pt = pcaed_arr_pt.reshape(pcaed_arr_pt.shape[0], -1)
pca_traces_pt = PCA(n_components=2).fit_transform(flat_pt)
# Fixed: the original passed the undefined name `pca_traces`.
simple_plot(pca_traces_pt, pt_word_2_vec.domains, pt_word_2_vec.sentences)

In [30]:
# t-SNE traces of the self-trained model, coloured by requirement domain.
simple_plot(st_word_2_vec.traces, st_word_2_vec.domains, st_word_2_vec.sentences)

In [22]:
# plot pca for comparison
from sklearn.decomposition import PCA

# Flatten every entry along the first axis into one row. The row count comes
# from the array itself so the cell does not depend on which cell last
# assigned the notebook-level x, y, z.
flat_st = pcaed_arr_st.reshape(pcaed_arr_st.shape[0], -1)
pca_traces_st = PCA(n_components=2).fit_transform(flat_st)
simple_plot(pca_traces_st, st_word_2_vec.domains, st_word_2_vec.sentences)

## Analysis

In [28]:
from collections import defaultdict

# Analysis: which tokens can be embedded?
# Gather every token of every requirement, then reduce to the distinct ones.
re_tokens = [token for requirement in preprocessor.requirements for token in requirement.tokens]

unique = set(re_tokens)
print("Unique tokens: ", len(unique))

# How many distinct tokens the pre-trained model knows.
known_token_count = sum(1 for token in unique if pt_word_2_vec._token_in_training_data(token))
print("In training data: ", known_token_count)

# Histogram: number of distinct lexical words the model does not know
# -> number of requirements with exactly that many unknown words.
sentences_with_replacements = defaultdict(int)
for requirement in preprocessor.requirements:
    unknown_words = [
        word
        for word in set(requirement.lexical_words)
        if not pt_word_2_vec._token_in_training_data(word)
    ]
    sentences_with_replacements[len(unknown_words)] += 1

print("\nSentencens with replacements:")
print("tokens rmvd\tno of reqs")
for removed_count, requirement_count in sentences_with_replacements.items():
    print(removed_count, "\t\t", requirement_count)


Unique tokens:  4968
In training data:  4630

Sentencens with replacements:
tokens rmvd	no of reqs
0 		 2577
1 		 355
2 		 29
3 		 3
4 		 1
6 		 1


## Clustering

In [16]:
import pprint
from collections import defaultdict

def analyze_clusters(sentences, domains, clusters):
    """Pretty-print the size and domain breakdown of every cluster.

    The three arguments are consumed in lock-step (``zip``), so element ``i``
    of each must describe the same requirement.

    Args:
        sentences: requirement texts.
        domains: domain name of each requirement.
        clusters: cluster label of each requirement (must be hashable).

    Returns:
        None. The summary, a dict mapping each cluster label to
        ``("Assigned reqs: <n>", <domain -> count>)``, is written with
        ``pprint.pprint``.
    """
    members_by_cluster = defaultdict(list)
    for text, dom, label in zip(sentences, domains, clusters):
        members_by_cluster[label].append((text, dom))

    summary = {}
    for label, members in members_by_cluster.items():
        per_domain = defaultdict(int)
        for _, dom in members:
            per_domain[dom] += 1
        summary[label] = ("Assigned reqs: {}".format(len(members)), per_domain)

    pprint.pprint(summary)

In [17]:
from collections import defaultdict
from sklearn.cluster import KMeans

# NOTE(review): the original cell read `pcaed_arr`, and the cells below read
# `sentences`, `domains` and `traces`; no visible cell assigns any of them.
# They are bound to the pre-trained analyzer here because the recorded
# cluster sizes sum to 2966 requirements (the same total as the token
# analysis) and the "Presentation" cells work with pt_word_2_vec — confirm.
pcaed_arr = pcaed_arr_pt
sentences = pt_word_2_vec.sentences
domains = pt_word_2_vec.domains
traces = pt_word_2_vec.traces
# Re-derive the shape so x, y, z match the array that is clustered, whichever
# cell assigned them last.
x, y, z = pcaed_arr.shape

# `precompute_distances` and `n_jobs` were deprecated in scikit-learn 0.23 and
# removed in 1.0, where passing them raises TypeError, so they are dropped.
kmeans = KMeans(
    n_clusters=3,
    n_init=40,
    max_iter=2000,
    random_state=0,
).fit(pcaed_arr.reshape(x, y*z))

### KMeans

In [18]:
# Domain breakdown per KMeans cluster. `sentences` and `domains` must be
# aligned row-for-row with the matrix KMeans was fitted on.
analyze_clusters(sentences, domains, kmeans.labels_)

{0: ('Assigned reqs: 834',
     defaultdict(<class 'int'>,
                 {'Energy': 212,
                  'Entertainment': 93,
                  'Health': 183,
                  'Other': 105,
                  'Safety': 241})),
 1: ('Assigned reqs: 676',
     defaultdict(<class 'int'>,
                 {'Energy': 107,
                  'Entertainment': 108,
                  'Health': 132,
                  'Other': 86,
                  'Safety': 243})),
 2: ('Assigned reqs: 1456',
     defaultdict(<class 'int'>,
                 {'Energy': 307,
                  'Entertainment': 270,
                  'Health': 278,
                  'Other': 193,
                  'Safety': 408}))}


### Agglomerative Clustering

In [19]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering with 5 clusters on the same flattened matrix
# (last two axes of the 3-D array collapsed into one).
aggc = AgglomerativeClustering(n_clusters=5).fit(pcaed_arr.reshape(x, y*z))

In [20]:
# Domain breakdown per agglomerative cluster. `sentences` and `domains` must
# be aligned row-for-row with the matrix the clustering was fitted on.
analyze_clusters(sentences, domains, aggc.labels_)

{0: ('Assigned reqs: 1326',
     defaultdict(<class 'int'>,
                 {'Energy': 265,
                  'Entertainment': 263,
                  'Health': 237,
                  'Other': 174,
                  'Safety': 387})),
 1: ('Assigned reqs: 420',
     defaultdict(<class 'int'>,
                 {'Energy': 134,
                  'Entertainment': 31,
                  'Health': 140,
                  'Other': 58,
                  'Safety': 57})),
 2: ('Assigned reqs: 106',
     defaultdict(<class 'int'>,
                 {'Energy': 2,
                  'Entertainment': 7,
                  'Health': 43,
                  'Other': 15,
                  'Safety': 39})),
 3: ('Assigned reqs: 130',
     defaultdict(<class 'int'>,
                 {'Energy': 42,
                  'Entertainment': 7,
                  'Health': 41,
                  'Other': 19,
                  'Safety': 21})),
 4: ('Assigned reqs: 984',
     defaultdict(<class 'int'>,
                 {'Energ

### Single Sentences

In [45]:
# Look up two music-related requirements by substring and print each match
# with its index and its trace coordinates.
music_fragments = (
    "I want music to play whenever I am in the kitchen so that I can be entertained",
    "when I get home so that it will help me relax",
)
for idx, sentence in enumerate(sentences):
    for fragment in music_fragments:
        if fragment in sentence:
            print(idx, sentence, traces[idx])

1181 As a home occupant I want music to be played when I get home so that it will help me relax [-24.766968   4.210367]
1213 As a home owner I want music to play whenever I am in the kitchen so that I can be entertained while cooking or cleaning [53.929493   4.7521205]


In [46]:
# Look up two thermostat-related requirements by substring and print each
# match with its index and its trace coordinates.
thermostat_fragments = (
    "I want Room thermostat sensor so that The room is optimal",
    "I want Room thermostats so that Protect the",
)
for idx, sentence in enumerate(sentences):
    for fragment in thermostat_fragments:
        if fragment in sentence:
            print(idx, sentence, traces[idx])

341 As a home owner I want Room thermostat sensor so that The room is optimal temperature for an occupant  [-58.033245  11.973038]
804 As a home occupant I want Room thermostats so that Protect the room temperature [-57.889973  12.275528]


In [47]:
# Look up the two "shut off lights" requirements by substring and print each
# match with its index and its trace coordinates.
for i, s in enumerate(sentences):
    if "shut off lights when I leave the room" in s:
        print(i, s, traces[i])
    if "automatically shut off lights in rooms" in s:
        # Fixed: was traces[ib]; `ib` is not defined anywhere, and the recorded
        # output shows the trace of the matching sentence itself.
        print(i, s, traces[i])

273 As a home occupant I want the house to automatically shut off lights in rooms without occupants so that I will save on electricity bills [-22.960062     0.42760837]
782 As a home occupant I want my smart home to shut off lights when I leave the room so that I can save on my energy bill [23.517483 18.734446]


### Presentation

In [113]:
def color_word(w):
    """Pick the display colour for word ``w`` of the two example sentences.

    Reads the notebook-level sentences ``d1`` and ``d2`` (split on single
    spaces) and the palette names ``SAND``, ``CHERRY`` and ``TEAL``.

    Returns:
        ``SAND`` if ``w`` occurs in both sentences, ``CHERRY`` if it occurs in
        ``d1`` only, ``TEAL`` otherwise.
    """
    if w not in d1.split(" "):
        return TEAL
    return SAND if w in d2.split(" ") else CHERRY

# The two music-related example requirements; indices 1181 and 1213 are the
# ones printed by the "Single Sentences" look-up above.
d1 = sentences[1181]
d2 = sentences[1213]
# Keep only the words of each sentence that the pre-trained model contains.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'PreTrainedWord2VecAnalyzer' object has no attribute 'vocabulary'",
# so the membership test needs the analyzer's real attribute or method — confirm in mare.analysis.
df1 = list(filter(lambda w: w in pt_word_2_vec.vocabulary, d1.split(" ")))
df2 = list(filter(lambda w: w in pt_word_2_vec.vocabulary, d2.split(" ")))
# Word vector for every kept word.
# NOTE(review): `vectors` is not used anywhere else in the notebook — verify it exists on the analyzer.
d1_vec = [pt_word_2_vec.vectors[w] for w in df1]
d2_vec = [pt_word_2_vec.vectors[w] for w in df2]
# One colour per kept word, chosen by which sentence(s) contain the word.
d1_colors = [color_word(w) for w in df1]
d2_colors = [color_word(w) for w in df2]

AttributeError: 'PreTrainedWord2VecAnalyzer' object has no attribute 'vocabulary'

In [127]:
# Run `tsne` once over the word vectors of both sentences, then split the
# result back into per-sentence parts at the number of words kept from d1.
# NOTE(review): `tsne` is not defined or imported in any visible cell.
t12 = tsne(d1_vec+d2_vec)
t1 = list(t12[:len(df1)])
t2 = list(t12[len(df1):])

In [130]:
# One trace per example sentence.
# NOTE(review): `Plotter` and `np` are not imported in any visible cell.
pl = Plotter()
pl.add_trace(np.array(t1), df1, d1_colors, d1)
# NOTE(review): d2_colors[1:] passes one colour fewer than there are words in
# df2, whereas the first trace passes the full colour list — confirm this is intended.
pl.add_trace(np.array(t2), df2, d2_colors[1:], d2)

In [131]:
# Render the two-sentence comparison figure.
pl.show()

In [132]:
# Export the figure as SVG; the path is relative to the notebook's working directory.
pl.figure.write_image("../figures/example_sentences.svg")