# Develop a Prototype Featureset as Style Embedding

Goal: Model the style of committers by creating a self-built style embedding including features like length, polarity, and others that were explored earlier.

#### Load Data

In [1]:
import pandas as pd

subset_size = 100000

data = pd.read_pickle('../data/03_Subset_Frequent_Committers.pkl')
data.head(3)

Unnamed: 0,message,author_email,project
0,Fixed an error happening when the memory stats...,michele.simionato@gmail.com,gem_oq-engine
1,Updated setup.py [skip CI],michele.simionato@gmail.com,micheles_decorator
2,Fixed an exposure test [skip hazardlib],michele.simionato@gmail.com,gem_oq-engine


### Construct First Feature Set

A first feature set is set up with the length and the amount of point characters of a message to learn how to do the implementation.

In [2]:
import numpy as np

simple_feature_set = []

for message in data['message'][:subset_size]:
    number_of_chars         = len(message)
    point_count             = message.count(".")
    simple_feature_set.append([number_of_chars, point_count])

simple_feature_set = np.array(simple_feature_set)

In [3]:
simple_feature_set

array([[148,   0],
       [ 26,   1],
       [ 39,   0],
       ...,
       [ 66,   0],
       [ 40,   0],
       [ 51,   0]])

### Construct Complex Feature Set

A second featureset containing more values is constructed by the corresponding function in features.py.

In [4]:
import sys

sys.path.append('..')

from util.features import build_featureset

# Taking a subset has not any influence when training on the frequent committer dataset since it has less then 100000 sample

complex_featureset = build_featureset(data[:subset_size], normalize=False)

In [None]:
np.save('../data/03_Subset_Frequent_Committers_Features', complex_featureset)

In [None]:
import numpy as np

complex_featureset = np.load('../data/03_Subset_Frequent_Committers_Features.npy', allow_pickle=True)

### Normalize

In [None]:
#feature_set_normed = simple_feature_set / np.linalg.norm(simple_feature_set)
feature_set_normed = complex_featureset / np.linalg.norm(complex_featureset)

In [None]:
feature_set_normed

array([[7.59403485e-03, 2.10375290e-03, 3.59177324e-04, ...,
        5.13110463e-03, 4.61799416e-03, 4.72061626e-03],
       [1.33408720e-03, 3.07866278e-04, 1.53933139e-04, ...,
        5.13110463e-03, 4.92586044e-03, 5.18241567e-03],
       [2.00113080e-03, 4.10488370e-04, 5.13110463e-05, ...,
        5.13110463e-03, 4.61799416e-03, 4.72061626e-03],
       ...,
       [4.92586044e-03, 7.18354648e-04, 3.59177324e-04, ...,
        4.92586044e-03, 5.13110463e-03, 4.72061626e-03],
       [1.23146511e-03, 2.05244185e-04, 5.13110463e-05, ...,
        5.13110463e-03, 4.31012789e-03, 4.72061626e-03],
       [9.23598833e-04, 1.53933139e-04, 5.13110463e-05, ...,
        5.13110463e-03, 4.72061626e-03, 4.97717149e-03]])

### Calculate Distance Matrix

Use Subset for now because computationally intensive.

In [None]:
distance_matrix = np.array([[np.linalg.norm(feat_vector - compare_feat_vector) for compare_feat_vector in feature_set_normed] for feat_vector in feature_set_normed])

In [None]:
distance_matrix

array([[0.        , 0.00788893, 0.00692035, ..., 0.00342254, 0.00790026,
        0.008298  ],
       [0.00788893, 0.        , 0.00113591, ..., 0.00465466, 0.00080488,
        0.00054564],
       [0.00692035, 0.00113591, 0.        , ..., 0.00370049, 0.00102384,
        0.00140633],
       ...,
       [0.00342254, 0.00465466, 0.00370049, ..., 0.        , 0.00470304,
        0.00505619],
       [0.00790026, 0.00080488, 0.00102384, ..., 0.00470304, 0.        ,
        0.00063054],
       [0.008298  , 0.00054564, 0.00140633, ..., 0.00505619, 0.00063054,
        0.        ]])

### Train K-Means

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(100)
kmeans.fit(feature_set_normed)
kmeans_prediction = kmeans.predict(feature_set_normed)

In [None]:
kmeans_prediction

array([67, 91, 40, ..., 47, 70, 52], dtype=int32)

### Evaluate K-Means

In [None]:
from util.k_means import k_means_summary

k_means_summary = k_means_summary(kmeans_prediction, kmeans.n_clusters, data[:subset_size])
k_means_summary

IndexError: index 10000 is out of bounds for axis 0 with size 10000

In [None]:
k_means_summary.describe()

In [None]:
from util.k_means import print_k_means_classes

print_k_means_classes(kmeans_prediction, kmeans.n_clusters, data[:subset_size])