# Develop a Prototype Featureset as Style Embedding

Goal: Model the style of committers by creating a self-built style embedding including features like length, polarity, and others that were explored earlier.

#### Load Data

In [1]:
import pandas as pd

data = pd.read_pickle('../data/03_Subset_Frequent_Committers.pkl')
data.head(3)

Unnamed: 0,message,author_email,project
0,Fixed an error happening when the memory stats...,michele.simionato@gmail.com,gem_oq-engine
1,Updated setup.py [skip CI],michele.simionato@gmail.com,micheles_decorator
2,Fixed an exposure test [skip hazardlib],michele.simionato@gmail.com,gem_oq-engine


### Construct First Feature Set

A first feature set is set up with the length and the amount of point characters of a message to learn how to do the implementation.

In [2]:
import numpy as np

simple_feature_set = []
subset_size = 100000

for message in data['message'][:subset_size]:
    number_of_chars         = len(message)
    point_count             = message.count(".")
    simple_feature_set.append([number_of_chars, point_count])

simple_feature_set = np.array(simple_feature_set)

In [3]:
simple_feature_set

array([[148,   0],
       [ 26,   1],
       [ 39,   0],
       ...,
       [ 66,   0],
       [ 40,   0],
       [ 51,   0]])

A second featureset containing more values is constructed by the corresponding function in features.py.

In [6]:
from features import build_featureset

subset_size = 10

complex_featureset = build_featureset(data[:subset_size], normalize=False)

In [7]:
print(complex_featureset)

[[148, 41, 7, 103, 0, 0, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, 4, 4, 0, 0, -0.05, 0.30000000000000004, 100, 90, 92], [26, 6, 3, 17, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 100, 96, 101], [39, 8, 1, 31, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.1, 0.2, 100, 90, 92], [106, 39, 8, 66, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 0, 0, 0.0, 0.0, 100, 90, 92], [79, 35, 8, 45, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 0, 0, 0.0, 0.0, 100, 90, 92], [17, 5, 3, 10, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 96, 97, 101], [24, 6, 3, 16, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.5, 0.5, 86, 100, 97], [31, 8, 1, 20, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0.5, 0.5, 84, 92, 85], [32, 5, 1, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 100, 100, 92], [45, 10, 4, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, -0.225, 0.225, 84, 92, 85]]


### Normalize

In [13]:
feature_set_normed = simple_feature_set / np.linalg.norm(simple_feature_set)
#feature_set_normed = complex_featureset / np.linalg.norm(complex_featureset)

In [14]:
feature_set_normed

array([[6.73477363e-03, 0.00000000e+00],
       [1.18313591e-03, 4.55052272e-05],
       [1.77470386e-03, 0.00000000e+00],
       ...,
       [3.00334500e-03, 0.00000000e+00],
       [1.82020909e-03, 0.00000000e+00],
       [2.32076659e-03, 0.00000000e+00]])

### Calculate Distance Matrix

Use Subset for now because computationally intensive.

In [15]:
distance_matrix = np.array([[np.linalg.norm(feat_vector - compare_feat_vector) for compare_feat_vector in feature_set_normed[:1000]] for feat_vector in feature_set_normed[:1000]])

In [16]:
distance_matrix

array([[0.00000000e+00, 5.55182422e-03, 4.96006977e-03, ...,
        1.77470386e-03, 5.73365863e-03, 5.64264818e-03],
       [5.55182422e-03, 0.00000000e+00, 5.93315574e-04, ...,
        7.32648291e-03, 1.87622858e-04, 1.01752781e-04],
       [4.96006977e-03, 5.93315574e-04, 0.00000000e+00, ...,
        6.73477363e-03, 7.73588863e-04, 6.82578409e-04],
       ...,
       [1.77470386e-03, 7.32648291e-03, 6.73477363e-03, ...,
        0.00000000e+00, 7.50836250e-03, 7.41735204e-03],
       [5.73365863e-03, 1.87622858e-04, 7.73588863e-04, ...,
        7.50836250e-03, 0.00000000e+00, 9.10104545e-05],
       [5.64264818e-03, 1.01752781e-04, 6.82578409e-04, ...,
        7.41735204e-03, 9.10104545e-05, 0.00000000e+00]])

How to evaluate a large distance matrix?

### Train K-Means

In [17]:
from sklearn.cluster import KMeans

kmeans = KMeans(20)
kmeans.fit(feature_set_normed)
kmeans_prediction = kmeans.predict(feature_set_normed)

In [18]:
kmeans_prediction

array([13, 12,  0, ...,  4,  0, 16], dtype=int32)

### Evaluate K-Means

In [20]:
from utils.k_means import k_means_summary

subset_size = 100000

k_means_summary = k_means_summary(kmeans_prediction, kmeans.n_clusters, data[:subset_size])
k_means_summary

Unnamed: 0,Number of Messages,Number of different Authors,Average number of commits per different Author,Most common Author,Number of different Projects,Average number of commits per different Project,Most common project
0,9247.0,42.0,220.166667,"('igor.kroitor@gmail.com', 655)",436.0,21.208716,"('saltstack_salt', 771)"
1,990.0,41.0,24.146341,"('michele.simionato@gmail.com', 178)",122.0,8.114754,"('gem_oq-engine', 236)"
2,3827.0,42.0,91.119048,"('michele.simionato@gmail.com', 391)",270.0,14.174074,"('gem_oq-engine', 492)"
3,56.0,18.0,3.111111,"('palehose@gmail.com', 20)",24.0,2.333333,"('saltstack_salt', 21)"
4,7134.0,42.0,169.857143,"('thomas.parrott@canonical.com', 452)",369.0,19.333333,"('lxc_lxd', 494)"
5,589.0,38.0,15.5,"('ingo@silverstripe.com', 81)",80.0,7.3625,"('gem_oq-engine', 140)"
6,1566.0,41.0,38.195122,"('mark@mark-story.com', 127)",169.0,9.266272,"('gem_oq-engine', 153)"
7,279.0,36.0,7.75,"('ingo@silverstripe.com', 65)",61.0,4.57377,"('saltstack_salt', 44)"
8,4625.0,42.0,110.119048,"('crynobone@gmail.com', 1762)",255.0,18.137255,"('gem_oq-engine', 888)"
9,8414.0,42.0,200.333333,"('thomas.parrott@canonical.com', 460)",407.0,20.673219,"('saltstack_salt', 746)"


In [21]:
k_means_summary.describe()

Unnamed: 0,Number of Messages,Number of different Authors,Average number of commits per different Author,Number of different Projects,Average number of commits per different Project
count,20.0,20.0,20.0,20.0,20.0
mean,3416.25,38.7,81.996422,207.9,12.072915
std,3482.798344,5.956862,82.380318,145.090861,6.762211
min,56.0,18.0,3.111111,24.0,2.333333
25%,547.75,38.75,14.342949,76.75,7.103965
50%,1698.5,41.0,41.426829,186.0,9.56681
75%,6024.75,42.0,143.446429,319.5,18.830858
max,9919.0,42.0,236.166667,452.0,21.94469


In [22]:
from utils.k_means import print_k_means_classes

print_k_means_classes(kmeans_prediction, kmeans.n_clusters, data[:subset_size])


________________ Class 0 ________________

___
1) 
Fixed an exposure test [skip hazardlib]

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
2) 
Forbidded aggregate_by except in ebrisk

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
3) 
Fixed report_writer.count_eff_ruptures

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
4) 
Trapped only the DataStoreExportError

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
5) 
Rounding lon, lat in the sourcewriter

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
6) 
Tripled the saving speed [skip hazardlib]

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
7) 
Changed the tags from bytes to strings

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
8) 
Fixed a terribly hard to debug ordering bug

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-eng