# Develop a Prototype Featureset as Style Embedding

Goal: Model the style of committers by creating a self-built style embedding including features like length, polarity, and others that were explored earlier.

#### Load Data

In [1]:
import pandas as pd

data = pd.read_pickle('../data/03_Subset_Frequent_Committers.pkl')
data.head(3)

Unnamed: 0,message,author_email,project
0,Fixed an error happening when the memory stats...,michele.simionato@gmail.com,gem_oq-engine
1,Updated setup.py [skip CI],michele.simionato@gmail.com,micheles_decorator
2,Fixed an exposure test [skip hazardlib],michele.simionato@gmail.com,gem_oq-engine


### Construct First Feature Set

A first feature set is set up with the length and the amount of point characters of a message to learn how to do the implementation.

In [16]:
# the following installations are required
# python3 -m textblob.download_corpora
# python3 -m spacy download en_core_web_sm

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x2968a7c40>

In [4]:
import numpy as np

simple_feature_set = []
subset_size = 100000

for message in data['message'][:subset_size]:
    number_of_chars         = len(message)
    point_count             = message.count(".")
    simple_feature_set.append([number_of_chars, point_count])

simple_feature_set = np.array(simple_feature_set)

In [18]:
simple_feature_set

array([[148.,  41.,   7., ..., 100.,  90.,  92.],
       [ 26.,   6.,   3., ..., 100.,  96., 101.],
       [ 39.,   8.,   1., ..., 100.,  90.,  92.],
       ...,
       [ 66.,  14.,   3., ...,  92.,  97.,  92.],
       [ 40.,   7.,   1., ...,  87.,  94., 100.],
       [ 51.,   9.,   1., ..., 100., 100.,  84.]])

A second featureset containing more values is constructed by the corresponding function in features.py.

In [2]:
from features import build_featureset

subset_size = 10

complex_featureset = build_featureset(data[:subset_size])

In [3]:
print(complex_featureset)

[[148, 41, 7, 103, 0, 0, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, 4, 4, 0, 0, -0.05, 0.30000000000000004, 100, 90, 92], [26, 6, 3, 17, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 100, 96, 101], [39, 8, 1, 31, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.1, 0.2, 100, 90, 92], [106, 39, 8, 66, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 0, 0, 0.0, 0.0, 100, 90, 92], [79, 35, 8, 45, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 0, 0, 0.0, 0.0, 100, 90, 92], [17, 5, 3, 10, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 96, 97, 101], [24, 6, 3, 16, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.5, 0.5, 86, 100, 97], [31, 8, 1, 20, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0.5, 0.5, 84, 92, 85], [32, 5, 1, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 100, 100, 92], [45, 10, 4, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, -0.225, 0.225, 84, 92, 85]]


### Normalize

In [19]:
feature_set_normed = simple_feature_set / np.linalg.norm(simple_feature_set)

In [20]:
feature_set_normed

array([[2.89836148e-03, 8.02924464e-04, 1.37084665e-04, ...,
        1.95835235e-03, 1.76251712e-03, 1.80168416e-03],
       [5.09171611e-04, 1.17501141e-04, 5.87505705e-05, ...,
        1.95835235e-03, 1.88001826e-03, 1.97793587e-03],
       [7.63757417e-04, 1.56668188e-04, 1.95835235e-05, ...,
        1.95835235e-03, 1.76251712e-03, 1.80168416e-03],
       ...,
       [1.29251255e-03, 2.74169329e-04, 5.87505705e-05, ...,
        1.80168416e-03, 1.89960178e-03, 1.80168416e-03],
       [7.83340940e-04, 1.37084665e-04, 1.95835235e-05, ...,
        1.70376655e-03, 1.84085121e-03, 1.95835235e-03],
       [9.98759699e-04, 1.76251712e-04, 1.95835235e-05, ...,
        1.95835235e-03, 1.95835235e-03, 1.64501597e-03]])

Is this way of normalizing correct?

### Calculate Distance Matrix

Use Subset for now because computationally intensive.

In [21]:
distance_matrix = np.array([[np.linalg.norm(feat_vector - compare_feat_vector) for compare_feat_vector in feature_set_normed[:1000]] for feat_vector in feature_set_normed[:1000]])

In [22]:
distance_matrix

array([[0.        , 0.00301358, 0.00264421, ..., 0.00129712, 0.00311891,
        0.00302938],
       [0.00301358, 0.        , 0.00043396, ..., 0.00393755, 0.00043549,
        0.00014263],
       [0.00264421, 0.00043396, 0.        , ..., 0.00360752, 0.00057805,
        0.0004164 ],
       ...,
       [0.00129712, 0.00393755, 0.00360752, ..., 0.        , 0.00403199,
        0.00396672],
       [0.00311891, 0.00043549, 0.00057805, ..., 0.00403199, 0.        ,
        0.00042261],
       [0.00302938, 0.00014263, 0.0004164 , ..., 0.00396672, 0.00042261,
        0.        ]])

How to evaluate a large distance matrix?

### Train K-Means

In [28]:
from sklearn.cluster import KMeans

kmeans = KMeans(20)
kmeans.fit(feature_set_normed)
kmeans_prediction = kmeans.predict(feature_set_normed)

In [24]:
kmeans_prediction

array([7, 3, 1, ..., 5, 1, 1], dtype=int32)

### Evaluate K-Means

In [29]:
from utils.k_means import k_means_summary

k_means_summary = k_means_summary(kmeans_prediction, kmeans.n_clusters, data[:subset_size])
k_means_summary

Unnamed: 0,Number of Messages,Number of different Authors,Average number of commits per different Author,Most common Author,Number of different Projects,Average number of commits per different Project,Most common project
0,8625.0,42.0,205.357143,"('thomas.parrott@canonical.com', 543)",391.0,22.058824,"('lxc_lxd', 780)"
1,1594.0,41.0,38.878049,"('mark@mark-story.com', 127)",167.0,9.54491,"('lxc_lxd', 160)"
2,4192.0,42.0,99.809524,"('thomas.parrott@canonical.com', 439)",286.0,14.657343,"('lxc_lxd', 459)"
3,418.0,38.0,11.0,"('ingo@silverstripe.com', 87)",77.0,5.428571,"('saltstack_salt', 71)"
4,9002.0,42.0,214.333333,"('thomas.parrott@canonical.com', 527)",416.0,21.639423,"('saltstack_salt', 667)"
5,2152.0,41.0,52.487805,"('mark@mark-story.com', 163)",208.0,10.346154,"('gem_oq-engine', 202)"
6,8402.0,42.0,200.047619,"('michele.simionato@gmail.com', 891)",400.0,21.005,"('gem_oq-engine', 918)"
7,763.0,38.0,20.078947,"('ingo@silverstripe.com', 145)",95.0,8.031579,"('saltstack_salt', 94)"
8,391.0,28.0,13.964286,"('igor.kroitor@gmail.com', 141)",66.0,5.924242,"('ccxt_ccxt', 141)"
9,5023.0,42.0,119.595238,"('crynobone@gmail.com', 1581)",282.0,17.812057,"('gem_oq-engine', 968)"


In [30]:
k_means_summary.describe()

Unnamed: 0,Number of Messages,Number of different Authors,Average number of commits per different Author,Number of different Projects,Average number of commits per different Project
count,20.0,20.0,20.0,20.0,20.0
mean,3416.25,35.25,88.524888,203.0,19.333783
std,3327.399757,11.511436,75.837296,151.234917,28.457206
min,101.0,4.0,4.269231,5.0,2.846154
25%,617.5,32.5,23.806322,62.5,7.504745
50%,1873.0,41.0,60.827236,187.5,13.330737
75%,6769.75,42.0,161.184524,339.75,20.504346
max,9002.0,42.0,214.333333,428.0,136.8


In [31]:
from utils.k_means import print_k_means_classes

print_k_means_classes(kmeans_prediction, kmeans.n_clusters, data[:subset_size])


________________ Class 0 ________________

___
1) 
Relaxed the case_master tests with a delta of 1E-5

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
2) 
Better logging


Former-commit-id: <I>e<I>eab<I>f<I>a6ec<I>

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
3) 
Fixed  qa_tests/hazard/event_based/spatial_correlation

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
4) 
Small fix to the RecordBuilder to manage string fields

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
5) 
Restored compute_hazard_curves to the poissonian form

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
6) 
Fixed a missing loss_type in export_loss_fraction_xml

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
___
7) 
[skip CI]


Former-commit-id: <I>b4c<I>a5cbd4b<I>dafffd<I>ca<I>c

- - - 
Committer: michele.simionato@gmail.com
Project:   gem_oq-engine
__