# Commit Message Exploration with spaCy

### Load Data

In [14]:
import pandas as pd

# Load the commit data; the "message" column is the text analysed in the cells below.
data = pd.read_csv("results.csv")

## Exploration with spaCy

In [15]:
import spacy

# Load the en_core_web_sm pipeline
# (install it with: python3 -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

In [16]:
from spacy import displacy

# Visualise the parse of the very first commit message as an example.
print("Structure of first commit message as an example")
example_message = data["message"][0]
displacy.render(nlp(example_message))

Structure of first commit message as an example


Take a subset of the commit messages, because the full data set cannot be processed locally.

In [17]:
# Number of commit messages (taken from the top of the data set) to analyse.
subset_size = 100_000

# Plain Python list of the first `subset_size` messages.
messages = data["message"].head(subset_size).tolist()

In [19]:
# Accumulators that receive exactly one entry per commit message.
first_tokens, first_token_tags = [], []
second_tokens, second_token_tags = [], []
sentiments, polarities, subjectivities = [], [], []

# Accumulators that are extended with entries from every message (flattened).
tokens, token_tags = [], []
cats, ents = [], []
imperative_count = []

def token_filter(token):
    """Tell whether a token should be kept for the token statistics.

    A token is kept when it is not a stop word and consists of alphabetic
    characters, judged by its ``is_stop`` and ``is_alpha`` attributes.
    """
    if token.is_stop:
        return False
    return token.is_alpha

from spacytextblob.spacytextblob import SpacyTextBlob

# Prerequisites (run once in a shell before executing this cell):
#   python3 -m textblob.download_corpora
#   python3 -m spacy download en_core_web_sm

# Load the model again and register the TextBlob component on this instance,
# which provides the doc._.blob polarity / subjectivity scores read below.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("spacytextblob")

# Run the selected commit messages through the pipeline.
docs = nlp.pipe(messages)

# Collect per-message and per-token features from every processed commit message.
for doc in docs:
    # Leading tokens: None marks a position the message does not have, so an
    # empty document no longer raises IndexError on doc[0].
    first_token = doc[0] if len(doc) > 0 else None
    second_token = doc[1] if len(doc) > 1 else None
    first_tokens.append(None if first_token is None else first_token.lemma_)
    first_token_tags.append(None if first_token is None else first_token.tag_)
    second_tokens.append(None if second_token is None else second_token.lemma_)
    second_token_tags.append(None if second_token is None else second_token.tag_)

    # Tokens rejected by token_filter (stop words, non-alphabetic tokens) are
    # kept as None placeholders so that `tokens` stays aligned with `token_tags`.
    tokens.extend(token.lemma_ if token_filter(token) else None for token in doc)
    token_tags.extend(token.tag_ for token in doc)

    # doc.cats is a label -> score mapping (spaCy Doc API) and never equals "",
    # so no guard is needed; extending with it adds the category labels.
    cats.extend(doc.cats)
    ents.extend(ent.lemma_ for ent in doc.ents)

    sentiments.append(doc.sentiment)
    polarities.append(doc._.blob.polarity)
    subjectivities.append(doc._.blob.subjectivity)

    # NOTE(review): despite its name this list stores the morphological
    # analysis of every token, not a count of imperatives.
    imperative_count.extend(token.morph for token in doc)

### First Tokens of a Commit Message

In [20]:
from collections import Counter

# Frequency table of the lemma that opens each commit message.
first_tokens_header = "20 most common first (lemmatized) tokens in first {subset_size} messages:"
print(first_tokens_header.format(subset_size=subset_size))

first_tokens_count = Counter(first_tokens)
first_tokens_count.most_common(20)

20 most common first (lemmatized) tokens in first 100000 messages:


[('fix', 12120),
 ('add', 11359),
 ('remove', 4198),
 ('[', 3935),
 ('update', 3218),
 ('use', 2106),
 ('do', 2047),
 ('make', 1876),
 ('Fix', 1757),
 ('change', 1635),
 ('allow', 1022),
 ('test', 980),
 ('bump', 657),
 ('move', 600),
 ('#', 539),
 ('check', 533),
 ('ensure', 530),
 ('set', 522),
 ('only', 492),
 ('bugfix', 472)]

### Tags of First Tokens

In [22]:
# Frequency table of the part-of-speech tag of each message's first token.
first_tags_header = "5 most common first (lemmatized) token tags in first {subset_size} messages:"
print(first_tags_header.format(subset_size=subset_size))

first_token_tags_count = Counter(first_token_tags)
first_token_tags_count.most_common(5)

5 most common first (lemmatized) token tags in first 100000 messages:


[('VB', 32234), ('NN', 14022), ('NNP', 11809), ('VBN', 9916), ('JJ', 5131)]

The meaning of a token tag can be looked up with `spacy.explain`:

In [25]:
# Look up the human-readable description of a tag (here: "JJ").
spacy.explain("JJ")

'adjective (English), other noun-modifier (Chinese)'

### Second Tokens of a Commit Message

In [26]:
# Frequency table of the lemma in second position of each commit message
# (None stands for messages that consist of a single token).
second_tokens_header = "20 most common second (lemmatized) tokens in first {subset_size} messages:"
print(second_tokens_header.format(subset_size=subset_size))

second_tokens_count = Counter(second_tokens)
second_tokens_count.most_common(20)

20 most common second (lemmatized) tokens in first 100000 messages:


[(':', 8251),
 ('the', 3196),
 ('a', 2523),
 ('not', 2143),
 ('>', 1861),
 ('test', 1775),
 ('to', 1768),
 ('):', 1575),
 ('/', 1444),
 ('#', 1442),
 ('for', 1228),
 ('<', 1200),
 ('-', 1176),
 ('fix', 1094),
 ('bug', 768),
 ('support', 618),
 ('issue', 608),
 ('error', 599),
 ('be', 598),
 ('miss', 579)]

### Tags of Second Tokens

In [28]:
# Frequency table of the part-of-speech tag of each message's second token.
# The label now says "second": it previously read "first" (copy-paste from the
# first-token cell) although this cell counts second-token tags.
print("5 most common second (lemmatized) token tags in first {subset_size} messages:".format(subset_size = subset_size))

second_token_tags_count = Counter(second_token_tags)
second_token_tags_count.most_common(5)

5 most common first (lemmatized) token tags in first 100000 messages:


[('NN', 28260), (':', 9917), ('JJ', 7793), ('NNP', 7456), ('DT', 7357)]

### Tokens of a Commit Message

In [29]:
print("20 most common (lemmatized) alphabetical tokens in first {subset_size} messages excluding stopwords:".format(subset_size = subset_size))

# `tokens` holds None placeholders for filtered-out tokens (stop words and
# non-alphabetic tokens). Drop them explicitly instead of relying on None being
# the single most frequent entry and slicing it off the result.
tokens_count = Counter(token for token in tokens if token is not None)
tokens_count.most_common(20)

20 most common (lemmatized) alphabetical tokens in first 100000 messages excluding stopwords:


[('fix', 20749),
 ('add', 17329),
 ('test', 11266),
 ('remove', 7423),
 ('use', 6156),
 ('update', 5640),
 ('change', 5470),
 ('method', 5208),
 ('error', 5186),
 ('url', 3951),
 ('file', 3932),
 ('check', 3768),
 ('version', 3606),
 ('issue', 3549),
 ('set', 3423),
 ('support', 3040),
 ('bug', 3003),
 ('return', 2833),
 ('Fix', 2655),
 ('code', 2644)]

### Token Tags

In [30]:
# Frequency table of the part-of-speech tags of all tokens in all messages.
token_tags_header = "Most common token tags in first {subset_size} messages:"
print(token_tags_header.format(subset_size=subset_size))

token_tags_count = Counter(token_tags)
token_tags_count.most_common(10)

Most common token tags in first 100000 messages:


[('NN', 294339),
 ('IN', 137675),
 ('VB', 94535),
 ('NNP', 90266),
 ('DT', 75444),
 ('JJ', 72476),
 ('NNS', 70804),
 ('XX', 62388),
 ('RB', 51032),
 ('_SP', 46894)]

### Categories

In [31]:
# Report the category labels collected from doc.cats across all messages.
found_categories = "Spacy finds the following categories: " + str(cats)
print(found_categories)
print("(Not expected to find any categories)")

Spacy finds the following categories: []
(Not expected to find any categories)


### Entities

In [32]:
# Frequency table of the lemmatized named entities found in all messages.
ents_header = "20 most common (lemmatized) entities in first {subset_size} messages:"
print(ents_header.format(subset_size=subset_size))

ents_count = Counter(ents)
ents_count.most_common(20)

20 most common (lemmatized) entities in first 100000 messages:


[('first', 867),
 ('fix', 716),
 ('one', 666),
 ('fix #', 580),
 ('1', 477),
 ('2', 473),
 ('#', 447),
 ('0', 433),
 ('3', 429),
 ('api', 395),
 ('two', 370),
 ('Python', 321),
 ('API', 294),
 ('doc', 293),
 ('CI', 251),
 ('PHP', 236),
 ('improve', 234),
 ('second', 223),
 ('json', 200),
 ('zero', 193)]

### Sentiments

In [33]:
import numpy as np

def print_statistics(values : list):
    """Print summary statistics of a collection of numbers.

    Prints the mean, standard deviation, minimum, 25 % quartile, median,
    75 % quartile and maximum (one per line), followed by a blank line.

    Args:
        values: Non-empty list of numeric values.

    Raises:
        ValueError: If ``values`` is empty.
    """
    if len(values) == 0:
        # Fail early with a clear message instead of numpy warnings (mean/std
        # of an empty array) followed by an opaque error from np.min.
        raise ValueError("print_statistics requires at least one value")

    # Labelled results; no local is named `min` or `max`, so the built-ins
    # are not shadowed inside this function.
    statistics = [
        ("Mean", np.mean(values)),
        ("Standard deviation", np.std(values)),
        ("Minimum", np.min(values)),
        ("25 % quartile", np.percentile(values, 25)),
        ("Median", np.median(values)),
        ("75 % quartile", np.percentile(values, 75)),
        ("Maximum", np.max(values)),
    ]

    for label, value in statistics:
        print(label + ": " + str(value))
    print()

In [34]:
# Summary statistics of doc.sentiment, collected once per commit message.
print_statistics(sentiments)

Mean: 0.0
Standard deviation: 0.0
Minimum: 0.0
25 % quartile: 0.0
Median: 0.0
75 % quartile: 0.0
Maximum: 0.0



The spaCy sentiment value (`doc.sentiment`) is zero for every commit message.

### Polarities

In [35]:
# Summary statistics of the TextBlob polarity (doc._.blob.polarity) per commit message.
print_statistics(polarities)

Mean: 0.012914568989495398
Standard deviation: 0.16576854685975387
Minimum: -1.0
25 % quartile: 0.0
Median: 0.0
75 % quartile: 0.0
Maximum: 1.0



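Only minor polarities are found in the commit messages: the 25 % and 75 % quartiles are both 0.0, so at least half of the messages are exactly neutral, and the mean is close to zero.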

### Subjectivities

In [36]:
# Summary statistics of the TextBlob subjectivity (doc._.blob.subjectivity) per commit message.
print_statistics(subjectivities)

Mean: 0.17876842287919062
Standard deviation: 0.27419620402845424
Minimum: 0.0
25 % quartile: 0.0
Median: 0.0
75 % quartile: 0.3333333333333333
Maximum: 1.0



Most commit messages are objective (subjectivity score of 0.0).

### Imperative Counts

In [37]:
# Show only a sample: the list holds one morphological analysis per token of
# every processed message, so displaying it in full floods the notebook output.
imperative_count[:60]

[Number=Sing,
 ,
 VerbForm=Inf,
 ,
 Number=Sing,
 ConjType=Cmp,
 VerbForm=Inf,
 Aspect=Perf|Tense=Past|VerbForm=Part,
 ,
 Number=Sing,
 ,
 Aspect=Prog|Tense=Pres|VerbForm=Part,
 Number=Sing,
 Number=Sing,
 ,
 ,
 Number=Sing,
 Aspect=Perf|Tense=Past|VerbForm=Part,
 Number=Sing,
 PunctType=Peri,
 Number=Sing,
 Number=Sing,
 VerbForm=Inf,
 ,
 PunctType=Dash,
 Aspect=Perf|Tense=Past|VerbForm=Part,
 Number=Plur,
 ,
 Case=Nom|Person=2|PronType=Prs,
 Mood=Ind|Tense=Pres|VerbForm=Fin,
 Polarity=Neg,
 VerbForm=Inf,
 Number=Sing,
 Number=Sing,
 ConjType=Cmp,
 Degree=Pos,
 Number=Plur,
 PunctType=Peri,
 Number=Sing,
 ,
 Number=Sing,
 Number=Plur,
 VerbForm=Fin,
 VerbForm=Inf,
 Definite=Def|PronType=Art,
 Number=Sing,
 ,
 ,
 Degree=Pos,
 Degree=Pos,
 Number=Plur,
 Number=Sing|Person=3|Tense=Pres|VerbForm=Fin,
 ,
 VerbForm=Inf,
 Definite=Ind|PronType=Art,
 Degree=Cmp,
 Number=Sing,
 PunctType=Peri,
 VerbForm=Inf,
 ,
 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin,
 Aspect=Perf|Tense=Past|Ver

spaCy does not seem to be able to detect imperatives: the only `Mood` value in the morphological features shown above is `Ind`, never `Imp`.