In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs.
%load_ext autoreload
%autoreload 2
# Select matplotlib's interactive "notebook" backend for the figures below.
%matplotlib notebook

In [2]:
import matplotlib.pyplot as plt 
import pickle
import numpy as np
from pygments.token import Comment, String, Whitespace, Text

In [19]:
from commitgen.data import RawDataset, extract_commits, parse_commits
from commitgen.diff import AddRemExtractor, PerFileExtractor, is_nl
from commitgen.code import CodeChunkTokenizer, CodeLinesTokenizer
from commitgen.nlp import SennaTokenizer, TreebankTokenizer

In [4]:
# Language name handed to CodeChunkTokenizer and CodeLinesTokenizer.
language = "python"

In [5]:
# Marker token passed to both extractors and to parse_commits.
# Presumably it separates per-file sections in the extracted code -- confirm in commitgen.diff.
marker = "NEW_FILE"

In [6]:
# Two code tokenizers built for the configured language. Only
# code_lines_tokenizer is used by the visible cells (in parse_commits);
# code_chunk_tokenizer is kept as an alternative.
code_chunk_tokenizer = CodeChunkTokenizer(language=language)
code_lines_tokenizer = CodeLinesTokenizer(language=language)

In [7]:
# Two diff extractors sharing the same marker. Only per_file_code_extractor
# is used by the visible cells (in extract_commits).
add_rem_code_extractor = AddRemExtractor(marker=marker)
per_file_code_extractor = PerFileExtractor(marker=marker)

In [8]:
# Two natural-language tokenizers. Only treebank_tokenizer is used by the
# visible cells (in parse_commits).
senna_tokenizer = SennaTokenizer()
treebank_tokenizer = TreebankTokenizer()

In [9]:
# Raw commit data source consumed by extract_commits.
# NOTE(review): absolute, machine-specific path -- adjust it (or derive it from
# a configurable data directory) before running on another machine.
raw_dataset = RawDataset("/home/ubuntu/data/Theano_commits/")

In [10]:
def _touches_single_file(c):
    """Return True when the commit's diff lists exactly one file in total.

    The total is the combined number of modified, added and removed files
    reported by ``c.diff_file``.
    """
    diff = c.diff_file
    file_groups = (diff.modified_files, diff.added_files, diff.removed_files)
    return sum(len(group) for group in file_groups) == 1


# Extraction-time filters: keep atomic (single-file) commits only.
extract_filters = [_touches_single_file]

In [11]:
def is_atomic(parsed_commit, marker="NEW_FILE"):
    """Tell whether a parsed commit contains at most one file marker.

    Parameters
    ----------
    parsed_commit : object
        Must expose ``code_tokens`` with a ``count`` method (e.g. a list).
    marker : str, optional
        Token whose occurrences are counted. Defaults to ``"NEW_FILE"``, the
        value that used to be hard-coded here, so one-argument calls (such as
        ``filter(is_atomic, ...)``) behave exactly as before.

    Returns
    -------
    bool
        True when ``marker`` occurs zero or one time in ``code_tokens``.
    """
    return parsed_commit.code_tokens.count(marker) <= 1

In [12]:
def get_len_filter(max_code_len, max_nl_len):
    """Build a predicate that accepts parsed commits within both length caps.

    The returned function takes one parsed commit and returns True only when
    it has between 1 and ``max_code_len`` code tokens and between 1 and
    ``max_nl_len`` natural-language tokens (bounds inclusive).
    """
    def within_limits(pc):
        # Reject on the code side first; the NL side is only inspected
        # when the code length is acceptable.
        if not 1 <= len(pc.code_tokens) <= max_code_len:
            return False
        return 1 <= len(pc.nl_tokens) <= max_nl_len

    return within_limits

In [13]:
# Pygments token types passed to parse_commits as ignore_types.
# Presumably tokens of these types are dropped from the code token stream -- confirm in commitgen.data.
ignore_list = [Comment, String, Whitespace, Text]

In [14]:
# Extract commits from the raw dataset with the per-file extractor, applying
# the single-file filter defined in extract_filters.
commits = extract_commits(raw_dataset,
                          per_file_code_extractor,
                          filters=extract_filters)

In [None]:
# Parse the extracted commits: Treebank tokenizer for the messages, line-based
# code tokenizer for the diffs, skipping the token types in ignore_list.
parsed_commits = parse_commits(commits, treebank_tokenizer,
                               code_lines_tokenizer,
                               ignore_types=ignore_list,
                               marker=marker)
# Single-argument print call: same output on Python 2, and valid on Python 3.
print(len(parsed_commits))

In [None]:
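# To skip the parsing step above, load previously parsed commits from a pickle
# instead (open pickle files in binary mode, and only unpickle files you trust):
# with open("/home/ubuntu/data/preprocessing/Theano.pickle", "rb") as f:
#     parsed_commits = pickle.load(f)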

In [None]:
# Percentage of parsed commits that pass is_atomic.
# Counting with a generator avoids len(filter(...)), which only works where
# filter returns a list (Python 2).
total_commits = len(parsed_commits)
atomic_count = sum(1 for pc in parsed_commits if is_atomic(pc))
if total_commits:
    print("% atomic = " + str(100.0 * atomic_count / total_commits))
else:
    # Nothing was parsed: report it instead of dividing by zero.
    print("% atomic = n/a (no parsed commits)")

In [None]:
# Longest code and natural-language token sequences in the parsed corpus.
true_max_code_len = np.max([len(pc.code_tokens) for pc in parsed_commits])
true_max_nl_len = np.max([len(pc.nl_tokens) for pc in parsed_commits])
# Single-string print calls: same text as before on Python 2, and valid on Python 3.
print("Max code len " + str(true_max_code_len))
print("Max nl len " + str(true_max_nl_len))

In [None]:
def get_code_mean(parsed_commits):
    """Return the mean number of code tokens per commit in ``parsed_commits``.

    The result is whatever ``np.mean`` yields for the list of
    ``len(pc.code_tokens)`` values.
    """
    code_lengths = []
    for commit in parsed_commits:
        code_lengths.append(len(commit.code_tokens))
    return np.mean(code_lengths)

def get_nl_mean(parsed_commits):
    """Return the mean number of natural-language tokens per commit.

    The result is whatever ``np.mean`` yields for the list of
    ``len(pc.nl_tokens)`` values.
    """
    nl_lengths = []
    for commit in parsed_commits:
        nl_lengths.append(len(commit.nl_tokens))
    return np.mean(nl_lengths)

In [None]:
# Sweep the maximum code length (with the NL cap fixed at 100 tokens) and
# record, for each cap, how many commits survive the length filter and their
# average NL / code lengths -- overall, atomic only and non-atomic only.
lens = []

counts = []
counts_atomic = []
counts_non_atomic = []

avg_nl = []
avg_code = []

avg_nl_atomic = []
avg_code_atomic = []

avg_nl_non_atomic = []
avg_code_non_atomic = []

for code_len in range(100, 1500, 10):
    for nl_len in [100]:
        lens.append(code_len)
        len_filter = get_len_filter(code_len, nl_len)

        # Build real lists rather than relying on filter(): they support
        # len() and can be traversed several times on Python 2 and Python 3.
        filtered_parsed_commits = [pc for pc in parsed_commits if len_filter(pc)]

        # Split in one pass so is_atomic runs once per commit; the original
        # order is preserved inside each group.
        atomic_filtered = []
        non_atomic_filtered = []
        for pc in filtered_parsed_commits:
            if is_atomic(pc):
                atomic_filtered.append(pc)
            else:
                non_atomic_filtered.append(pc)

        counts.append(len(filtered_parsed_commits))
        counts_atomic.append(len(atomic_filtered))
        counts_non_atomic.append(len(non_atomic_filtered))

        avg_nl.append(get_nl_mean(filtered_parsed_commits))
        avg_code.append(get_code_mean(filtered_parsed_commits))

        avg_nl_atomic.append(get_nl_mean(atomic_filtered))
        avg_code_atomic.append(get_code_mean(atomic_filtered))

        avg_nl_non_atomic.append(get_nl_mean(non_atomic_filtered))
        avg_code_non_atomic.append(get_code_mean(non_atomic_filtered))

In [None]:
# Average code length of the retained commits versus the code-length cap.
# Open a new figure first: the interactive notebook backend keeps figures
# alive across cells, so without this a re-run (or another plotting cell)
# would draw onto whichever figure is currently active.
plt.figure()
plt.plot(lens, avg_code, label="All")
plt.plot(lens, avg_code_atomic, label="Atomic")
plt.plot(lens, avg_code_non_atomic, label="Non-atomic")
plt.xlabel("Max Code length")
plt.ylabel("Avg. Code Length")
# Trailing semicolon keeps the Legend repr out of the cell output.
plt.legend(loc="best");

In [None]:
# Average message (NL) length of the retained commits versus the code-length cap.
# Open a new figure first: the interactive notebook backend keeps figures
# alive across cells, so without this the curves would be added to whichever
# figure is currently active.
plt.figure()
plt.plot(lens, avg_nl, label="All")
plt.plot(lens, avg_nl_atomic, label="Atomic")
plt.plot(lens, avg_nl_non_atomic, label="Non-atomic")
# Labels and legend mirror the code-length plot so the figure stands alone.
plt.xlabel("Max Code length")
plt.ylabel("Avg. NL Length")
# Trailing semicolon keeps the Legend repr out of the cell output.
plt.legend(loc="best");

In [None]:
from collections import Counter

# Corpus-wide frequency tables: one over message tokens, one over code tokens.
words = Counter(token
                for commit in parsed_commits
                for token in commit.nl_tokens)
code_tokens = Counter(token
                      for commit in parsed_commits
                      for token in commit.code_tokens)

In [None]:
# Show only the 50 most frequent message tokens instead of dumping the whole
# vocabulary into the cell output.
words.most_common(50)

In [None]:
# Show only the 50 most frequent code tokens instead of dumping the whole
# vocabulary into the cell output.
code_tokens.most_common(50)