In [1]:
import pydriller
import os
import json
import dpu_utils
import numpy as np

from collections import namedtuple, defaultdict
from dpu_utils.utils import save_jsonl_gz, RichPath
from tqdm import tqdm
from joblib import Parallel, delayed, cpu_count

In [21]:
from featurizer.utils import is_fork

In [2]:
# Directory that receives one `<repo>.jsonl.gz` file of per-author modifications per repository.
author_modifications_dir = os.path.join('extracted_data', 'author_modifications')
# exist_ok=True keeps a re-run of this cell from failing when the directory already exists.
os.makedirs(author_modifications_dir, exist_ok=True)

In [3]:
# Every entry of the local `repos` directory is treated as a repository name.
repo_list = os.listdir('repos')

In [18]:
# Probe: print the author timestamp (as integer seconds) of the first commit
# yielded for one repository, then stop iterating.
repo = pydriller.RepositoryMining(os.path.join('repos', 'intellij-community'))
for i, commit in enumerate(repo.traverse_commits()):
    print(int(commit.author_date.timestamp()))
    break

1099674951


In [22]:
def process_repo(repo_name):
    """Record, per author, which files and added lines each commit of a repository touched.

    Walks every commit of ``repos/<repo_name>``. For each added or modified
    file it stores the commit hash, the author timestamp (integer seconds),
    the file's new path and the line numbers listed under ``'added'`` in the
    parsed diff. Records are grouped by ``(name, email)`` and saved to
    ``<author_modifications_dir>/<repo_name>.jsonl.gz``.

    Returns the set of ``(name, email)`` tuples of all commit authors seen,
    including authors without any recorded modification.
    """
    tracked_changes = (pydriller.ModificationType.ADD, pydriller.ModificationType.MODIFY)
    miner = pydriller.RepositoryMining(os.path.join('repos', repo_name))
    mods_by_author = defaultdict(list)
    authors = set()
    print(f"Processing {repo_name}")

    for commit_count, commit in enumerate(miner.traverse_commits(), start=1):
        author = (commit.author.name, commit.author.email)
        authors.add(author)

        for change in commit.modifications:
            # Deletions, renames etc. are not recorded.
            if change.change_type not in tracked_changes:
                continue
            record = {
                'hash': commit.hash,
                'time': int(commit.author_date.timestamp()),
                'path': change.new_path,
                'lines': [line_no for line_no, _ in change.diff_parsed['added']],
            }
            mods_by_author[author].append(record)

        # Progress report every 10000 commits.
        if commit_count % 10000 == 0:
            print(f"Processed {commit_count} commits in {repo_name}")

    # One JSON line per author.
    serialized = []
    for (name, email), mods in mods_by_author.items():
        serialized.append({'name': name, 'email': email, 'mods': mods})

    output_path = os.path.join(author_modifications_dir, f'{repo_name}.jsonl.gz')
    save_jsonl_gz(serialized, output_path)
    print(f"Finished processing {repo_name}")
    return authors

In [24]:
# Run process_repo over every repository not flagged as a fork; the job count is cpu_count().
# NOTE(review): `is_fork[repo_name]` assumes every repository has an entry — confirm,
# a missing key would abort the run.
with Parallel(cpu_count()) as pool:
    # The result holds one return value of process_repo (a set of (name, email) tuples) per repository.
    all_authors = pool(delayed(process_repo)(repo_name) for repo_name in tqdm(repo_list) if not is_fork[repo_name])

100%|██████████| 529/529 [19:44<00:00,  2.24s/it]


In [26]:
# Names of the files in the output directory with the trailing '.jsonl.gz' stripped.
files = set(f[:-len('.jsonl.gz')] for f in os.listdir(author_modifications_dir))

In [28]:
# Repositories from repo_list that have no output file in author_modifications_dir.
set(repo_list) - files

{'azure-profiling'}

In [30]:
# Rebuild the set of distinct (name, email) pairs from the files saved on disk.
all_authors = set()

for f in tqdm(os.listdir(author_modifications_dir)):
    # Each record read from a file carries a 'name' and an 'email' key.
    for data in dpu_utils.utils.RichPath.create(os.path.join(author_modifications_dir, f)).read_by_file_suffix():
        all_authors.add((data['name'], data['email']))

100%|██████████| 529/529 [01:05<00:00,  8.07it/s]


In [62]:
# Maps repository name -> 'fork' value; filled from the files under `repo_lists`.
# NOTE(review): this rebinds the `is_fork` name imported from featurizer.utils earlier in the notebook.
is_fork = {}

In [65]:
# Each file in `repo_lists` is read as JSON and iterated; every item supplies a
# repository 'name' and its 'fork' value.
for f in os.listdir('repo_lists'):
    for data in dpu_utils.utils.RichPath.create(os.path.join('repo_lists', f)).read_as_json():
        is_fork[data['name']] = data['fork']

In [56]:
# Root directory under which extract_raw_code writes its per-repository file dumps.
raw_code_dir = os.path.join('extracted_data', 'raw_code')
os.makedirs(raw_code_dir, exist_ok=True)

In [72]:
def check_extension(fname):
    """Return True when ``fname`` ends in one of the recognised source extensions.

    The extension is the text after the last ``.`` in ``fname`` and is
    compared case-insensitively. A name containing no ``.`` at all has no
    extension and is rejected, so an extension-less file called e.g. ``go``
    or ``sh`` is no longer mistaken for source code.
    """
    # rpartition yields an empty separator when there is no '.' in the name.
    _, dot, ext = fname.rpartition('.')
    return bool(dot) and ext.lower() in {
        "js", 'javascript', "py", "java", "go", "c", "cpp", "ruby", "rb",
        "ts", "tsx", "php", "cs", "sh", "zsh", "rs", "rust", "kotlin",
        "kt", "hs", "scala", "sc", "swift"
    }

In [61]:
def extract_raw_code(repo_name):
    """Dump the contents of added/modified source files for every commit of a repository.

    For each commit of ``repos/<repo_name>`` the directory
    ``<raw_code_dir>/<repo_name>/<hash[:2]>/<hash[2:]>`` is created. Every
    added or modified file whose source code is available and whose name
    passes ``check_extension`` is written there, named after its new path
    with forward and back slashes replaced by underscores.

    Files that cannot be written because of an ``OSError`` are skipped.
    Returns ``None``.
    """
    wanted_changes = (pydriller.ModificationType.ADD, pydriller.ModificationType.MODIFY)
    repo = pydriller.RepositoryMining(os.path.join('repos', repo_name))
    print(f"Processing {repo_name}")
    repo_code_dir = os.path.join(raw_code_dir, repo_name)

    for i, commit in enumerate(repo.traverse_commits()):
        # Two-level layout: first two hash characters, then the rest of the hash.
        pre_dir = os.path.join(repo_code_dir, commit.hash[:2])
        post_dir = os.path.join(pre_dir, commit.hash[2:])
        os.makedirs(post_dir, exist_ok=True)

        for m in commit.modifications:
            if m.change_type not in wanted_changes or m.source_code is None:
                continue
            # Flatten the repository path into a single file name.
            fname = os.path.join(post_dir, m.new_path.replace('/', '_').replace('\\', '_'))
            if not check_extension(fname):
                continue
            try:
                # The context manager closes the handle even when the write fails.
                with open(fname, 'w') as out_file:
                    out_file.write(m.source_code)
            except OSError:
                # Best effort: a file that cannot be created or written is skipped.
                # NOTE(review): presumably guards against over-long flattened names — confirm.
                pass

        # Progress report every 10000 commits.
        if (i + 1) % 10000 == 0:
            print(f"Processed {i + 1} commits in {repo_name}")

    print(f"Finished processing {repo_name}")

In [62]:
# Run the extraction on a single repository.
extract_raw_code('kotless')

Processing kotless
Finished processing kotless


In [63]:
# Dump raw source files for every repository not flagged as a fork; the job count is cpu_count().
# extract_raw_code returns None, so the results are deliberately not stored:
# binding them to `all_authors` replaced the author set built earlier in the
# notebook with a list of None values.
with Parallel(cpu_count()) as pool:
    pool(delayed(extract_raw_code)(repo_name) for repo_name in repo_list if not is_fork[repo_name])

In [4]:
from collections import defaultdict, Counter

In [5]:
# Build the name <-> email relation from all saved author records.
authors = set()  # distinct lower-cased (name, email) pairs
n2e = defaultdict(set)  # name -> emails seen with that name
e2n = defaultdict(set)  # email -> names seen with that email

for f in tqdm(os.listdir(author_modifications_dir)):
    iterator = RichPath.create(os.path.join(author_modifications_dir, f)).read_by_file_suffix()
    for data in iterator:
        # Lower-casing makes spellings that differ only in case collapse into one identity.
        name = data['name'].lower()
        email = data['email'].lower()
        n2e[name].add(email)
        e2n[email].add(name)
        authors.add((name, email))

100%|██████████| 529/529 [00:59<00:00,  8.95it/s]


In [6]:
# Number of distinct lower-cased (name, email) pairs.
len(authors)

23125

In [7]:
# Number of distinct lower-cased emails.
len(e2n)

20779

In [8]:
# degE[email] = number of distinct names seen with that email.
degE = Counter({email: len(names) for email, names in e2n.items()})


def useE(e):
    """Tell whether email ``e`` may link identities: at most 5 names are attached to it."""
    return 5 >= degE[e]


# degN[name] = number of emails attached to that name that pass useE.
degN = Counter({
    name: sum(1 for email in emails if useE(email))
    for name, emails in n2e.items()
})

In [9]:
# Inspect the names attached to one email with many names.
# Caution: e2n is a defaultdict, so indexing an email that is absent would create an empty entry for it.
e2n['necmon@yahoo.com']

{'charlie root',
 'jasonwilliams200ok',
 'p. jass',
 'peter',
 'peter jas',
 'root',
 'vagrant'}

In [10]:
def useN(n):
    """Tell whether name ``n`` may link identities.

    A name qualifies when fewer than 5 usable emails are attached to it
    (``degN``), or when it contains a space.
    """
    if degN[n] < 5:
        return True
    return ' ' in n

In [11]:
# The 20 emails attached to the largest number of distinct names.
degE.most_common(20)

[('none@none', 549),
 ('unknown', 234),
 ('no_reply@jetbrains.com', 31),
 ('', 12),
 ('gdb@fsf.org', 10),
 ('dependency-updater', 10),
 ('devnull@localhost', 9),
 ('necmon@yahoo.com', 7),
 ('mishinalina@gmail.com', 5),
 ('mikhael.bogdanov@jetbrains.com', 5),
 ('dblock@dblock.org', 5),
 ('ralph@scanmyfood.de', 5),
 ('nikolay.pianikov@jetbrains.com', 4),
 ('nikolayp@live.com', 4),
 ('filipp.riabchun@jetbrains.com', 4),
 ('ilya.lintsbakh@jetbrains.com', 4),
 ('semen.alperovich@jetbrains.com', 4),
 ('alexander.podkhalyuzin@jetbrains.com', 4),
 ('peter@jetbrains.com', 4),
 ('gregory.shrago@jetbrains.com', 4)]

In [12]:
# Among the 500 names with the highest degN, print those with 5 or more usable
# emails; 5 is the count from which useN rejects a name that contains no space.
for n, cnt in degN.most_common(500):
    if cnt >= 5:
        print(n)

unknown
alexander
daniel
david
michael
mike
alex
timothy wall
root
andrew
jetbrains
dmitry
sergey
andrey
simon marchi
rui fang
andrey breslav
ivan
nikita
sergey bogolepov
kate
james
phil
michail plushnikov
tom tromey
maciej w. rozycki
mark
ilya sergey
michael weiss
john ericson
lassulus
tim steinbach
tristan helmich
alexey tsvetkov
nikolay igotti
adam
lluis sanchez
alan mcgovern
chris
alexandre mutel
paul
max
pranavkm
arkadiy shapkin


In [60]:
# Colour tables: name -> colour, email -> colour, and (name, email) -> colour
# for pairs that receive an individual colour.
colorsN, colorsE = {}, {}
colorsPairs = {}


def _flood_fill(start, start_is_name, c):
    """Colour ``start`` and everything reachable from it with colour ``c``.

    Traversal alternates between names and emails: from a name it follows
    ``n2e`` to emails accepted by ``useE``; from an email it follows ``e2n``
    to names accepted by ``useN``. Nodes that already have a colour are left
    untouched. An explicit stack replaces mutual recursion, so a long chain
    of names and emails cannot exceed the interpreter's recursion limit.
    """
    if start_is_name:
        colorsN[start] = c
    else:
        colorsE[start] = c

    pending = [(start, start_is_name)]
    while pending:
        node, is_name = pending.pop()
        if is_name:
            for e in n2e[node]:
                if useE(e) and e not in colorsE:
                    # Colour on discovery so a node is never pushed twice.
                    colorsE[e] = c
                    pending.append((e, False))
        else:
            for n in e2n[node]:
                if useN(n) and n not in colorsN:
                    colorsN[n] = c
                    pending.append((n, True))


def dfsN(n, c):
    """Assign colour ``c`` to name ``n`` and to every uncoloured usable node reachable from it."""
    _flood_fill(n, True, c)


def dfsE(e, c):
    """Assign colour ``c`` to email ``e`` and to every uncoloured usable node reachable from it."""
    _flood_fill(e, False, c)

# Assign one integer colour per connected group of usable names and emails.
# Colours are handed out in increasing order, so the three passes below must run in this order.
color = 0
# Pass 1: start a new group from every usable name that has no colour yet.
for n in n2e:
    if useN(n) and n not in colorsN:
        dfsN(n, color)
        color += 1

# Pass 2: usable emails that were not reached from any usable name.
for e in e2n:
    if useE(e) and e not in colorsE:
        dfsE(e, color)
        color += 1
        
# Pass 3: pairs whose name and email are both uncoloured each get a colour of their own.
for name, email in authors:
    if name not in colorsN and email not in colorsE:
        colorsPairs[(name, email)] = color
        color += 1
    
def get_color(name, email):
    """Return the colour assigned to a ``(name, email)`` pair.

    The name's colour takes precedence, then the email's. A pair with neither
    falls back to its own entry in ``colorsPairs``; ``KeyError`` is raised if
    that entry does not exist. Keys are matched exactly, so callers pass the
    lower-cased forms used when the tables were built.
    """
    for table, key in ((colorsN, name), (colorsE, email)):
        if key in table:
            return table[key]
    return colorsPairs[(name, email)]

In [100]:
def extract_counters_from_mods(repo_code_dir, author_ind, mods):
    """Count the whitespace-separated tokens on the lines listed in ``mods``.

    Parameters
    ----------
    repo_code_dir : str
        Root of a per-repository dump laid out as
        ``<hash[:2]>/<hash[2:]>/<path with slashes replaced by underscores>``.
    author_ind
        Identifier handed back unchanged as the first element of the result.
    mods : iterable of dict
        Records with the keys ``'hash'``, ``'path'`` and ``'lines'``;
        ``'lines'`` holds 1-based line numbers.

    Returns
    -------
    tuple
        ``(author_ind, tokens)`` where ``tokens`` is a ``Counter``.

    Records whose path fails ``check_extension`` or whose dumped file does not
    exist are skipped, as are line numbers outside the file.
    """
    tokens = Counter()
    for item in mods:
        path = item['path']
        if not check_extension(path):
            continue

        hash_val = item['hash']
        pre_dir = os.path.join(repo_code_dir, hash_val[:2])
        post_dir = os.path.join(pre_dir, hash_val[2:])
        file_path = os.path.join(post_dir, path.replace('/', '_').replace('\\', '_'))
        if not os.path.exists(file_path):
            continue

        # Close the handle right away; this function runs once per author record.
        with open(file_path, 'r') as code_file:
            all_lines = code_file.readlines()

        for ind in item['lines']:
            # The lower bound stops 0 or negative numbers from wrapping around
            # to lines counted from the end of the file.
            if 1 <= ind <= len(all_lines):
                tokens.update(all_lines[ind - 1].split())

    return author_ind, tokens

In [103]:
# Count tokens for every author record of every repository not flagged as a fork.
# NOTE(review): files are read from extracted_data/tokenized_code, a directory no
# cell shown here creates — confirm it is produced elsewhere with the same layout.
with Parallel(cpu_count()) as pool:
    author_results = pool(
        delayed(extract_counters_from_mods)(
            # Per-repository directory: the file name without its '.jsonl.gz' suffix.
            os.path.join('extracted_data', 'tokenized_code', f[:-len('.jsonl.gz')]),
            # The identity colour is passed as author_ind and comes back as the first element of each result.
            get_color(d['name'].lower(), d['email'].lower()),
            d['mods']
        ) 
        for f in tqdm(os.listdir(author_modifications_dir))
        for d in RichPath.create(os.path.join(author_modifications_dir, f)).read_by_file_suffix()
        if not is_fork[f[:-len('.jsonl.gz')]]
    )


  0%|          | 0/529 [00:00<?, ?it/s][A
  0%|          | 2/529 [00:00<00:35, 14.84it/s][A
  1%|          | 5/529 [00:00<00:43, 11.94it/s][A
  1%|▏         | 7/529 [00:00<00:39, 13.36it/s][A
  2%|▏         | 9/529 [00:03<05:13,  1.66it/s][A
  2%|▏         | 10/529 [00:03<04:25,  1.96it/s][A
  2%|▏         | 13/529 [00:03<02:30,  3.44it/s][A
  4%|▎         | 19/529 [00:06<03:19,  2.56it/s][A
  4%|▍         | 21/529 [00:07<03:20,  2.54it/s][A
  4%|▍         | 22/529 [00:08<03:41,  2.29it/s][A
  5%|▌         | 29/529 [00:08<01:39,  5.03it/s][A
  6%|▋         | 34/529 [00:09<01:34,  5.26it/s][A
  7%|▋         | 38/529 [00:09<01:10,  6.95it/s][A
  8%|▊         | 44/529 [00:12<02:12,  3.66it/s][A
 11%|█         | 56/529 [00:12<01:05,  7.25it/s][A
 12%|█▏        | 61/529 [00:12<01:01,  7.59it/s][A
 12%|█▏        | 64/529 [00:14<01:24,  5.50it/s][A
 13%|█▎        | 68/529 [00:14<01:08,  6.70it/s][A
 13%|█▎        | 70/529 [00:19<03:56,  1.94it/s][A
 14%|█▍        | 75/529 

In [105]:
# Merge the per-record counters into one Counter per identity colour
# (the first element of each result is the colour passed in as author_ind).
all_results = defaultdict(Counter)
for author, tokens in tqdm(author_results):
    all_results[author] += tokens


  0%|          | 0/12937 [00:00<?, ?it/s][A
  9%|▉         | 1146/12937 [00:00<00:01, 11441.76it/s][A
 21%|██▏       | 2760/12937 [00:00<00:00, 14192.43it/s][A
 33%|███▎      | 4241/12937 [00:00<00:00, 14453.64it/s][A
 44%|████▍     | 5750/12937 [00:00<00:00, 14433.57it/s][A
 56%|█████▌    | 7194/12937 [00:00<00:00, 12802.26it/s][A
 67%|██████▋   | 8663/12937 [00:00<00:00, 13385.19it/s][A
 77%|███████▋  | 10025/12937 [00:00<00:00, 10093.53it/s][A
 86%|████████▋ | 11182/12937 [00:00<00:00, 10455.81it/s][A
100%|██████████| 12937/12937 [00:01<00:00, 11996.32it/s][A


In [123]:
# Spot check: the 50 most frequent tokens of one developer.
# The empty email is only a placeholder: get_color checks the name first and returns its colour when it has one.
all_results[get_color('tagir valeev', '')].most_common(50)

[('psi', 42919),
 ('get', 40311),
 ('type', 27332),
 ('expression', 25119),
 ('string', 21231),
 ('my', 18458),
 ('util', 18009),
 ('list', 16885),
 ('value', 15986),
 ('method', 14834),
 ('element', 13362),
 ('java', 13012),
 ('test', 12962),
 ('null', 12252),
 ('class', 11427),
 ('not', 11342),
 ('call', 11189),
 ('com', 11048),
 ('is', 9960),
 ('name', 9812),
 ('intellij', 9550),
 ('statement', 7956),
 ('map', 7809),
 ('to', 7321),
 ('dfa', 7198),
 ('text', 6982),
 ('variable', 6820),
 ('stream', 6603),
 ('array', 6075),
 ('set', 6001),
 ('result', 5770),
 ('parent', 5702),
 ('code', 5690),
 ('project', 5349),
 ('file', 5260),
 ('state', 5241),
 ('range', 5167),
 ('inspection', 5022),
 ('equals', 4781),
 ('reference', 4623),
 ('parameter', 4385),
 ('out', 4359),
 ('override', 4346),
 ('empty', 4253),
 ('factory', 4231),
 ('length', 4207),
 ('of', 4199),
 ('system', 4171),
 ('add', 3962)]

In [124]:
import pickle

In [129]:
# Load the token -> feature-index mapping used to bucket token counts.
# The context manager closes the file handle once the mapping is loaded.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('extracted_data/concatenated_data/mapping_32.pkl', 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)

In [17]:
# Developers whose token-usage features are extracted below.
devs = [
    "Ruslan Kuleshov",
    "Tagir Valeev",
    "Anna Kozlova",
    "Nikolay Chashnikov",
    "Nikita Katkov",
    "Sergey Simonchik",
    "Artemiy Sartakov",
    "Eugene Zhuravlev",
    "Gleb Drozdov",
    "Vitaliy Bibaev",
    "Vassiliy Kudryashov",
    "Andrey Vokin",
    "Alexander Kass",
    "Anton Lobov",
    "Serge Baranov",
    "Konstantin Aleev",
    "Egor Klepikov",
    "Valentin Fondaratov",
    "Petr Rastegaev",
    "Philipp Nurullin",
    "Yann Cebron",
    "Sergey Malenkov",
    "Daniil Ovchinnikov",
    "Konstantin Annikov",
    "Dmitry Batrak",
    "Alexander Lobas",
    "Dmitry Zhuravlev",
    "Arina Efremova",
    "Aleksey Pivovarov",
    "Vladimir Petrenko",
    "Dmitry Batkovich",
    "Daniil Tsaryov",
    "Aleksandr Izmaylov",
    "Valentin Kipiatkov",
    "Alexandr Suhinin",
    "Roman Shevchenko",
    "Andrey Starovoyt",
    "Vladimir Lagunov",
    "Nicolay Mitropolsky",
    "Arseniy Nisnevich",
    "Denis Konoplev",
    "Elena Shaverdova",
    "Vladislav Tankov",
    "Sergey Vasiliev",
    "Kirill Timofeev",
    "Aleksandr Krasilnikov",
    "Bas Leijdekkers",
    "Egor Ushakov",
    "Kirill Kirichenko",
    "Vladislav Rassokhin",
    "Ivan Buryak",
    "Maxim Kolmakov",
    "Olga Klisho",
    "Yuriy Artamonov",
    "Alexey Merkulov",
    "Alexander Bubenchikov",
    "Dmitriy Smirnov",
    "Marcel Bruch",
    "Konstantin Nisht",
    "Andrey Dernov",
    "Alexey Kudravtsev",
    "Liubov Melnikova",
    "Ivan Migalev",
    "Dmitriy Panov",
    "Alexandr Evstigneev",
    "Ivan Chirkov",
    "Maksim Pelevin",
    "Denis Fokin",
    "Vladislav Soroka",
    "Greg Shrago",
    "Michael Golubev",
    "Sergey Vorobyov",
    "Alexey Afanasiev",
    "Kirill Likhodedov",
    "Konstantin Bulenkov",
    "Alexander Koshevoy",
    "Artem Bochkarev",
    "Sergey Patrikeev",
    "Andrei Kuznetsov",
    "Dmitry Avdeev",
    "Rustam Vishnyakov",
    "Konstantin Kolosovsky",
    "Eugene Petrenko",
    "Vladimir Orlov",
    "Ivan Semenov",
    "Julia Beliaeva",
    "Alexander Doroshko",
    "Alexander Zolotov",
    "Alexey Ushakov",
    "Sergey Ignatov",
    "Viktor Shatrov",
    "Eldar Abusalimov",
    "Anton Tarasov",
    "Dmitry Krasilschikov",
    "Nikolay Rykunov",
    "Nadya Zabrodina",
    "Nikita Skvortsov",
    "Svetlana Zemlyanskaya",
    "Peter Gromov",
    "Roman Ivanov",
    "Pavel Bakhvalov",
    "Anton Makeev",
    "Ilyas Selimov",
    "Mikhail Sokolov",
    "Anna Gromova",
    "Yaroslav Bedrov",
    "Dmitry Jemerov",
    "Sergey Anchipolevsky",
    "Mikhail Mazurkevich",
    "Dennis Ushakov",
    "Vladimir Krivosheev",
]

In [127]:
# Resolve each developer to an identity colour by name, trying the lower-cased
# name first and then the same name with spaces replaced by dots.
missed = []  # developers for which neither spelling has a colour
colors = {}  # developer (as spelled in devs) -> colour
for dev in devs:
    dev_key = dev.lower()
    for alias in (dev_key, dev_key.replace(' ', '.')):
        if alias in colorsN:
            colors[dev] = colorsN[alias]
            break
    else:
        # No spelling matched.
        print(dev, 'missing')
        missed.append(dev)

Ruslan Kuleshov missing
Gleb Drozdov missing
Egor Klepikov missing
Petr Rastegaev missing
Arina Efremova missing
Valentin Kipiatkov missing
Marcel Bruch missing
Liubov Melnikova missing
Yaroslav Bedrov missing


In [135]:
# One zero-initialised vector of length 32 per developer, indexed by the values of `mapping`.
# NOTE(review): 32 presumably matches the "_32" in the mapping file name — confirm.
features = {dev: np.zeros(32) for dev in devs}

In [137]:
# Add each token count to the feature slot its token maps to; tokens absent from `mapping` are ignored.
# Caution: `+=` accumulates into `features`, so running this cell again without
# re-creating `features` counts everything twice.
for dev, c in tqdm(colors.items()):
    for token, count in all_results[c].items():
        if token in mapping:
            features[dev][mapping[token]] += count


  0%|          | 0/102 [00:00<?, ?it/s][A
 29%|██▉       | 30/102 [00:00<00:00, 282.12it/s][A
100%|██████████| 102/102 [00:00<00:00, 317.77it/s][A


In [138]:
# Display the accumulated (not yet normalised) feature vectors.
features

{'Ruslan Kuleshov': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'Tagir Valeev': array([3.44880e+04, 3.40000e+02, 3.56000e+02, 4.75770e+04, 1.93490e+04,
        1.00970e+04, 1.13800e+04, 7.18280e+04, 2.04470e+04, 4.37900e+03,
        4.43000e+02, 3.58200e+03, 7.11800e+03, 2.88300e+03, 2.06020e+04,
        4.95890e+04, 5.34890e+04, 2.39600e+03, 1.08490e+04, 8.43190e+04,
        2.05890e+05, 4.10000e+01, 3.30000e+03, 3.34930e+04, 6.62550e+04,
        7.90000e+01, 3.80100e+03, 4.89430e+04, 3.00000e+00, 6.75000e+03,
        5.17000e+02, 2.69335e+05]),
 'Anna Kozlova': array([1.156310e+05, 4.122700e+04, 7.631000e+03, 1.253430e+05,
        1.374270e+05, 2.903200e+04, 1.580720e+05, 2.211030e+05,
        8.946600e+04, 3.057400e+04, 3.251000e+03, 2.003600e+04,
        1.053290e+05, 2.229400e+04, 3.033800e+04, 2.806610e+05,
        1.888240e+05, 6.134000e+03, 3.771800e+04, 4.300770e+05,
        4

In [139]:
def norm(f):
    """Scale ``f`` so that its entries sum to 1.

    When the entries sum to 0 the input object itself is returned unchanged,
    which avoids a division by zero.
    """
    total = sum(f)
    return f if total == 0 else f / total

# Normalise every developer's vector; vectors that sum to 0 are kept as they are by norm.
normalized_features = {dev: norm(f) for dev, f in features.items()}

In [142]:
# Persist the normalised feature vectors. The context manager flushes and
# closes the file as soon as the dump is complete.
with open('extracted_data/concatenated_data/normalized_dev_features_32.pkl', 'wb') as features_file:
    pickle.dump(normalized_features, features_file)

In [128]:
# Display the developer -> colour mapping.
colors

{'Tagir Valeev': 2000,
 'Anna Kozlova': 1841,
 'Nikolay Chashnikov': 33,
 'Nikita Katkov': 13128,
 'Sergey Simonchik': 4682,
 'Artemiy Sartakov': 4664,
 'Eugene Zhuravlev': 2017,
 'Vitaliy Bibaev': 4718,
 'Vassiliy Kudryashov': 4403,
 'Andrey Vokin': 15028,
 'Alexander Kass': 2239,
 'Anton Lobov': 4643,
 'Serge Baranov': 16689,
 'Konstantin Aleev': 6048,
 'Valentin Fondaratov': 952,
 'Philipp Nurullin': 17057,
 'Yann Cebron': 2018,
 'Sergey Malenkov': 2016,
 'Daniil Ovchinnikov': 4505,
 'Konstantin Annikov': 10540,
 'Dmitry Batrak': 32,
 'Alexander Lobas': 4400,
 'Dmitry Zhuravlev': 1132,
 'Aleksey Pivovarov': 2015,
 'Vladimir Petrenko': 13389,
 'Dmitry Batkovich': 4502,
 'Daniil Tsaryov': 16972,
 'Aleksandr Izmaylov': 16934,
 'Alexandr Suhinin': 16895,
 'Roman Shevchenko': 2006,
 'Andrey Starovoyt': 4715,
 'Vladimir Lagunov': 2251,
 'Nicolay Mitropolsky': 4603,
 'Arseniy Nisnevich': 16951,
 'Denis Konoplev': 14212,
 'Elena Shaverdova': 954,
 'Vladislav Tankov': 1976,
 'Sergey Vasiliev

In [38]:
# Developers from devs for which no colour was found.
missed

['Ruslan Kuleshov',
 'Gleb Drozdov',
 'Egor Klepikov',
 'Petr Rastegaev',
 'Arina Efremova',
 'Valentin Kipiatkov',
 'Marcel Bruch',
 'Liubov Melnikova',
 'Yaroslav Bedrov']

In [116]:
# Search the author pairs for a substring of the email.
# Each element of `authors` is a (name, email) tuple, so `name[1]` is the email, not a character of the name.
for name in authors:
    if 'daniil' in name[1]:
        print(name)

('daniil gitelson', 'daniil.guit@gmail.com')
('daniil elovkov', 'daniil.elovkov@jetbrains.com')
('daniil akifev', 'daniil.akifev@jetbrains.com')
('daniil ovchinnikov', 'daniil.ovchinnikov@jetbrains.com')
('danilla', 'daniil.elovkov@jetbrains.com')
('daniil.elovkov', 'daniil.elovkov@gmail.com')
('daniil tsaryov', 'daniil.tsaryov@jetbrains.com')
