In [1]:
import pandas as pd

In [2]:
import json
with open('enriched-extended-issues-with-relationship.json','r') as f:
    data = json.loads(f.read())

In [3]:
import re 
import string
import nltk
from nltk.tokenize import TweetTokenizer

tweet = TweetTokenizer()
stopwords = nltk.corpus.stopwords.words('english')

def clean_text(x):
#     x = re.sub(r'[^A-Za-z0-9 ]+', '', x)
    text = nltk.tokenize.word_tokenize(x)
    text_clean = " ".join([i.lower() for i in text if i not in string.punctuation and i not in stopwords])
    return text_clean

In [4]:
from nltk.tokenize import word_tokenize

stop_words = ['to','the','for', 'this','a', 'and','of','is','we','be','pdd','com', 'puzzle','in','on','by',
             'has', 'was','me', 'http','html','about','yegor','www','from','java','github','new','have','https',
              'code','will','instead','any','don','text','task','more','when','you', 'problem','tickets',"n't",
              '``'
            ]

def get_row(issue):
    def _(f):
        return issue[f] if f in issue else None
    
    def _t(f1, f2):
        return (parse(_(f2)) - parse(_(f1))).total_seconds() if _(f1) and _(f2) else None 
    
    def __(f1, f2):
        return _(f1)[f2] if _(f1) and f2 in _(f1) else None
    
    def _l():
        return [label['name'] for label in _('labels')] if _('labels') else []
    
    def clean(x):
        return ' '.join([word for word in word_tokenize(x) if word not in stop_words])
    
    title = _('title')
    body = _('body')
    issue_id = _('id')
    child_id = __('parentIssue','id')
    text = f'{clean(title)} {clean(body)}'
    url = _('issueLink')
    closed_at = _('closed_at')
    number_of_additions = __('code','additions')
    if url:
        url_part = url.split('/')
        repo = url_part[3] + '/'+ url_part[4]
    else:
        repo = None
    if text and not text.isspace() and repo and closed_at and number_of_additions:  
        return [issue_id, child_id, clean_text(text), repo, closed_at,number_of_additions]
    else:
        return [None,None,None,None,None,None]

In [5]:
overall_data = []

overall_data.extend([get_row(issue) for issue in data])
df = pd.DataFrame(overall_data)

In [6]:
df.columns = ['iid', 'pid', 'text','repo', 'closed_at', 'additions']

In [7]:
df = df.dropna(subset=['text'])
df = df.drop(['pid'], axis=1)

In [8]:
sample = df.sample(n = 500)
df_train_index = sample.index
df_train = df.drop(df_train_index)
df_test = sample

In [9]:
from dateutil.parser import parse

In [10]:
def create_window(puzzles,n = 5):
    results = []
    for i in range(len(puzzles)):
        res = [puzzles[i][0], [puzzles[j][0] for j in range(i + 1, min((i + n + 1), len(puzzles)))]]
        if len(res[1]) >= 1:
            results.append(res)
    return results

In [11]:
# TODO need to add negative samples, they are based on the existing at that moment of time task that wasn't selected or was abandoned
results = []
for idx,row in df_train.groupby(['repo']).agg(lambda x: list(x)).iterrows():
    if len(row['iid'])>= 2:
        text = row['text']
        close_dates = [parse(date) for date in  row['closed_at']]
        res = sorted(list(zip(text, close_dates)), key = lambda x: x[1])
        res = create_window(res)
        results.extend(res)

In [12]:
final_results = []
for row in results:
    result_row = []
    parent = row[0]
    childrens = row[1]
    for i in range(len(childrens)):
        weight = (len(childrens) - i) / len(childrens)
        result_row.append([parent, childrens[i], weight])
    final_results.extend(result_row)

In [13]:
df3 = pd.DataFrame(final_results)

In [14]:
df3

Unnamed: 0,0,1,2
0,test format the 2-45ffc6ab 2 resolved //github...,solve lint errors 152 error currently the 1-b4...,1.000000
1,2out.h:13-13 place classes files the 5-03512da...,2out.h:15-15 move tstsimple tstsimple.cpp the ...,1.000000
2,2out.h:13-13 place classes files the 5-03512da...,2out.h:32-32 move tstsuite tstsuite.cpp the 8-...,0.800000
3,2out.h:13-13 place classes files the 5-03512da...,2out.h:56-56 move assertionequal asequal.cpp t...,0.600000
4,2out.h:13-13 place classes files the 5-03512da...,textreporttest.h:16-16 textreporttest incapsul...,0.400000
...,...,...,...
12111,config.yml:2-3 integrate ci trigger push ... t...,applogger.tsx:44 bsod shown critical errors th...,0.666667
12112,config.yml:2-3 integrate ci trigger push ... t...,index.tsx:33 move custom middlewares the 138-a...,0.333333
12113,applogger.tsx:16 provide logger component catc...,applogger.tsx:44 bsod shown critical errors th...,1.000000
12114,applogger.tsx:16 provide logger component catc...,index.tsx:33 move custom middlewares the 138-a...,0.500000


In [15]:
import numpy as np
txt1 = np.random.randint(0, high=12109, size=3000)
txt2 = np.random.randint(0, high=12109, size=3000)

In [16]:
fr2 = list(zip(df3.iloc[txt1][0].values, df3.iloc[txt1][1].values, np.zeros(3000)))

In [17]:
from nltk import FreqDist
words = nltk.tokenize.word_tokenize(' '.join(df['text'].values))
fdist = FreqDist(words)

In [18]:
vocab_size = len(list(fdist.keys()))
embedding_dim = 30 

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



class RecommenderNet(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(RecommenderNet, self).__init__()
        self.e = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim * 50 *  2, 512)
        self.hidden = nn.Sequential(
                                 nn.Linear(512,256),
                                 nn.ReLU(),
                                 nn.Dropout(0.2),
                                 nn.Linear(256, 128),
                                 nn.ReLU(),
                                 nn.Dropout(0.2)
        )
        self.linear2 = nn.Linear(128, 1)
        
    def forward(self, s1, s2):
        e1 = self.e(s1)
        e2 = self.e(s2)
        e3 = torch.cat([e1, e2], 1)
        ersh = e3.shape
        e3 = torch.reshape(e3, (ersh[0], ersh[1] * ersh[2]))
        out = F.relu(self.linear1(e3))
        out = self.hidden(out)
        out = F.sigmoid(self.linear2(out))
        return out
    
    def _init(self):
        def init(m):
            if type(m) == nn.Linear:
                torch.nn.init.xavier_uniform_(m.weight)
                m.bias.data.fill_(0.01)
        self.u.weight.data.uniform_(-0.05, 0.05)
        self.m.weight.data.uniform_(-0.05, 0.05)
        self.hidden.apply(init)
        init(self.fc)

In [20]:
CUDA_LAUNCH_BLOCKING=1
net = RecommenderNet(vocab_size, embedding_dim)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net = net.to(device)

In [21]:
criterion = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(net.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.3, patience=2)

In [22]:
vocab = set(fdist.keys())
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [23]:
final_results = final_results + fr2
train_index = int(len(final_results) * 0.7)
import random

random.shuffle(final_results)

# train = final_results[:train_index]
# test = final_results[train_index: ]
# s1, s2, r = zip(*train)
train = final_results
# test = final_results[train_index: ]
s1, s2, r = zip(*train)

In [24]:
len(final_results)

15116

In [25]:
batches = []

batch_size = 100
n_samples = len(train)

#Create batches
for i in range(0, n_samples, batch_size):
    limit = min(i + batch_size, n_samples)
    s1_batch = []
    for s in s1[i:limit]:
        sentense = nltk.tokenize.word_tokenize(s)
        s1_batch.append(torch.tensor([word_to_ix[sentense[w]] if w < len(sentense) else 0 for w in range(50)]))
    s2_batch = []
    for s in s2[i:limit]:
        sentense = nltk.tokenize.word_tokenize(s)
        s2_batch.append(torch.tensor([word_to_ix[sentense[w]] if w < len(sentense) else 0 for w in range(50)]))
    r_batch = torch.tensor(r[i:limit], dtype=torch.float)
    batches.append((torch.stack(s1_batch), torch.stack(s2_batch), r_batch))

In [26]:
epochs = 10

from tqdm import tqdm

for epoch in range(epochs):
  train_loss = 0
  for s1_batch, s2_batch, rates_batch in tqdm(batches):
    net.zero_grad()
    out = net(s1_batch.to(device), s2_batch.to(device)).squeeze()
    loss = criterion(rates_batch.to(device), out)

    loss.backward()
    optimizer.step()
    train_loss += loss
  scheduler.step(loss)
  print("Loss at epoch {} = {}".format(epoch, loss.item()))
print("Last Loss = {}".format(loss.item()))

100%|██████████| 152/152 [00:03<00:00, 49.09it/s]


Loss at epoch 0 = 0.11400288343429565


100%|██████████| 152/152 [00:03<00:00, 49.12it/s]


Loss at epoch 1 = 0.11113207042217255


100%|██████████| 152/152 [00:03<00:00, 49.21it/s]


Loss at epoch 2 = 0.11180835217237473


100%|██████████| 152/152 [00:03<00:00, 48.71it/s]


Loss at epoch 3 = 0.10453560948371887


100%|██████████| 152/152 [00:03<00:00, 49.46it/s]


Loss at epoch 4 = 0.0955808088183403


100%|██████████| 152/152 [00:03<00:00, 49.98it/s]


Loss at epoch 5 = 0.08924049139022827


100%|██████████| 152/152 [00:03<00:00, 50.05it/s]


Loss at epoch 6 = 0.08122327923774719


100%|██████████| 152/152 [00:03<00:00, 50.15it/s]


Loss at epoch 7 = 0.06955324858427048


100%|██████████| 152/152 [00:03<00:00, 49.97it/s]


Loss at epoch 8 = 0.044311027973890305


100%|██████████| 152/152 [00:03<00:00, 50.15it/s]


Loss at epoch 9 = 0.018404267728328705
Last Loss = 0.018404267728328705


In [27]:
results2 = []
for idx,row in df_test.groupby(['repo']).agg(lambda x: list(x)).iterrows():
    if len(row['iid'])>= 2:
        text = row['text']
        close_dates = [parse(date) for date in  row['closed_at']]
        res = sorted(list(zip(text, close_dates)), key = lambda x: x[1])
        res = create_window(res)
        results2.extend(res)
        
final_results2 = []
for row in results2:
    result_row = []
    parent = row[0]
    childrens = row[1]
    for i in range(len(childrens)):
        weight = (len(childrens) - i) / len(childrens)
        result_row.append([parent, childrens[i], weight])
    final_results2.extend(result_row)

In [28]:
test = final_results2

In [29]:
net.eval()

with torch.no_grad():
    s1t, s2t, rt = zip(*test)
    stest_ = []
    stest__ = []
    for s in s1t:
        sentense = nltk.tokenize.word_tokenize(s)
        stest_.append(torch.tensor([word_to_ix[sentense[w]] if w < len(sentense) else 0 for w in range(50)]))
    for s in s2t:
        sentense = nltk.tokenize.word_tokenize(s)
        stest__.append(torch.tensor([word_to_ix[sentense[w]] if w < len(sentense) else 0 for w in range(50)]))
    
    out = net(torch.stack(stest_).to(device), torch.stack(stest__).to(device)).squeeze()
    loss = criterion(torch.tensor(rt, dtype=torch.float).to(device), out)
    print("Loss at test {}".format( loss.item()))

Loss at test 0.09599825739860535


In [30]:
df3_train = df_train.groupby(['repo']).agg(lambda x: list(x))
df3_train['len'] = df3_train['iid'].map(lambda x: len(x))

In [31]:
df3_train = df3_train[df3_train['len'] >= 7]

In [32]:
import operator

def predict(puzzle, currently_existed_puzzles):
    with torch.no_grad():
        sentense = nltk.tokenize.word_tokenize(puzzle)
        getter = operator.itemgetter(*sentense)
        len_v = min(len(sentense), 50)
        vector = getter(word_to_ix)[:len_v]
        vector_puzzle = torch.tensor(np.concatenate([vector, np.zeros(50 - len_v)], axis = 0).astype(np.int))
        stest__ = []
        for p in currently_existed_puzzles:
            sentense = nltk.tokenize.word_tokenize(p)
            getter = operator.itemgetter(*sentense)
            len_v = min(50, len(sentense))
            vector = getter(word_to_ix)[:len_v]
            v=np.concatenate([vector, np.zeros(50 - len_v)], axis = 0).astype(np.int)
            stest__.append(torch.tensor(v))
        
        out = net(vector_puzzle.repeat(len(stest__), 1).to(device), torch.stack(stest__).to(device)).squeeze()
        return out 

In [33]:
def create_window_test(puzzles,n = 10):
    puzzles = np.array(puzzles)
    x = puzzles[np.lib.stride_tricks.sliding_window_view(np.arange(len(puzzles)), n)]
    return x

In [34]:
def ap_i(elems, top_k=6):
    ap = 0
    positives = 0
    for i in range(len(elems)):
        if elems[i] < top_k:
            positives += 1
            ap = positives / (i + 1)
    return ap
    
aps = []
for idx, row in df3_train.iterrows():
    try:
        window = create_window_test(row['text'], 20)
        for row in window:
            current_puzzle = row[0]
            relevant = row[1:6]
            nonrelevant = row[5:]
            pred_r = predict(current_puzzle, relevant).cpu().detach().numpy()
            pred_n = predict(current_puzzle, nonrelevant).cpu().detach().numpy()
            try:
                rq = np.concatenate((pred_r, pred_n), axis = 0)
                res = np.argsort(-rq)
                ap5 = ap_i(res[:5])
                aps.append(ap5)
            except:
                pass
    except:
        pass
    
        
print(f'Mean average precision@5 for all repos with more than 7 commits on test data: {np.mean(aps)}')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  vector_puzzle = torch.tensor(np.concatenate([vector, np.zeros(50 - len_v)], axis = 0).astype(np.int))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  v=np.concatenate([vector, np.zeros(50 - len_v)], axis = 0).astype(np.int)


Mean average precision@5 for all repos with more than 7 commits on test data: 0.476679670567837


In [35]:
import random

n_line = "\n\n"
num_ind, str_ind = zip(*enumerate(list(df3.index)))
ind = list(set(random.choices(num_ind,k=10)))
df3 = df3.iloc[ind]

aps = []
i = 0
for idx, row in df3_train.iterrows():
    current_puzzle = row['text'][0]
    relevant = row['text'][1:6]
    nonrelevant = row['text'][5:]
    pred_r = predict(current_puzzle, relevant).cpu().detach().numpy()
    pred_n = predict(current_puzzle, nonrelevant).cpu().detach().numpy()
    strings = np.concatenate((relevant, nonrelevant), axis = 0)
    r = np.concatenate((pred_r, pred_n), axis=0)
    res = np.argsort(-r)
    results = n_line.join([f"{i}. {s}" for i,s in enumerate(strings[res][:5])])
    print(f"Currently solved puzzle is \n{current_puzzle},\n\n next five predicted puzzles:\n\n {results}")
    i += 1 
    if i == 3:
        break
    else:
        print('------------------------------------')


Currently solved puzzle is 
2out.h:13-13 place classes files the 5-03512da4 include/2out.h //github.com/dronmdf/2out/blob/master/include/2out.h lines 13-13 resolved place classes files `` the created andrey valyaev 22-may-17 estimate 15 minutes role imp if technical questions ask submit the done `` fixed _removed_ source here pdd //www.yegor256.com/2009/03/04/pdd.html //www.yegor256.com/2017/04/05/pdd-in-action.html,

 next five predicted puzzles:

 0. tstsimple.cpp:21-22 need separate errro/failure result ... the 103-ee5b9b79 src/tstsimple.cpp //github.com/dronmdf/2out/blob/master/src/tstsimple.cpp lines 21-22 resolved need separate errro/failure result error assertion fail failure illegal exit `` the created andrey valyaev 15-jun-17 estimate 15 minutes role imp if technical questions ask submit the done `` fixed _removed_ source here pdd //www.yegor256.com/2009/03/04/pdd.html //www.yegor256.com/2017/04/05/pdd-in-action.html

1. textreporttest.cpp:61-62 this test little fake failure .

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  vector_puzzle = torch.tensor(np.concatenate([vector, np.zeros(50 - len_v)], axis = 0).astype(np.int))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  v=np.concatenate([vector, np.zeros(50 - len_v)], axis = 0).astype(np.int)


In [36]:
df3_test = df_test.groupby(['repo']).agg(lambda x: list(x))
df3_test['len'] = df3_test['iid'].map(lambda x: len(x))

In [37]:
df3_test = df3_test[df3_test['len'] >= 7]

(26, 5)

In [38]:
aps = []
for idx, row in df3_test.iterrows():
    try:
        window = create_window_test(row['text'],15)
        for row in window:
            current_puzzle = row[0]
            relevant = row[1:6]
            nonrelevant = row[5:]
            pred_r = predict(current_puzzle, relevant).cpu().detach().numpy()
            pred_n = predict(current_puzzle, nonrelevant).cpu().detach().numpy()
            try:
                rq = np.concatenate((pred_r, pred_n), axis = 0)
                res = np.argsort(-rq)
                ap5 = ap_i(res[:5])
                aps.append(ap5)
            except:
                pass
    except:
        pass
        
print(f'Mean average precision@5 for all repos with more than 7 commits on test data: {np.mean(aps)}')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  vector_puzzle = torch.tensor(np.concatenate([vector, np.zeros(50 - len_v)], axis = 0).astype(np.int))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  v=np.concatenate([vector, np.zeros(50 - len_v)], axis = 0).astype(np.int)


Mean average precision@5 for all repos with more than 7 commits on test data: 0.5956709956709956


In [39]:
import json
with open('enriched-extended-issues-with-relationship.json','r') as f:
    data = json.loads(f.read())
from nltk.tokenize import word_tokenize

stop_words = ['to','the','for', 'this','a', 'and','of','is','we','be','pdd','com', 'puzzle','in','on','by',
             'has', 'was','me', 'http','html','about','yegor','www','from','java','github','new','have','https',
              'code','will','instead','any','don','text','task','more','when','you', 'problem','tickets'
            ]

def get_row(issue):
    def _(f):
        return issue[f] if f in issue else None
    
    def _t(f1, f2):
        return (parse(_(f2)) - parse(_(f1))).total_seconds() if _(f1) and _(f2) else None 
    
    def __(f1, f2):
        return _(f1)[f2] if _(f1) and f2 in _(f1) else None
    
    def _l():
        return [label['name'] for label in _('labels')] if _('labels') else []
    
    def clean(x):
        return ' '.join([word for word in word_tokenize(x) if word not in stop_words])
    
    title = _('title')
    body = _('body')
    issue_id = _('id')
    child_id = __('parentIssue','id')
    text = f'{clean(title)} {clean(body)}'
    if text and not text.isspace():  
        return [issue_id, child_id, text]
    else:
        return [None,None,None]
overall_data = []

overall_data.extend([get_row(issue) for issue in data])
df = pd.DataFrame(overall_data)
df.columns = ['iid', 'pid', 'text']
df2 = df.dropna(subset=['pid'])
all_puzzles = df2['iid'].values + df2['pid'].values
all_puzzles = list(df2['iid'].values)+ list(df2['pid'].values)
def get_depth(iid):
    i = 0
    while len(df2[df2['pid'] == iid].values) != 0:
        i += 1
        iid2 = df2[df2['pid'] == iid].values[0][0]
        if iid2 == iid:
            break
        else:
            iid = iid2
    return i
child_counts = {}
from tqdm import tqdm
for row in tqdm(all_puzzles):
    child_counts[row] = get_depth(row)
ds = []
for k,v in child_counts.items():
    ds.append([k,v])
df3 = pd.DataFrame(ds)
df3.columns = ['iid','count']
df4 = pd.merge(df3,df2, on='iid')[['iid','count','text']]
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df4['text'].values)]
model = Doc2Vec(documents, vector_size=10, window=2, min_count=1, workers=4)

100%|██████████| 3320/3320 [00:06<00:00, 539.98it/s] 


In [40]:
res4 = []
for idx,row in df4.iterrows():
    res4.append([row['iid'],row['count'], model.infer_vector(row['text'].split())])
    
df5 = pd.DataFrame(res4)
df5.columns = ['iid','count','text']
df5 = df5.drop(df5[df5['count'] > 9].index)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
X = df5['text']
y = df5['count']

X_train, X_test, y_train, y_test = train_test_split(np.stack(X.values), y.values, test_size=0.33, random_state=42, stratify = y.values)

model_rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=100)
model_rf.fit(X_train, y_train) 
# from sklearn.cluster import MeanShift
from sklearn_extra.cluster import KMedoids
import numpy as np
    
def top_n_indexes(arr, n):
    idx = np.argpartition(arr, arr.size-n, axis=None)[-n:]
    width = arr.shape[1]
    return [divmod(i, width) for i in idx]

def get_first_unique(values):
    res = {}
    for idx, value in enumerate(values):
        if value not in res:
            res[value] = idx
    return np.array(list(res.values()))

def get_best_task(tasks):
    X_text = tasks
    X_vector = []
    for task in X_text:
        X_vector.append(model.infer_vector(task.split()))
    X_vector = np.stack(X_vector)
    predictions = model_rf.predict(X_vector)
    best_tasks = np.argsort(predictions)[:1000]
    clustering = KMedoids(n_clusters=5, random_state=42).fit(X_vector[best_tasks]).labels_
    unique_indecies = get_first_unique(clustering)
    return X_text[best_tasks[clustering[unique_indecies]]], X_vector[best_tasks[clustering[unique_indecies]]], X_vector, predictions

In [41]:
def ap_i__(predicted, relevant):
    relevant = set(relevant)
    ap = 0
    positives = 0
    for i in range(len(predicted)):
        if predicted[i] in relevant:
            positives += 1
            ap = positives / (i + 1)
    return ap

In [42]:
aps = []
for idx, row in df3_train.iterrows():
    window = create_window_test(row['text'], min(20,len(row['text'])) )
    for row in window:
        current_puzzle = row[0]
        relevant = row[1:6]
        nonrelevant = row[5:]
        pred_r = get_best_task(row)[0]
        aps.append(ap_i__(pred_r, relevant))

print(f'Mean average precision@5 for all repos with more than 7 commits on train data: {np.mean(aps)}')

Mean average precision@5 for all repos with more than 7 commits on train data: 0.41008911521324


In [44]:
aps = []
for idx, row in df3_test.iterrows():
    window = create_window_test(row['text'], min(20,len(row['text'])) )
    for row in window:
        current_puzzle = row[0]
        relevant = row[1:6]
        nonrelevant = row[5:]
        pred_r = get_best_task(row)[0]
        aps.append(ap_i__(pred_r, relevant))

print(f'Mean average precision@5 for all repos with more than 7 commits on train data: {np.mean(aps)}')

Mean average precision@5 for all repos with more than 7 commits on train data: 0.39748427672955966
