## log.json generator

Complete notebook

In [11]:
print("Importing Libraries...")
# Import libraries!!!
import subprocess
import os, sys, re
import json
import gensim
import nltk
import networkx as nx
import similarityCalLib
import pandas as pd
import numpy as np
import time
import requests

from math import log10
from sklearn.manifold import TSNE
from dateutil.parser import parse
from dateutil import relativedelta
from collections import Counter
from nltk.corpus import stopwords
from gensim.models import LdaModel

nltk.download('stopwords')

print("Finished!")


Importing Libraries...
Finished!


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/POSTecHyeoN/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
########################
# Owner and repository on GitHub; together they form the "repos/<user_name>/<repo_name>/" API path.
user_name = "vuejs"
# Also names the local clone (../GIT-REPOS/<repo_name>) and every file written under ../log/.
repo_name = "vue"
# Base URL of the GitHub REST API. Request paths are appended directly, so keep the trailing slash.
api_url = "https://api.github.com/"
########################

### Collecting log files

In [24]:
print("COLLECTING LOGS...")

# All paths derive from repo_name; the clone is expected under ../GIT-REPOS/<repo_name>.
log_file_name = "../log/" + repo_name + "-gitlog-all-parents-numstat-date.log"
reflog_file_name = "../log/" + repo_name + "-reflog.log"
git_info_path = '../GIT-REPOS/' + repo_name + '/.git'

# Full history of every ref: parent hashes on the "commit" line, fuller headers, per-file numstat.
log_out = subprocess.check_output(['git', '--git-dir', git_info_path,'--no-pager', 'log', '--all', '--parents', '--numstat', '--date-order', '--pretty=fuller', '-c'], encoding="utf-8")
# Written explicitly as UTF-8: the parsing step reads this file back with encoding='UTF8',
# and the locale default encoding may be unable to represent every character of the log.
with open(log_file_name, "w", encoding="utf-8") as log_file:
    log_file.write(log_out)

# One line per decorated commit: "<hash>    <ref names>".
reflog_out = subprocess.check_output(['git', '--git-dir', git_info_path,'--no-pager', 'log', '--simplify-by-decoration', '--tags', '--branches', '--remotes', '--date-order', '--decorate', '--pretty=tformat:"%H    %C(auto)%D%Creset"'], encoding="utf-8")
# No shell is involved, so the double quotes of the tformat string reach the output verbatim.
reflog_out = reflog_out.replace('"',"")
with open(reflog_file_name, "w", encoding="utf-8") as reflog_file:
    reflog_file.write(reflog_out)

print("SUCCESS!!")

COLLECTING LOGS...
SUCCESS!!


### Parsing step

In [25]:
commitDic = {}
commitList = []
commit = None

print("PARSING...")
# Parse the `git log --parents --numstat --pretty=fuller` dump into one dict per commit.
no = 0
with open(log_file_name, "r", encoding='UTF8') as log_lines:
    for line in log_lines:
        line = line.strip("\n")
        if line.startswith("commit"):
            # A new "commit <id> <parent ids...>" header closes the previous record.
            if commit is not None:
                commitList.append(commit)
                commitDic[commit["id"]] = commit

            items = line.split()
            commit = {
                "no": no,
                "id": items[1],
                "parents": items[2:],
                "diffStat": {"changedFileCount":0, "insertions":0, "deletions":0, "files": {}},
                "message": "",
            }
            no = no + 1
        elif line.startswith("Merge"):
            commit["merge"] = line.split()[1:]
        elif line.startswith("Author:"):
            commit["author"] = line.split("Author:     ")[1]
        elif line.startswith("AuthorDate:"):
            commit["authorDate"] = line.split("AuthorDate: ")[1]
        elif line.startswith("Commit:"):
            commit["committer"] = line.split("Commit:     ")[1]
        elif line.startswith("CommitDate:"):
            commit["date"] = line.split("CommitDate: ")[1]
        elif line.startswith("    "):
            # Message lines are the ones indented by four spaces.
            text = line.strip()
            if text.startswith("Signed-off-by:"):
                # Drop the trailer before touching the message, so that skipping it
                # does not leave a dangling newline behind.
                continue
            if text != "" and commit["message"] != "":
                commit["message"] += "\n"
            commit["message"] += text
            commit["message"] = commit["message"].strip()
        elif line != "":
            # numstat line: "<insertions>\t<deletions>\t<file>"; "-" stands in for the counts
            # and is recorded as zero changed lines.
            ins, dels, fileName = line.split("\t")
            if ins == "-":
                ins = 0
                dels = 0
            else:
                ins = int(ins)
                dels = int(dels)
            commit["diffStat"]["changedFileCount"] += 1
            commit["diffStat"]["files"][fileName] = {"insertions":ins, "deletions":dels}
            commit["diffStat"]["insertions"] += ins
            commit["diffStat"]["deletions"] += dels

# Flush the last record; an empty log yields no record at all.
if commit is not None:
    commitList.append(commit)
    commitDic[commit["id"]] = commit

# Attach ref names ("<hash>    <ref>, <ref>, ...") to the commits they decorate.
with open(reflog_file_name, "r", encoding='UTF8') as reflog_lines:
    for line in reflog_lines:
        line = line.strip("\n").strip()
        items = line.split("    ")
        if len(items) > 1:
            branches = []
            tags = []
            refs = items[1].split(", ")
            for ref in refs:
                if ref.startswith("tag: "):
                    tags.append(ref[5:])
                elif ref.startswith("HEAD -> "):
                    commitDic[items[0]]["isHead"] = True
                    # The branch HEAD points at is stored under "tags"; the HEAD check in the
                    # branch-annotation step looks for "master" in that list.
                    tags.append(ref[8:])
                else:
                    branches.append(ref)
            commitDic[items[0]]["branches"] = branches
            commitDic[items[0]]["tags"] = tags

print("GENERATING JSON...")

json_log_file_name = log_file_name + ".json"

# generate json file
with open(json_log_file_name, "w") as json_file:
    json_file.write(json.dumps(commitList, indent=4, separators=(',', ': ')))

print("SUCCESS!!")



PARSING...
GENERATING JSON...
SUCCESS!!


### Crawling issue / PR info

In [30]:
from IPython.display import clear_output


def retreive_rate(access_token):
    """Return how many core GitHub API calls are left for the given token.

    Args:
        access_token: query-string suffix (as built from token.txt) that is appended
            verbatim to the "rate_limit" URL.

    Returns:
        int: ``resources.core.remaining`` from the response, or 0 when the rate-limit
        request itself fails. Callers compare the result numerically and wait while
        it is low, so a failed check is reported as "nothing left" instead of None.
    """
    r = requests.get(api_url + "rate_limit" + access_token)
    if not r.ok:
        return 0
    rateItem = json.loads(r.text or r.content)
    return rateItem["resources"]["core"]["remaining"]
        
def store_temp(info, info_str):
    """Checkpoint ``info`` as tab-indented JSON in ../log/<repo_name>.<info_str>_temp.json."""
    temp_path = "../log/" + repo_name + "." + info_str + "_temp.json"
    with open(temp_path, "w", encoding="utf-8") as temp_json:
        json.dump(info, temp_json, indent="\t")

def store(info, info_str, temp_exists = True):
    """Write the final JSON file for ``info`` and remove its checkpoint.

    Args:
        info: JSON-serialisable data to store.
        info_str: kind of data; it selects the file name. "pulls" is written to
            ../log/<repo_name>.pulls_raw.json, anything else to
            ../log/<repo_name>.<info_str>.json.
        temp_exists: when true, delete the matching checkpoint file if it is present.
    """
    log_prefix = "../log/" + repo_name + "."
    # store_temp names the checkpoint after the caller's info_str, so build its path
    # before "pulls" is renamed to "pulls_raw" for the output file.
    temp_path = log_prefix + info_str + "_temp.json"
    if info_str == "pulls":
        info_str = "pulls_raw"
    with open(log_prefix + info_str + ".json", "w", encoding="utf-8") as info_json:
        json.dump(info, info_json, indent="\t")
    if temp_exists and os.path.isfile(temp_path):
        os.remove(temp_path)
    



## Get Access Token (stored in .gitignore)
# The first line of ./token.txt becomes a query-string suffix; trailing whitespace
# (including the line break) is removed because the value is appended to URLs as is.
with open('./token.txt', "r") as token_file:
    access_token = ("?access_token=" + token_file.readline()).rstrip()
    

# issue / issues -> info
def crawler(info_str): 
    """Download every item of one kind (e.g. "issues") from the GitHub API, one request per number.

    Progress is checkpointed with store_temp and resumed from
    ../log/<repo_name>.<info_str>_temp.json when that file can be read; the complete
    list is written with store.

    Args:
        info_str: path segment of the API collection; it is also used in the file names.

    Returns:
        None. Results are only written to disk.
    """
    info = []
    finished = False
    
    base_url = api_url + "repos/" + user_name + "/" + repo_name + "/" + info_str + "/"
    end_url = "/comments"  # not used anywhere in this function

    # Resume from a previous checkpoint if there is one; any read/parse failure means "start fresh".
    try:
        with open("../log/"+ repo_name +"." + info_str + "_temp.json", "r", encoding="utf-8") as info_json:
            info = json.load(info_json)
    except:
        print("Starting issue crawling...")

    # Crawling continues at item number len(info) + 1.
    # NOTE(review): this presumes the checkpoint holds items 1..len(info) without gaps - confirm.
    idx_current = len(info)
    i = idx_current
    
    issue_num = None
    
    # The collection listing (base_url without its trailing slash) is only used to learn the last number.
    issue_r = requests.get(base_url[:-1] + access_token + "&state=all")
    if(issue_r.text == "[]"):
        # Nothing to crawl: store an empty result and stop.
        store([], info_str)
        return
        
    
    if(issue_r.ok):
        repoItem = json.loads(issue_r.text or issue_r.content)
        # Number taken from the URL of the first listed item.
        # NOTE(review): presumably the newest item, relying on the API's default ordering - confirm.
        issue_num = int(repoItem[0]["url"].split("/")[-1])
        print("Total Issue Number: " + str(issue_num))
    # NOTE(review): if the listing request failed, issue_num stays None and the first
    # `issue_num + 1` below raises TypeError, which the bare except at the bottom absorbs.
        
    try:  
        while True:
            i += 1
            if i % 100 == 0:
                clear_output()
                print("Currently crawling " + info_str + "...")
            print("Crawled " + base_url + str(i)) if(i % 10 == 0 or i == 1) else None

            r = requests.get(base_url + str(i) + access_token)
            if(r.ok):  # Success!!
                repoItem = json.loads(r.text or r.content)
                info.append(repoItem)
            else:      # Failed!!
                store_temp(info, info_str)  ## First of all, store current info
                # First number past the end: write the final file (store also removes the checkpoint).
                if(i == issue_num + 1):
                    store(info,info_str)
                if(retreive_rate(access_token) < 3):  # Maybe rate limit exceed
                    # Poll every 3 minutes until more than 2000 calls are available again.
                    # NOTE(review): i was already incremented, so the item that failed is not requested again.
                    while True:
                        print("Wait until the api rate restores...[3 minutes]")
                        time.sleep(180)
                        remaining_rate = retreive_rate(access_token)              
                        print("Remaining API Rate: " + str(remaining_rate) + " times")
                        if(remaining_rate > 2000):
                            break
                else:
                    # Quota is fine, so the failure belongs to this item: stop once past the
                    # last known number, otherwise skip it and go on with the next one.
                    if i > issue_num:
                        finished = True
                        break
    except:
        # A bare except also catches KeyboardInterrupt, so interrupting the kernel
        # ends up here and the progress made so far is checkpointed.
        store_temp(info, info_str)

    if  finished:
        print("Issue Crawling finished!!!")
        store(info, info_str)
    else:
        print("Not finished yet...\nCurrent issue #: " + str(len(info)))
        store_temp(info, info_str)
        remaining_rate = retreive_rate(access_token)              
        print("Remaining API Rate: " + str(remaining_rate) + " times")

### Crawling Issues

# Issues are crawled only while the final issues.json is absent.
issues_path = "../log/"+ repo_name +".issues.json"
if not os.path.isfile(issues_path):
    crawler("issues")

### Crawling pull infos

# Once issues.json exists, collect the html_url of every issue that is a pull request,
# unless that list or the final pulls.json is already on disk.
pulls_html_temp_path = "../log/"+ repo_name +".pulls_html_temp.json"
pulls_path = "../log/" + repo_name + ".pulls.json"
if os.path.isfile(issues_path) and not (os.path.isfile(pulls_html_temp_path) or os.path.isfile(pulls_path)):
    print("Extracting PR infos from issues.json")
    with open(issues_path, "r", encoding="utf-8") as issue_json:
        issues = json.load(issue_json)
        pulls_html = [issue["html_url"] for issue in issues if "pull_request" in issue]
        print("Storing PR data...")
        store_temp(pulls_html, "pulls_html")
        print("PR html url Temporary Crawling finished!!")

# Crawl the full PR objects for the URLs collected in pulls_html_temp.json, unless the
# final pulls.json already exists. Progress is checkpointed in pulls_temp.json.
if(os.path.isfile("../log/" + repo_name + ".pulls_html_temp.json") and not os.path.isfile("../log/" + repo_name + ".pulls.json")):
    with open("../log/" + repo_name + ".pulls_html_temp.json", "r", encoding="utf-8") as pulls_htmls_json:
        pulls_htmls = json.load(pulls_htmls_json)
        pulls = []
        pulls_finished = False
        # Resume from a previous checkpoint if there is one; any failure means "start fresh".
        try:
            with open("../log/" + repo_name + ".pulls_temp.json", "r", encoding="utf-8") as pulls_json:
                pulls = json.load(pulls_json)
        except:
            print("Starting pull crawling...")
            print("PR Total #: " + str(len(pulls_htmls)))
            
        # Number of the last PR already stored; URLs with a number up to this one are skipped.
        # NOTE(review): this presumes pulls_htmls is ordered by ascending PR number - confirm.
        current_num = -1
        if len(pulls) > 0:
            current_num = pulls[-1]["number"]
        
        
        base_url_pulls = api_url + "repos/" + user_name + "/" + repo_name + "/pulls/"
        try:    
            for (idx, html) in enumerate(pulls_htmls):
                
                if(idx % 10 == 0 and idx > 5):
                    clear_output()
                    print("Currently crawling pulls...")

                # The PR number is the last path segment of the html url.
                if current_num < int(html.split("/")[-1]):
                    r = requests.get(base_url_pulls + html.split("/")[-1] + access_token)
                    if(r.ok):  # Success!!
                        print("Crawled " + base_url_pulls + html.split("/")[-1])
                        repoItem = json.loads(r.text or r.content)
                        pulls.append(repoItem)
                    else:      # Failed!!
                        store_temp(pulls, "pulls")  ## First of all, store current info
                        if(retreive_rate(access_token) < 3):  # Maybe rate limit exceed
                            # Poll every 3 minutes until more than 2000 calls are available again.
                            # NOTE(review): the PR that failed is not requested again afterwards.
                            while True:
                                print("Wait until the api rate restores...[3 minutes]")
                                time.sleep(180)
                                remaining_rate = retreive_rate(access_token)              
                                print("Remaining API Rate: " + str(remaining_rate) + " times")
                                if(remaining_rate > 2000):
                                    break
                        else:
                            # NOTE(review): any failure that is not a rate-limit problem ends the
                            # crawl and marks it as finished, even if URLs remain - confirm intent.
                            pulls_finished = True
                            break
                if idx == len(pulls_htmls) - 1:
                    pulls_finished = True
        except:
            # A bare except also catches KeyboardInterrupt, so an interrupted run is checkpointed.
            store_temp(pulls, "pulls")   
            
        if  pulls_finished:
            print("PR Crawling finished!!!")
            # store() writes "pulls" data to <repo_name>.pulls_raw.json.
            store(pulls, "pulls")
            os.remove("../log/" + repo_name + ".pulls_html_temp.json")
        else:
            print("Not finished yet...\nCurrent pull #: " + str(len(pulls)) + " / " + str(len(pulls_htmls)))
            store_temp(pulls, "pulls")
            remaining_rate = retreive_rate(access_token)              
            print("Remaining API Rate: " + str(remaining_rate) + " times")
            
        # With no PR urls the loop never runs, so an empty result is stored explicitly.
        if (pulls_htmls == []):
            store([], "pulls")
                                          
print("Crawling Finished!!!!!!!")



Currently crawling pulls...
Crawled https://api.github.com/repos/vuejs/vue/pulls/11346
Crawled https://api.github.com/repos/vuejs/vue/pulls/11349
Crawled https://api.github.com/repos/vuejs/vue/pulls/11351
PR Crawling finished!!!
Crawling Finished!!!!!!!


### Reducing raw PR data to compact PR data

In [31]:


print("reducing data...and")

# Load the raw PR objects produced by the crawling step.
with open("../log/" + repo_name + ".pulls_raw.json", "r", encoding="utf-8") as pulls_json:
    raw_pulls = json.load(pulls_json)

# Keep only the fields used later; "message" is the title plus the body, or the title
# alone when the PR has no body.
pulls_compact_data = [
    {
        "number": item["number"],
        "state": item["state"],
        "title": item["title"],
        "body": item["body"],
        "message": item["title"] if item["body"] is None else item["title"] + " " + item["body"],
        "merge_commit_sha": item["merge_commit_sha"],
        "head": {"sha": item["head"]["sha"]},
        "base": {"sha": item["base"]["sha"]},
        "commitsLink": item["_links"]["commits"]["href"],
        "merged": item["merged"],
    }
    for item in raw_pulls
]

with open("../log/"+ repo_name + "." + "pulls_compress.json","w", encoding="utf-8") as info_json:
    json.dump(pulls_compact_data, info_json, indent="\t")

print("finished!!")

reducing data...and
finished!!


### NLP Analysis step

In [32]:
# Same file names as in the collection and parsing steps.
log_file_name = "../log/" + repo_name + "-gitlog-all-parents-numstat-date.log"
json_log_file_name = log_file_name + ".json"

# Tokens dropped from every corpus: NLTK's English stopwords, a few punctuation
# tokens, and every single lowercase letter.
punctuation_tokens = [',', '.', '\'', '-', '<', '>']  # to be added
single_characters = list("abcdefghijklnmopqrstuvwxyz")
custom_stopwords = stopwords.words('english') + punctuation_tokens + single_characters

def commit_analysis(filename, outfile):
    """Label every commit with a "commitType" and hand the result to topic_analysis.

    The first entry of keyword_list that occurs in the lower-cased commit message
    selects the label from keyword_dict; commits matching no keyword are labelled
    "not_mapped".

    Args:
        filename: JSON file holding the list of commit dicts to read.
        outfile: path of the labelled JSON file to write; it is then processed by
            topic_analysis on the "message" field.
    """
    with open(filename) as source_json:
        json_data = json.load(source_json)

    for commit in json_data:
        lowered_message = commit["message"].lower()
        matched = next((kw for kw in keyword_list if kw in lowered_message), None)
        commit["commitType"] = "not_mapped" if matched is None else keyword_dict[matched]

    with open(outfile, "w") as labelled_json:
        json.dump(json_data, labelled_json, indent=4)

    topic_analysis(outfile, "message")

def topic_analysis(filename, typeName):
    """Tokenise one text field of every record in a JSON file and store the tokens.

    Each record gets a "corpus" list built from ``record[typeName]``, and the file is
    rewritten in place. The gensim dictionary built afterwards is only used to print
    its size before and after dropping words that occur more than ``min_count`` times;
    it is neither returned nor stored.

    Args:
        filename: JSON file holding a list of dicts; it is overwritten with the
            augmented records.
        typeName: key of the text field to tokenise (e.g. "message").
    """
    min_count = 40
    commit_list = []
    with open(filename) as json_file:
        json_data = json.load(json_file)

    for commit in json_data:
        text = commit[typeName].lower().replace("'s", " ")
        # Keep letters, digits, "_", "#", "." and "-"; everything else becomes a space.
        chars = list(re.sub(r'[^a-zA-Z0-9_#\.\-]', ' ', text))
        # A "#" survives only when a digit follows it directly (an issue reference).
        for idx, char in enumerate(chars):
            if char == "#" and not (idx + 1 < len(chars) and chars[idx + 1].isdigit()):
                chars[idx] = " "

        words = [word for word in "".join(chars).split() if word not in custom_stopwords]
        # Drop a single trailing "." (sentence end) from each token.
        filtered_corpus = [word[:-1] if word[-1] == "." else word for word in words]

        commit["corpus"] = filtered_corpus
        commit_list.append(filtered_corpus)

    # Opening with "w" truncates the file, so it does not need to be removed first.
    with open(filename, "w") as json_file:
        json.dump(json_data, json_file, indent=4)

    dictionary = gensim.corpora.Dictionary(commit_list)
    print('dictionary size : %d' % len(dictionary))

    word_counter = Counter(word for words in commit_list for word in words)

    # Ids of the words that occur more than min_count times across all records.
    removal_word_idxs = { dictionary.token2id[word] for word, count in word_counter.items() if count > min_count }
    dictionary.filter_tokens(removal_word_idxs)
    dictionary.compactify()
    print('filtered dictionary size : %d' % len(dictionary))

    # Debug aid: print every (word, id) pair left in the filtered dictionary.
    #     for k, v in dictionary.token2id.items():
    #         print(k, v)

"""
        common_corpus = [dictionary.doc2bow(text) for text in commit_list]
        lda = LdaModel(common_corpus, num_topics=10, id2word=dictionary)
        for topic in lda.print_topics():
                print(topic)
"""


# Keyword stems per commit category. The order matters: commit_analysis uses the first
# entry of keyword_list that occurs in a message, so categories and stems are tried in
# the order written here.
keyword_groups = {
    "forward_engineering": [
        "implement", "add", "request", "new", "test", "start", "includ", "initial",
        "introduc", "creat", "increas",
    ],
    "reengineering": [
        "optimiz", "adjust", "update", "delet", "remov", "chang", "refactor", "replac",
        "modif", "is now", "are now", "enhance", "improv", "design", "change", "renam",
        "eliminat", "duplicat",
        "restructur",  # spelled so that it matches "restructure" / "restructuring"
        "simplif", "obsolete", "rearrang", "miss", "enhanc", "improv",
    ],
    "corrective_engineering": [
        "bug",
        "fix",  # a bare "x" would match every message containing that letter
        "issue", "error", "correct", "proper", "deprecat", "broke",
    ],
    "management": [
        "clean", "license", "merge", "release", "structure", "integrat", "copyright",
        "documentation", "manual", "javadoc", "comment", "migrat", "repository",
        "code review", "polish", "upgrade", "style", "formatting", "organiz", "todo",
    ],
}

# Flat, ordered list of stems and the stem -> category lookup used by commit_analysis.
keyword_list = [keyword for keywords in keyword_groups.values() for keyword in keywords]
keyword_dict = {
    keyword: class_label
    for class_label, keywords in keyword_groups.items()
    for keyword in keywords
}

print("NLP ANALYSIS")
# Commits: label each one and tokenise its message into <repo_name>.nlp.json.
commit_analysis(json_log_file_name, "../log/" + repo_name + ".nlp.json")

# Pull requests: tokenise the "message" field of pulls_compress.json in place.
topic_analysis("../log/"+ repo_name + "." + "pulls_compress.json", "message")
print("SUCCESS!!")

NLP ANALYSIS
dictionary size : 7763
filtered dictionary size : 7488
dictionary size : 7585
filtered dictionary size : 7340
SUCCESS!!


### Connecting pull / issue info with commit history

In [33]:
with open('./token.txt', "r") as token_file:
    # readline() keeps the line break; strip it because the value is appended to URLs as is.
    access_token = ("?access_token=" + token_file.readline()).rstrip()

def add_issue():
    """Attach the issue numbers referenced in each commit message.

    Loads ../log/<repo_name>.nlp.json and gives every commit an "issues" list holding
    the digits of each "#<number>" occurrence in its message, as strings, in order of
    appearance (duplicates are kept).

    Returns:
        The loaded commit records with the "issues" field added.
    """
    origin_file_name = "../log/" + repo_name + ".nlp.json"
    # Raw string: "\d" is not a valid escape sequence in an ordinary string literal.
    issue_reg = re.compile(r"#\d+")

    print("Adding issue info to the commit history")

    with open(origin_file_name) as origin_commit_file:
        origin_commits = json.load(origin_commit_file)

    for commit in origin_commits:
        # Drop the leading "#" of every match.
        commit["issues"] = [ref[1:] for ref in issue_reg.findall(commit["message"])]

    return origin_commits

def add_pull(origin_commits):
    """Record on every commit the pull requests that contain it, then write the result.

    For each entry of ../log/<repo_name>.pulls_compress.json the PR's "commitsLink" is
    requested; every returned commit whose sha is present in ``origin_commits`` gets the
    PR number appended to its "pulls" list. The annotated commits are written to
    ../log/<repo_name>.nlp.withissue.json.

    A failed request is followed by a wait until the API quota is above 2000 calls and
    then by one retry; a pull whose retry also fails is skipped.

    Args:
        origin_commits: list of commit dicts (each with an "id" key); modified in place.
    """
    origin_pull_file_name = "../log/" + repo_name + ".pulls_compress.json"
    final_file_name = "../log/" + repo_name + ".nlp.withissue.json"

    # Commit sha -> position in origin_commits.
    sha2Index = {commit["id"]: idx for idx, commit in enumerate(origin_commits)}

    print("Adding PR info to the commit history")
    with open(origin_pull_file_name) as pull_info_file:
        pulls_info = json.load(pull_info_file)

    print("Total pull #: " + str(len(pulls_info)))
    for idx, pull in enumerate(pulls_info):
        link = pull["commitsLink"]
        r = requests.get(link + access_token)
        if not r.ok:
            while True:
                print("Wait until the api rate restores...[3 minutes]")
                time.sleep(180)
                remaining_rate = retreive_rate(access_token)
                print("Remaining API Rate: " + str(remaining_rate) + " times")
                # A failed rate check may not yield a number; keep waiting in that case.
                if remaining_rate is not None and remaining_rate > 2000:
                    break
            # One retry, so the pull is not lost just because the quota had run out.
            r = requests.get(link + access_token)

        if r.ok:
            for commit_info in json.loads(r.text or r.content):
                try:
                    index = sha2Index[commit_info["sha"]]
                except (KeyError, TypeError):
                    # Commit is not part of the local history (or the entry is malformed).
                    continue
                origin_commits[index].setdefault("pulls", []).append(int(pull["number"]))

        if idx % 10 == 0:
            print("Pull #" + str(idx) + " handled")

    print("Printing result...")
    # The context manager flushes and closes the file before the next step reads it.
    with open(final_file_name, "w") as final_file:
        final_file.write(json.dumps(origin_commits, indent=4, separators=(',', ': ')))
    print("finished!!")

                       
        
# Annotate commits with issue references, then with PR numbers (add_pull writes the
# combined result), and finally delete the intermediate .nlp.json that add_issue read.
commits_with_issues = add_issue()
add_pull(commits_with_issues)
os.remove("../log/" + repo_name + ".nlp.json")

Adding issue info to the commit history
Adding PR info to the commit history
Total pull #: 1793
Pull #0 handled
Pull #10 handled
Pull #20 handled
Pull #30 handled
Pull #40 handled
Pull #50 handled
Pull #60 handled
Pull #70 handled
Pull #80 handled
Pull #90 handled
Pull #100 handled
Pull #110 handled
Pull #120 handled
Pull #130 handled
Pull #140 handled
Pull #150 handled
Pull #160 handled
Pull #170 handled
Pull #180 handled
Pull #190 handled
Pull #200 handled
Pull #210 handled
Pull #220 handled
Pull #230 handled
Pull #240 handled
Pull #250 handled
Pull #260 handled
Pull #270 handled
Pull #280 handled
Pull #290 handled
Pull #300 handled
Pull #310 handled
Pull #320 handled
Pull #330 handled
Pull #340 handled
Pull #350 handled
Pull #360 handled
Pull #370 handled
Pull #380 handled
Pull #390 handled
Pull #400 handled
Pull #410 handled
Pull #420 handled
Pull #430 handled
Pull #440 handled
Pull #450 handled
Pull #460 handled
Pull #470 handled
Pull #480 handled
Pull #490 handled
Pull #500 handl

### Adding TF-IDF info

In [34]:
# Global variable for corpus.
# Shared vocabulary: word -> [index, idf]. The index is assigned once, in order of first
# appearance, and is common to every generator; the idf slot holds the value computed by
# the most recently built generator whose documents contain the word.
corpusDfDict = {}


class tfIdfGenerator:
    """Computes tf-idf weights for a list of records that carry a "corpus" token list.

    Word indices come from the shared ``corpusDfDict`` so that vectors of different
    collections use the same index space. Document frequencies and idf values are
    kept per instance: building a second generator does not alter the idf values of
    the first one.
    """

    commitList = None
    jsonData = None

    def __init__(self, jsonData):
        """Register the words of ``jsonData`` and compute their idf for this collection.

        Args:
            jsonData: iterable of dicts, each with a "corpus" list of words.
        """
        self.commitList = list(jsonData)  # for reserving the order of commits
        self.jsonData = jsonData
        commitNum = len(self.commitList)

        # Number of documents of THIS collection that contain each word.
        docFreq = {}
        for commit in self.commitList:
            seenWords = set()
            for word in commit["corpus"]:
                if word in seenWords:
                    continue
                seenWords.add(word)
                if word not in corpusDfDict:
                    corpusDfDict[word] = [len(corpusDfDict), 0]  # idx / idf list
                docFreq[word] = docFreq.get(word, 0) + 1

        # idf = log10(N / df), with N and df both taken from this collection only.
        self.idfByWord = {}
        for word, df in docFreq.items():
            idf = log10(commitNum / df)
            self.idfByWord[word] = idf
            corpusDfDict[word][1] = idf

    def addTfIdfInfo(self):
        """Add a "tfidf" dict (word index -> tf * idf) to every record.

        Each occurrence of a word adds its idf once, so the stored value is the term
        count times the idf. A record with an empty corpus gets ``{-1: 0}``.

        Returns:
            The same ``jsonData`` object passed to the constructor, modified in place.
        """
        for commit in self.jsonData:
            currentIdx2TfIdf = {}
            for word in commit["corpus"]:
                idx = corpusDfDict[word][0]
                currentIdx2TfIdf[idx] = currentIdx2TfIdf.get(idx, 0) + self.idfByWord[word]
            if len(currentIdx2TfIdf) == 0:
                # Placeholder entry for records without any word.
                currentIdx2TfIdf[-1] = 0
            commit["tfidf"] = currentIdx2TfIdf

        return self.jsonData
    
            
            
# Build both generators first: each one registers its words in the shared
# corpusDfDict, so the vocabulary is complete before anything is written.
with open("../log/" + repo_name + ".nlp.withissue.json") as jsonCommitFile:
    jsonCommitData = json.load(jsonCommitFile)
tfidfCommit = tfIdfGenerator(jsonCommitData)

with open("../log/" + repo_name + ".pulls_compress.json") as jsonPullFile:
    jsonPullData = json.load(jsonPullFile)
tfidfPull = tfIdfGenerator(jsonPullData)

# Words in insertion order, so a word's position equals the index stored for it
# in corpusDfDict.
idx2corpus = list(corpusDfDict)

with open("../log/" + repo_name + ".corpus.json", "w") as json_idx2corpus_file:
    json_idx2corpus_file.write(json.dumps(idx2corpus, indent=4, separators=(',', ': ')))

# Commits with their tf-idf vectors.
jsonWithTfidfCommit = tfidfCommit.addTfIdfInfo()
with open("../log/" + repo_name + ".commits.json", "w") as commitFile:
    commitFile.write(json.dumps(jsonWithTfidfCommit, indent=4, separators=(',', ': ')))

# Pull requests with their tf-idf vectors.
jsonWithTfidfPull = tfidfPull.addTfIdfInfo()
with open("../log/" + repo_name + ".pulls.json", "w") as pullFile:
    pullFile.write(json.dumps(jsonWithTfidfPull, indent=4, separators=(',', ': ')))

print("Sucess!!")


# os.remove("../log/" + repo_name + ".nlp.withissue.json")
# os.remove("../log/" + repo_name + "-reflog.log")
# os.remove("../log/" + repo_name + "-gitlog-all-parents-numstat-date.log")
# os.remove("../log/" + repo_name + "-gitlog-all-parents-numstat-date.log.json")

Sucess!!


### Preparing for similarity calculation

In [35]:
###########################
### Exporting json data ###
###########################

jsonFileName = "../log/" + repo_name + ".commits.json"


with open(jsonFileName) as jsonFile:
    jsonData = json.load(jsonFile)

print("Commit History Length: ", len(jsonData))

################################################
### Constructing commit history BRANCH graph ###
################################################

# dags: undirected graph with an edge between every commit and each of its parents.
# firstParentDags: directed graph with one edge per commit, first parent -> commit.
dags = nx.Graph()
firstParentDags = nx.DiGraph()

commitDic = {}
commitList = []
headId = ""

# sorted in back order
for commit in jsonData:
    commitList.append(commit) # for reserving the order of commits
    commitDic[commit["id"]] = commit
    parents = commit["parents"]

    dags.add_node(commit["id"])
    firstParentDags.add_node(commit["id"])
    # Register the parents explicitly. (A bare map(dags.add_node, ...) would do nothing:
    # map is lazy in Python 3 and its result would never be consumed.)
    dags.add_nodes_from(parents)
    if parents:
        firstParentDags.add_node(parents[0])

    for p in parents:
        dags.add_edge(p, commit["id"])
    if parents:
        firstParentDags.add_edge(parents[0], commit["id"])

    # The parsing step sets "isHead" on the commit HEAD points at.
    if "isHead" in commit:
        headId = commit["id"]
        
#####################################        
### branch/tag history annotation ###
#####################################

def branchBackTracking(node):
    """Copy ``node``'s branch list onto its first parent when that parent has none.

    Exactly one step is taken per call; nothing happens when the commit has no
    first parent in firstParentDags or when the parent already has "branches".
    NOTE(review): longer chains presumably get filled because the caller loops over
    commitList and later reaches the parent that was just annotated - confirm that
    children always precede their parents in that list.

    Args:
        node: commit dict with "id" and "branches" keys.
    """
    parents = list(firstParentDags.predecessors(node["id"]))
    if len(parents) == 0:
        return
    parent = commitDic[parents[0]]
    if "branches" in parent:
        return
    # The parent shares the very same list object with the child (no copy is made).
    parent["branches"] = node["branches"]
    

# NOTE(review): headId is still "" when no commit carries "isHead"; this lookup then raises KeyError.
headCommit = commitDic[headId]

# check head is origin/master. else quit
# The parsing step stores the branch HEAD points at under "tags", hence the lookup there.
# The warning is printed when either name is missing; execution continues regardless.
if not "master" in headCommit.get("tags", []) or not "origin/master" in headCommit.get("branches", []):
    print("HEAD IS NOT ORIGIN/MASTER OR MASTER")
    #quit()


# origin/master
# Walk the first-parent chain from HEAD down to the root and label every ancestor.
node = headCommit
while(True):
    parents = list(firstParentDags.predecessors(node["id"]))
    if len(parents) == 0:
        break
    parent = commitDic[parents[0]]
    parent["branches"] = parent.get("branches", [])  # if the commit has no branches yet, give it an empty list
    # NOTE(review): the label is the literal "origin/mrtk_development" although the section
    # comment above says origin/master - looks like a leftover from another repository; confirm.
    parent["branches"].append("origin/mrtk_development")
                               
    node = parent


# other branches
# Every commit that carries branches (except the last one in the list) hands them
# one step down to its first parent.
for commit in commitList[:-1]:
    if "branches" in commit:
        branchBackTracking(commit)
        
        
### insert all pairs path length into commit ###

# Shortest path lengths in the undirected commit graph, limited to 10 hops per source.
allPairsIterator = nx.all_pairs_dijkstra_path_length(dags, cutoff=10)
allPairsDic = {}           # source id -> {reachable id: path length}
longestPathLength = 0      # largest path length seen for any source
print("")
print("Calculating pair length...")
for idx, (sourceId, lengthById) in enumerate(allPairsIterator):
    allPairsDic[sourceId] = lengthById
    farthest = max(lengthById.values())
    if farthest > longestPathLength:
        longestPathLength = farthest
    if idx % 400 == 0:
        print("iter " + str(idx) + " finished")

print("finished!!")
# print(allPairsDic["e163637d1537ccc9ccde1d25ca87b82286aa634d"])

# for key in allPairsDic.keys():
#     print(key, len(allPairsDic[key].keys()))


Commit History Length:  10521
HEAD IS NOT ORIGIN/MASTER OR MASTER

Calculating pair length...
iter 0 finished
iter 400 finished
iter 800 finished
iter 1200 finished
iter 1600 finished
iter 2000 finished
iter 2400 finished
iter 2800 finished
iter 3200 finished
iter 3600 finished
iter 4000 finished
iter 4400 finished
iter 4800 finished
iter 5200 finished
iter 5600 finished
iter 6000 finished
iter 6400 finished
iter 6800 finished
iter 7200 finished
iter 7600 finished
iter 8000 finished
iter 8400 finished
iter 8800 finished
iter 9200 finished
iter 9600 finished
iter 10000 finished
iter 10400 finished
finished!!


### Calculating Similarity 

In [36]:
scoreFullFileName = "../log/" + repo_name + ".score.json"

# Days between the first and the last commit of commitList.
totalCommitDays = (parse(commitList[0]["date"]) - parse(commitList[-1]["date"])).days
matrix = {}

# Only pairs exactly this many edges apart in the undirected commit graph are scored.
topoDistance = 1

# Commit id -> position in commitList, so the neighbours of a commit can be taken
# straight from allPairsDic instead of testing every commit against every other one.
# Assumes commit ids are unique within commitList.
indexById = {target["id"]: pos for pos, target in enumerate(commitList)}

print("Final Simialirty calculating...")
print("Total iteration :", len(commitList), "times")
for idx, commit in enumerate(commitList):
    if idx % 1000 == 0:
        print("Iteration " + str(idx) + " finished!!")

    # Path lengths from this commit; a commit without an entry has nothing to score.
    distById = allPairsDic.get(commit["id"], {})

    # Positions of the commits at the requested distance, in commitList order.
    neighbourIdxs = sorted(
        indexById[targetId]
        for targetId, dist in distById.items()
        if targetId in indexById and abs(dist) == topoDistance and dist != 0
    )

    scoreList = {}
    for idx_2 in neighbourIdxs:
        target_commit = commitList[idx_2]
        scoreDic = similarityCalLib.Scores()
        scoreDic.calSimilarityScore(commit, target_commit, totalCommitDays, longestPathLength, distById)
        # Sub-scores rounded to three decimals. The other attributes of the score
        # object (commitDate, branch, cloc, topoDist, scoreSum) are not exported.
        scoreList[idx_2] = {
            "author": round(scoreDic.author * 1000) / 1000,
            "commitType": round(scoreDic.commitType * 1000) / 1000,
            "file": round(scoreDic.file * 1000) / 1000,
            "message": round(scoreDic.message * 1000) / 1000,
        }

    matrix[idx] = scoreList

print("\nFinished!! Writing file...")
with open(scoreFullFileName, "w") as f:
    f.write(json.dumps(matrix, indent=1, separators=(',', ':')))
print("Success!!")

Final Simialirty calculating...
Total iteration : 10521 times
Iteration 0 finished!!


  return np.interp(np.log10(diffDays), (0, np.log10(totalCommitDays) ), (1, 0))


Iteration 1000 finished!!
Iteration 2000 finished!!
Iteration 3000 finished!!
Iteration 4000 finished!!
Iteration 5000 finished!!
Iteration 6000 finished!!
Iteration 7000 finished!!
Iteration 8000 finished!!
Iteration 9000 finished!!
Iteration 10000 finished!!

Finished!! Writing file...
Success!!


### (CURRENTLY OPTIONAL) T-SNE STEP

In [12]:
# tsne = TSNE(n_components = 2, verbose=1, perplexity=40, n_iter=300)
# res = tsne.fit_transform(matrix)
# res.tolist()

# resList = []
# for l in res:
#     resList.append(str(l))
# print(resList)

# coFileName = "../log/" + projectName + ".co.json"


# f = open(coFileName , "w")
# f.write(json.dumps(res.tolist(), indent=1, separators=(',', ':')))
# f.close();
