In [1]:
import json
import re
from datetime import datetime
import concurrent.futures

In [2]:
# Read the raw dump as a list of text lines. The encoding is pinned to UTF-8
# so that decoding does not depend on the platform's locale default.
with open('../linux-commits-2023-11-12.json', encoding='utf-8') as f:
    all_commits = f.readlines()

In [3]:
# Decode every raw line with json.loads, keeping the records in file order.
all_commits_dict_list = [json.loads(raw_line) for raw_line in all_commits]

# Index the same records by the first 12 characters of their commit hash.
# When two records share a 12-character prefix, the later one wins.
all_commits_map = {
    record['data']['commit'][0:12]: record
    for record in all_commits_dict_list
}

In [4]:
def getFullHash(short_hash):
    """Return the first key of all_commits_map that starts with short_hash.

    Keys are visited in the dictionary's iteration order and the first
    prefix match is returned as-is. None is returned when no key matches.
    """
    candidates = (known for known in all_commits_map if known.startswith(short_hash))
    return next(candidates, None)

In [5]:
# Sanity check: look up an abbreviated (7-character) hash among the keys of all_commits_map.
getFullHash("449fc48")

'449fc48866f7'

In [6]:
# strptime layout of the date strings handed to datesDistance,
# e.g. 'Sun Nov 12 10:00:00 2023 +0000'.
date_format = '%a %b %d %H:%M:%S %Y %z'


def datesDistance(bfc_date, bic_date):
    """Return bfc_date minus bic_date as a datetime.timedelta.

    Both arguments are strings laid out according to date_format. The
    result is negative when bfc_date is the earlier of the two instants.
    """
    fix_moment, intro_moment = (
        datetime.strptime(stamp, date_format) for stamp in (bfc_date, bic_date)
    )
    return fix_moment - intro_moment

In [22]:
# Patterns for the "Fixes:" tag variants handled by analyzeCommit.
# Raw strings are used so that "\S" and "\w" reach the regex engine untouched
# instead of being parsed as (invalid) string escape sequences.
_FIXES_RE = re.compile(r"Fixes:[^\S\n]+(\w+)")
_FIXES_COMMIT_RE = re.compile(r"Fixes:[^\S\n]+commit (\w+)")
_FIXES_DOUBLE_RE = re.compile(r"Fixes: Fixes:[^\S\n]+(\w+)")
_FIXES_LINUX_NEXT_RE = re.compile(r"Fixes: linux-next commit[^\S\n]+(\w+)")
_FIXES_LINE_RE = re.compile(r"Fixes: (.*)")

# Words found right after "Fixes:" that are known not to be commit hashes.
_NOT_A_HASH = frozenset([
    'IRQ', 'NB', 'SLI', 'Bug', 'line', 'tag', 'tags', 'discovery', 'drivers',
    'igt', 'Bugzilla', 'correctly', 'computation', 'terminate', 'Configure',
    'addresses',
])


def analyzeCommit(commit):
    """Link a bug-fixing commit (BFC) to the commit named in its "Fixes:" tag (BIC).

    Only the first "Fixes:" occurrence in the message is considered.

    Parameters
    ----------
    commit : dict
        Parsed commit record. Reads commit['data']['commit'],
        commit['data']['message'] and commit['data']['CommitDate'].

    Returns
    -------
    dict or None
        A dict with keys 'BFC_hash', 'BIC_hash', 'BFC_comment', 'BIC_comment'
        (first line of each message) and 'daysDistance' (the .days part of
        BFC CommitDate minus BIC CommitDate). None when the commit has no
        message, has no usable "Fixes:" tag, or the referenced hash is not a
        key of all_commits_map.

    Side effects
    ------------
    Increments the module-level counters http_links, short_commit_hash and
    not_in_git (they must exist before the first call that needs them) and
    prints unresolved short hashes and tags that could not be processed.
    Unexpected errors are reported and swallowed so that one malformed
    record does not abort a whole run.
    """
    global http_links
    global short_commit_hash
    global not_in_git
    message = ''
    try:
        data = commit['data']
        # There are commits without message
        if 'message' not in data:
            return None
        message = data['message']

        match = _FIXES_RE.search(message)
        if match is None:
            return None
        bic_hash = match.group(1)

        # There are fixes with a link to Bugzilla
        if bic_hash.startswith("http"):
            http_links += 1
            return None

        if bic_hash == "commit":
            match = _FIXES_COMMIT_RE.search(message)
            if match is None:
                # "Fixes: commit" that is not followed by " <hash>": report it
                # through the common error path below instead of relying on an
                # accidental AttributeError.
                raise ValueError("no hash after 'Fixes: commit'")
            bic_hash = match.group(1)

        if bic_hash in _NOT_A_HASH:
            return None

        # There are fixes with a commit hash shorter than 12 characters
        if len(bic_hash) < 12:
            # Double 'Fixes: ' (7 cases), then the special linux-next format (3 cases)
            match = _FIXES_DOUBLE_RE.search(message)
            if match is None:
                match = _FIXES_LINUX_NEXT_RE.search(message)
            if match is None:
                # 77 cases, no commits hashes (manually checked)
                if len(bic_hash) < 6:
                    short_commit_hash += 1
                else:
                    print(bic_hash)
                return None
            bic_hash = match.group(1)

        # all_commits_map is keyed by the first 12 characters of the hash
        bic_key = bic_hash[0:12]
        if bic_key not in all_commits_map:
            not_in_git += 1
            return None

        bic = all_commits_map[bic_key]

        delta = datesDistance(data['CommitDate'], bic['data']['CommitDate'])
        return {
            'BFC_hash': data['commit'],
            'BIC_hash': bic['data']['commit'],
            'BFC_comment': message.split('\n', 1)[0],
            # The referenced commit may itself have no message; use an empty
            # comment rather than discarding the link.
            'BIC_comment': bic['data'].get('message', '').split('\n', 1)[0],
            'daysDistance': delta.days
        }

    except Exception as e:
        # Best-effort: report the offending tag together with the actual error
        # and skip the commit. The lookup below may legitimately find nothing
        # (e.g. a tab after "Fixes:"), so it must not be dereferenced blindly.
        fixes_line = _FIXES_LINE_RE.search(message) if isinstance(message, str) else None
        shown = fixes_line.group(0) if fixes_line is not None else repr(message)
        print("Error matching: ", shown, "-", repr(e))
        return None

In [23]:
# Reset the tallies that analyzeCommit updates through `global`, then run it
# over every parsed commit and keep only the commits that produced a link.
short_commit_hash = http_links = not_in_git = 0
links = [
    found
    for found in map(analyzeCommit, all_commits_dict_list)
    if found is not None
]
print("short_commit_hash:",short_commit_hash)
print("http_links:",http_links)
print("not_in_git",not_in_git)

Discovery
bf18525fd79
6384a4d
a71dc65
be79bd048ab
be79bd048
be79bd048
f25b119c
9d8bf54
0ddf03c
42fed7ba44e
b582ef0
b582ef0
ef7f38359
1abe729
29fdf4fbbe
d9a7666f
278b208375
469bdcefdc
af361079
3d49538364
f9ded3b2e7
d6173df35f
b046ffe
d3ab3ffd1d7
1438c2f60b
03510ca07
e4ad1accb
1d99f2436d
115f3f8
aeea64a
0034b29
8a4d0a687a5
c962184
97a5221
4738c1db15
a8fc927780
028e724
0138d8f075
30a70b0
0723a0473f
0406a40a0
9fb6c9c
3c8b06f981
449fc48
449fc48
b912b2f
454aee17f
b893ea5
157e876ffe
e03a9a55b4e
fe6cc55f3a9
fe6cc55f3a9
fe6cc55f3a9
5f2d04f1f9
ceca7b712
d43c6b6
d43c6b6
b9959fd3
3328715e
7e14ea15
fa9ad96d49
df3893c1
9e8269de
9e8269de
afa77ef
cb7094e8
c97102ba963
810d601f07c
5729507
3ca041ed
e13dd8ce
8931bf620
2eacc23
fa658a98a2
b1ce369e82
86f6cf41272
befdf89
dab464b60
b7779d06
b7779d06
c2d421e1718
dd41cc3
f93f160b5
4a9fdbb
21855ff5
9da0763b
7be914f
6d00b56fe
86f6cf4127
2eacc23
1b136de
2d5a5612bc
8c7424cff6
1a1ccc96abb
a848ade408b
a9b0f861
6d00b56fe
6d00b56fe
da4db94
2bd16e3e23
be4000bc464
d933319

In [9]:
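# Disabled alternative: run analyzeCommit on a thread pool ("60s" is the original note, presumably the run time).
# NOTE(review): analyzeCommit updates its counters with `global` and `+=`, which is not atomic across threads.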
# future_results = []
# with concurrent.futures.ThreadPoolExecutor(64) as executor:
#     for commit in all_commits_dict_list:
#         future = executor.submit(analyzeCommit,commit)
#         future_results.append(future)     
#     #print("short_commit_hash:",short_commit_hash)
#     #print("http_links:",http_links)

# links = []
# for future in future_results:
#     r = future.result() 
#     if r is not None:
#         links.append(r)

In [10]:
# Number of commits for which analyzeCommit returned a link.
len(links)

89458

In [11]:
# NOTE(review): `errors` is not assigned anywhere in the visible notebook, so this
# cell raises NameError (see the recorded output); define it or drop the cell.
errors

NameError: name 'errors' is not defined

In [None]:
# How many links point at this one commit as their BIC.
# NOTE(review): presumably the initial git import of the kernel tree — confirm.
count = sum(
    1
    for entry in links
    if entry['BIC_hash'] == '1da177e4c3f41524e886b7f1b8a0c1fc7321cac2'
)
count

## Rare cases
- No commit message: https://github.com/torvalds/linux/commit/7b7abfe3dd81d659a0889f88965168f7eef8c5c6
- Does not exist in the repository (but does exist on GitHub): https://github.com/torvalds/linux/commit/54fe26a900bc528f3df1e4235cb6b9ca5c6d4dc2
- Does not exist anywhere (not even on GitHub): 21d2202158e9