In [1]:
import json
from bs4 import BeautifulSoup 
import requests 
import re 
import os

In [2]:
# Annotated commits exported as a JSON array; each entry carries its hash
# under entry['data']['commit'].
filename = "../linux-commits-2023-11-12_random-filtered-1.json"
with open(filename) as fd:
    # Keep the entries in file order
    annotated_commits_list = list(json.load(fd))
# Index the same entries by commit hash for constant-time lookups
annotated_commits_dict = {
    entry['data']['commit']: entry for entry in annotated_commits_list
}

In [3]:
# Commit date of the first entry in the annotated list
first_commit = annotated_commits_list[0]
print(first_commit['data']['CommitDate'])

Sat Jan 8 17:44:05 2022 +0100


In [4]:
# Commit date of the last entry in the annotated list
last_commit = annotated_commits_list[-1]
print(last_commit['data']['CommitDate'])

Tue Dec 27 12:07:32 2022 -0600


In [5]:
def getBsoupDocument(url,cache=False):
    """Fetch `url` via getHTMLDocument and return it parsed by BeautifulSoup.

    `cache` is forwarded unchanged to getHTMLDocument. The markup is parsed
    with the 'html.parser' backend.
    """
    markup = getHTMLDocument(url, cache)
    soup = BeautifulSoup(markup, 'html.parser')
    return soup

In [6]:
def getHTMLDocument(url,cache=False):
    """Return the text of `url`, optionally served from the local 'cache/' directory.

    The cache file is named after the last path segment of `url`
    ('cache/<segment>.txt').

    Parameters:
        url: address of the document to download.
        cache: when True and a cached copy exists, that copy is returned and
            no HTTP request is made. Otherwise the document is downloaded and
            the cached copy is (re)written.

    Returns:
        The document body as a string.

    Raises:
        requests.HTTPError: if the server answers with an error status; the
            error page is not written to the cache.
    """
    cache_path = 'cache/%s.txt' % url.split("/")[-1]
    if cache and os.path.isfile(cache_path):
        # Cache hit: read the local copy only, no request is sent.
        with open(cache_path, 'r', encoding='utf-8') as f:
            return f.read()
    response = requests.get(url)
    # Fail loudly instead of storing an error page as if it were the document
    response.raise_for_status()
    # The cache directory may not exist yet on a fresh checkout
    os.makedirs('cache', exist_ok=True)
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(response.text)
    return response.text

In [10]:
# Pairs of (release series, kernel.org directory); the directory is "v<major>.x",
# e.g. ("6.6", "v6.x").
current_releases = tuple(
    (series, "v%s.x" % series.split(".")[0])
    for series in ("6.6", "6.1", "5.15", "5.10", "5.4", "4.19")
)

In [8]:
# Flag passed as the `cache` argument to getBsoupDocument / getHTMLDocument by the
# changelog loop; True lets them reuse files already stored under cache/.
cache = True

In [13]:
base_url = "https://cdn.kernel.org/pub/linux/kernel/"
# Finds "commit <hash> upstream" in a changelog. The pattern has a single capture
# group, so findall() returns each hash as a plain string (not a tuple).
upstream_commit_re = re.compile("commit ([0-9a-f]{5,40}) upstream")
# The result files are written below; make sure their directory exists
os.makedirs('results', exist_ok=True)
for exact_release, range_release in current_releases:
    release_url = base_url + range_release
    # Change log index page (e.g. https://cdn.kernel.org/pub/linux/kernel/v6.x)
    changelog_list_page = getBsoupDocument(release_url, cache)
    # Select "ChangeLog-X.Y" and "ChangeLog-X.Y.Z" links only. The release is escaped and
    # must not be followed by another digit, otherwise "6.1" would also pick up "6.10", "6.11", ...
    changelog_link_re = re.compile("^ChangeLog-" + re.escape(exact_release) + r"(?!\d)")
    for link in changelog_list_page.find_all('a', string=changelog_link_re):
        # The href is the changelog file name, e.g. "ChangeLog-6.1.1"
        changelog_name = link.get('href')
        changelog_url = release_url + "/" + changelog_name
        print(changelog_url)
        # Text of the changelog (e.g. https://cdn.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.6)
        changelog_page_html = getHTMLDocument(changelog_url, cache)
        commits_in_changelog = upstream_commit_re.findall(changelog_page_html)
        print(" > Upstream commits found: %d"%len(commits_in_changelog))
        # Keep the changelog commits that exist in our collection of annotated commits.
        # Each item is the full hash string; indexing it would only yield its first character.
        annotated_commits_found = [
            commit_hash for commit_hash in commits_in_changelog
            if commit_hash in annotated_commits_dict
        ]
        print(" > Upstream commits match with annotated commits: %d"%len(annotated_commits_found))
        # If at least one commit is found, the hashes are saved in "results/ChangeLog-X.X.X.txt"
        if annotated_commits_found:
            with open('results/%s.txt'%changelog_name, 'w') as f:
                for commit_hash in annotated_commits_found:
                    f.write("%s\n" % commit_hash)

https://cdn.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.1
 > Upstream commits found: 0
 > Upstream commits match with annotated commits: 0
https://cdn.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.1.1
 > Upstream commits found: 24
 > Upstream commits match with annotated commits: 0
https://cdn.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.1.10
 > Upstream commits found: 3
 > Upstream commits match with annotated commits: 0
https://cdn.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.1.100
 > Upstream commits found: 41
 > Upstream commits match with annotated commits: 0
https://cdn.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.1.101
 > Upstream commits found: 15
 > Upstream commits match with annotated commits: 0
https://cdn.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.1.102
 > Upstream commits found: 15
 > Upstream commits match with annotated commits: 0
https://cdn.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.1.103
 > Upstream commits found: 125
 > Upstream commits match with annotated comm

KeyboardInterrupt: 