In [74]:
import re
from collections import defaultdict

# import the libraries needed

from pydriller import Repository
from pydriller.domain.commit import ModificationType, Commit
import pandas as pd

In [75]:
# Location of the git repository to mine; this value is later passed to get_repository().
repository_path = "hadoop"
# To mine the remote repository instead, uncomment the following line:
# repository_path = "https://github.com/apache/hadoop"

In [76]:
# Excel workbook and worksheet read by get_commit_keys(); the sheet's "Key" column holds the issue keys.
excel_filename = "Issues_assignment1.xlsx"
excel_sheet = "Group3"

In [77]:
def get_commit_keys(filename: str, sheet_name: str) -> list:
    """Read the issue keys from the "Key" column of an Excel sheet.

    Args:
        filename: Path of the Excel workbook.
        sheet_name: Name of the worksheet that holds the issues.

    Returns:
        The non-missing values of the "Key" column, in sheet order.

    Raises:
        KeyError: If the sheet has no "Key" column.
    """
    issues = pd.read_excel(filename, sheet_name=sheet_name)

    # Blank cells are read as NaN (a float); drop them so callers that match
    # the keys against commit messages only ever receive real values.
    return issues["Key"].dropna().tolist()

In [78]:
def get_repository(path: str):
    """Create a pydriller ``Repository`` for the given location.

    Args:
        path: Repository location handed to ``Repository``.

    Returns:
        The newly constructed ``Repository`` instance.
    """
    repo = Repository(path)
    return repo

In [79]:
def analyze_commit_file_level(data: Commit):
    """Summarise every file touched by a commit, one row per file.

    Args:
        data: The commit to analyse.

    Returns:
        A list with one dict per modified file, for example::

            {
                'CommitHash': 'jchejfvgdbsnma',
                'FileName': 'abc.java',
                'ChangeType': ModificationType.ADD,
                'AddedLines': 1,
                'DeletedLines': 0,
                'AddedMethods': 1,
                'DeletedMethods': 0,
                'ModifiedMethods': 0,
                'Complexity': 3,
            }

    Note:
        Method counts are derived from the *net* change in the number of
        methods, so a file that gains and loses the same number of methods
        reports all of its changed methods as modified.
    """
    rows: list[dict[str, object]] = []

    for modified_file in data.modified_files:
        methods_before = len(modified_file.methods_before)
        methods_after = len(modified_file.methods)
        changed_methods = len(modified_file.changed_methods)

        # Net growth counts as added methods, net shrinkage as deleted ones;
        # whatever else changed is counted as modified.
        net_change = methods_after - methods_before
        added_methods = max(net_change, 0)
        deleted_methods = max(-net_change, 0)
        modified_methods = changed_methods - added_methods - deleted_methods

        rows.append({
            # Take the hash from the `data` argument so the row always
            # belongs to the commit that was actually analysed.
            'CommitHash': data.hash,
            'FileName': modified_file.filename,
            'ChangeType': modified_file.change_type,
            'AddedLines': modified_file.added_lines,
            'DeletedLines': modified_file.deleted_lines,
            'AddedMethods': added_methods,
            'DeletedMethods': deleted_methods,
            'ModifiedMethods': modified_methods,
            'Complexity': modified_file.complexity,
        })

    return rows
        

In [80]:
def analyze_commit(data: Commit):
    """Build a commit-level summary row for a single commit.

    Args:
        data: The commit to analyse.

    Returns:
        A dict with the commit's hash, author, committer date and message,
        the number of files it added / modified / deleted, and its
        ``dmm_unit_size``, ``dmm_unit_complexity`` and
        ``dmm_unit_interfacing`` values. Files with any other change type
        are not counted.
    """
    added_files = modified_files = deleted_files = 0

    # Tally the files by the kind of change applied to them.
    for modified_file in data.modified_files:
        if modified_file.change_type == ModificationType.ADD:
            added_files += 1
        elif modified_file.change_type == ModificationType.MODIFY:
            modified_files += 1
        elif modified_file.change_type == ModificationType.DELETE:
            deleted_files += 1

    # Every field is read from the `data` argument so the row describes the
    # commit that was passed in, independent of any global variable.
    return {
        'CommitHash': data.hash,
        'Author': f"{data.author.name} <{data.author.email}>",
        'CommitDate': data.committer_date,
        'Message': data.msg,
        'TotalAddedFiles': added_files,
        'TotalModifiedFiles': modified_files,
        'TotalDeletedFiles': deleted_files,
        'UnitSize': data.dmm_unit_size,
        'Complexity': data.dmm_unit_complexity,
        'Interfacing': data.dmm_unit_interfacing,
    }

In [81]:
repository = get_repository(repository_path)
commits = repository.traverse_commits()
issued_keys = get_commit_keys(excel_filename, excel_sheet)

# Match each key as a whole token. A plain substring test would also accept
# e.g. "HADOOP-123" inside "HADOOP-1234" and link the commit to the wrong issue.
# Non-string or empty keys (e.g. blank spreadsheet cells) are ignored.
key_alternatives = "|".join(
    re.escape(key) for key in issued_keys if isinstance(key, str) and key
)
issue_key_pattern = (
    re.compile(rf"(?<![A-Za-z0-9])(?:{key_alternatives})(?!\d)")
    if key_alternatives
    else None  # no usable keys -> nothing can match
)

# A set avoids duplicates; multiple commits can be linked to the same issue key.
issued_commits: set[Commit] = set()
if issue_key_pattern is not None:
    for commit in commits:
        if issue_key_pattern.search(commit.msg):
            issued_commits.add(commit)

print(f"Found issued commits: {len(issued_commits)}")

Found issued commits: 525


In [82]:
# Keep only the issue-linked commits whose (lower-cased) message mentions a
# design-related keyword. Feel free to update the regex pattern.
design_issue_pattern = r'\b(design|architecture|pattern|fix|refactor)\b'

filtered_commits: set[Commit] = {
    issued_commit
    for issued_commit in issued_commits
    if re.search(design_issue_pattern, issued_commit.msg.lower())
}

print(f"Found commits with filter issues: {len(filtered_commits)}")

Found commits with filter issues: 38


In [83]:
# Maps every filtered commit to the set of its parent commits.
commit_dict: dict[Commit, set[Commit]] = defaultdict(set)

# Collect every parent hash up front so the repository is walked only once,
# instead of once per filtered commit.
wanted_parent_hashes: set[str] = {
    parent_hash
    for filtered_commit in filtered_commits
    for parent_hash in filtered_commit.parents
}

parents_by_hash: dict[str, Commit] = {}
if wanted_parent_hashes:
    for commit in repository.traverse_commits():
        if commit.hash in wanted_parent_hashes:
            parents_by_hash[commit.hash] = commit
            # Stop as soon as every requested parent has been resolved.
            if len(parents_by_hash) == len(wanted_parent_hashes):
                break

for filtered_commit in filtered_commits:
    # Parents that the traversal did not yield are left out.
    commit_dict[filtered_commit] = {
        parents_by_hash[parent_hash]
        for parent_hash in filtered_commit.parents
        if parent_hash in parents_by_hash
    }

In [84]:
# One mapping row per (issued commit, parent commit) pair; a commit can have
# multiple parents. The commit-level and file-level details are collected for
# each parent commit.
issued_commits_and_parent_mapping_rows: list[dict[str, str]] = []
commit_details_list: list[dict[str, str]] = []
file_details_list = []

for issued_commit, parent_commits in commit_dict.items():
    for commit in parent_commits:
        issued_commits_and_parent_mapping_rows.append(
            {
                'CommitHash': issued_commit.hash,
                'ParentCommitHash': commit.hash,
            }
        )
        commit_details_list.append(analyze_commit(commit))
        file_details_list.extend(analyze_commit_file_level(commit))

issued_commits_and_parent_mapping = pd.DataFrame(issued_commits_and_parent_mapping_rows)
issued_commits_and_parent_mapping.head(10)

Unnamed: 0,CommitHash,ParentCommitHash
0,ea9f43781e5a9a4a6121dd3be106ced6b811754d,f5f1c81e7dcae0272e71ef4e6bedfc00b8c677d6
1,bb957bd2cdf33271bec663cdc86b44009e66e6bb,01d81f4e51c64d5b316d129b514fa6802770ead9
2,5c7b7adacb47242fe4c82e982cb06e6276f6f862,bb957bd2cdf33271bec663cdc86b44009e66e6bb
3,a90d3205d2a23945eaab8b756cfbeeb4377c3c04,08561f76dbac53aaec12d4170b183e06747c9f75
4,843ee8d59d8bacbca0d87ccf0790772e39d16138,9cb0c963d21bcbefc56716d332a3bbdf090417c0
5,3a0b84d9c994f9a634b32903e5c1876c44f8c16e,6814324c332a7d780f3b844fd6f1c62db2f6c88e
6,730bc746f9ac6e045e94dc2bc622b16de0159b4b,6a6e74acf5c38a4995c4622148721cfe2f1fbdad
7,09469bf47dd1eb1d880f8119bf62c29cf70cdf58,476340c6999df71480164b18abd809c5e710d87a
8,5f08e51b72330b2dd2405896b39179a64a3a7efe,f7b1bb4dccc83eb26e661241ebf9f767f52b291b
9,daed679fc17c9d449912f8a6c4fe116565c28ad1,4cecab7c10758cde1036b436d874b2b3a686a874


In [85]:
# Commit-level details: one row per analyze_commit() result collected in commit_details_list.
commit_details = pd.DataFrame(commit_details_list)
commit_details.head(10)

Unnamed: 0,CommitHash,Author,CommitDate,Message,TotalAddedFiles,TotalModifiedFiles,TotalDeletedFiles,UnitSize,Complexity,Interfacing
0,f5f1c81e7dcae0272e71ef4e6bedfc00b8c677d6,Ray Chiang <rchiang@apache.org>,2016-07-15 14:38:50-07:00,YARN-5272. Handle queue names consistently in ...,0,5,0,0.133333,0.966667,0.966667
1,01d81f4e51c64d5b316d129b514fa6802770ead9,Giridharan Kesavan <gkesavan@apache.org>,2009-06-01 09:54:21+00:00,To fix test targets\n\ngit-svn-id: https://svn...,0,1,0,,,
2,bb957bd2cdf33271bec663cdc86b44009e66e6bb,Giridharan Kesavan <gkesavan@apache.org>,2009-06-01 16:06:19+00:00,Fix test-patch.sh script for eclipse classpath...,0,1,0,,,
3,08561f76dbac53aaec12d4170b183e06747c9f75,Christopher Douglas <cdouglas@apache.org>,2010-01-13 13:26:11+00:00,HADOOP-6155. Deprecate RecordIO anticipating A...,0,49,0,,,
4,9cb0c963d21bcbefc56716d332a3bbdf090417c0,Jian He <jianhe@apache.org>,2016-04-08 11:20:35-07:00,YARN-4740. AM may not receive the container co...,0,2,0,0.206522,0.206522,1.0
5,6814324c332a7d780f3b844fd6f1c62db2f6c88e,Haohui Mai <wheat9@apache.org>,2017-08-01 18:34:07-07:00,HDFS-12107. FsDatasetImpl#removeVolumes floods...,0,1,0,,,
6,6a6e74acf5c38a4995c4622148721cfe2f1fbdad,Lei Xu <lei@apache.org>,2016-05-16 17:05:46-07:00,HDFS-10410. RedundantEditLogInputStream.LOG is...,0,1,0,,,
7,476340c6999df71480164b18abd809c5e710d87a,PJ Fanning <pjfanning@users.noreply.github.com>,2023-03-13 10:08:04+05:30,HADOOP-18658. snakeyaml dependency: upgrade to...,0,2,0,,,
8,f7b1bb4dccc83eb26e661241ebf9f767f52b291b,Steve Loughran <stevel@cloudera.com>,2022-12-15 11:42:36+00:00,HADOOP-18573. Improve error reporting on non-s...,0,2,0,0.0,0.0,1.0
9,4cecab7c10758cde1036b436d874b2b3a686a874,Christopher Douglas <cdouglas@apache.org>,2010-01-13 10:46:39+00:00,HADOOP-6315. Avoid incorrect use of BuiltInfla...,0,3,0,0.11,1.0,1.0


In [86]:
# File-level details: one row per modified file, collected from analyze_commit_file_level().
file_details = pd.DataFrame(file_details_list)
file_details.head(10)

Unnamed: 0,CommitHash,FileName,ChangeType,AddedLines,DeletedLines,AddedMethods,DeletedMethods,ModifiedMethods,Complexity
0,f5f1c81e7dcae0272e71ef4e6bedfc00b8c677d6,AllocationFileLoaderService.java,ModificationType.MODIFY,3,1,0,0,1,92.0
1,f5f1c81e7dcae0272e71ef4e6bedfc00b8c677d6,QueueManager.java,ModificationType.MODIFY,5,1,0,0,1,60.0
2,f5f1c81e7dcae0272e71ef4e6bedfc00b8c677d6,TestAllocationFileLoaderService.java,ModificationType.MODIFY,28,0,1,0,0,14.0
3,f5f1c81e7dcae0272e71ef4e6bedfc00b8c677d6,TestFairScheduler.java,ModificationType.MODIFY,11,0,0,0,1,131.0
4,f5f1c81e7dcae0272e71ef4e6bedfc00b8c677d6,TestQueueManager.java,ModificationType.MODIFY,3,0,0,0,1,5.0
5,01d81f4e51c64d5b316d129b514fa6802770ead9,build.xml,ModificationType.MODIFY,5,8,0,0,0,
6,bb957bd2cdf33271bec663cdc86b44009e66e6bb,test-patch.sh,ModificationType.MODIFY,4,3,0,0,0,
7,08561f76dbac53aaec12d4170b183e06747c9f75,CHANGES.txt,ModificationType.MODIFY,2,0,0,0,0,
8,08561f76dbac53aaec12d4170b183e06747c9f75,BinaryRecordInput.java,ModificationType.MODIFY,2,0,0,0,0,23.0
9,08561f76dbac53aaec12d4170b183e06747c9f75,BinaryRecordOutput.java,ModificationType.MODIFY,2,0,0,0,0,20.0
