This script generates the file `contributors_summary_dataset_full.csv`, excluding bots and including contributors with direct contributions.

It uses DuckDB and includes contributors who participated in the tasks (issues) in any of the following roles:
 - Reporter
 - Assignee
 - Commenter
 - Fixer (author or committer in the commits table)
 - Voter

To include bots, set the variable `INCLUDE_BOTS` (defined further below) to `True`. The script will then write a file named `contributors_summary_dataset_full_bots.csv` instead.

In [1]:
import csv
import copy
from datetime import datetime
# JIRA usernames, presumably of bot accounts (going by the name). Not referenced
# by any other cell visible in this notebook.
BOTS_USERNAMES_JIRA = ['githubbot', 'genericqa', 'HadoopDev', 'hadoopqa', 'hudson', 'jiraposter@reviews.apache.org']
# Contributor ids treated as bots: process_dataset() drops the rows of
# contributors_df whose `id` is listed here unless it is asked to keep bots.
# NOTE(review): presumably these ids map to the usernames above - confirm.
CONTRIBUTORS_BOTS = ["id159", "id733", "id1122", "id1661", "id1706", "id1745"]
import duckdb
import pandas as pd
%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

In [2]:
%sql duckdb:///:memory:

In [3]:
%%sql
IMPORT DATABASE '../datasets/duckdb/';

Unnamed: 0,Count
0,1779


In [4]:
%%sql
snapshots_df <<  SELECT Key, Project, Priority, CreationDate, ResolutionDate, Reporter, Assignee, NoComments, NoAuthors, NoCommits, NoCommitters, SrcAddFiles, SrcDelFiles, SrcModFiles, SrcAddLines, SrcDelLines, TestAddFiles, TestDelFiles, TestModFiles, TestAddLines, TestDelLines, summary_token_number, description_token_number, summary_char_number, description_char_number
FROM new_snapshot;

In [20]:
snapshots_df

Unnamed: 0,key,Project,Priority,CreationDate,ResolutionDate,Reporter,Assignee,NoComments,NoAuthors,NoCommits,...,SrcDelLines,TestAddFiles,TestDelFiles,TestModFiles,TestAddLines,TestDelLines,summary_token_number,description_token_number,summary_char_number,description_char_number
0,HADOOP-4975,HADOOP,Major,2009-01-02 12:13:20,2009-01-23 22:35:41,id1296,id1296,9,0,0,...,0,0,0,0,0,0,7.0,67.0,72.0,437.0
1,HADOOP-4977,HADOOP,Blocker,2009-01-02 23:18:35,2009-01-15 02:58:26,id8,id196,13,0,0,...,0,0,0,0,0,0,3.0,432.0,34.0,3074.0
2,HADOOP-4979,HADOOP,Major,2009-01-05 04:25:51,2009-01-05 11:40:13,id196,id196,7,0,0,...,0,0,0,0,0,0,10.0,54.0,58.0,268.0
3,HADOOP-4982,HADOOP,Major,2009-01-05 22:03:11,2009-01-09 00:59:09,id1189,id1189,5,0,0,...,0,0,0,0,0,0,3.0,16.0,18.0,109.0
4,HADOOP-4983,HADOOP,Critical,2009-01-06 00:36:19,2009-01-21 06:55:01,id262,id980,4,0,0,...,0,0,0,0,0,0,9.0,20.0,49.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10351,YARN-9067,YARN,Major,2018-11-28 01:06:28,2018-11-29 18:52:45,id1214,id1214,13,1,1,...,85,0,0,1,1,1,8.0,60.0,52.0,342.0
10352,YARN-9071,YARN,Critical,2018-11-29 18:50:37,2018-12-05 22:10:08,id551,id568,23,1,1,...,33,0,0,2,39,3,7.0,76.0,47.0,589.0
10353,YARN-9084,YARN,Major,2018-12-05 20:41:44,2018-12-18 23:08:45,id568,id568,5,1,1,...,21,0,0,1,6,2,11.0,43.0,81.0,253.0
10354,YARN-9114,YARN,Major,2018-12-12 07:01:59,2018-12-12 08:50:24,id1161,id738,5,1,1,...,1,0,0,0,0,0,9.0,7.0,43.0,33.0


In [5]:
%%sql
commits_df << SELECT snap.Key, cmmit.Author, cmmit.AuthorDate, cmmit.Committer, cmmit.CommitterDate, cmmit.CommitHash, cmmit.IsSrcFile, cmmit.IsTestFile, COUNT(cmmit.Filename) AS FileCount, SUM(cmmit.AddLines) AS AddLinesSum, SUM(cmmit.DelLines) AS DelLinesSum  
FROM new_snapshot AS snap, new_commit AS cmmit
WHERE snap.Key = cmmit.Key AND (cmmit.IsSrcFile = 1 OR cmmit.IsTestFile = 1) 
GROUP BY snap.Key, cmmit.CommitHash, cmmit.Author, cmmit.Committer, cmmit.AuthorDate, cmmit.CommitterDate, cmmit.IsSrcFile, cmmit.IsTestFile
ORDER BY snap.Key, cmmit.CommitHash;

In [6]:
%%sql
comments_df << SELECT snap.Key, cmment.Author, cmment.CreationDate, cmment.voting, cmment.votes, cmment.commen_token_number, cmment.comment_char_number
FROM new_snapshot AS snap, new_comment AS cmment
WHERE snap.Key = cmment.Key
GROUP BY snap.Key, cmment.Author, cmment.CreationDate, cmment.voting, cmment.votes, cmment.commen_token_number, cmment.comment_char_number;

In [7]:
%%sql
contributors_df << SELECT * 
FROM consolidated_list_of_contributors_official;

In [8]:
# Passed to process_dataset() as `with_bots` further below. False drops the
# contributors listed in CONTRIBUTORS_BOTS; True keeps them.
INCLUDE_BOTS = False

In [9]:
class IssueLevelContribution:
    """What one contributor did on one issue, next to that issue's metadata.

    ``reporting`` and ``assignment`` behave as flags (set to 1 by the
    ``add_*`` methods); ``comments`` and ``votes`` are running totals.
    """

    def __init__(
        self,
        component,
        key,
        priority,
        creation_date,
        resolution_date,
        summary_token_number,
        description_token_number,
        summary_char_number,
        description_char_number,
        reporting=0.0,
        assignment=0.0,
        comments=0.0,
        votes=0.0,
    ) -> None:
        """Store the issue metadata and the initial counter values as given."""
        # Issue metadata.
        self.component, self.key, self.priority = component, key, priority
        self.creation_date, self.resolution_date = creation_date, resolution_date
        self.summary_token_number, self.description_token_number = (
            summary_token_number,
            description_token_number,
        )
        self.summary_char_number, self.description_char_number = (
            summary_char_number,
            description_char_number,
        )
        # Contribution counters.
        self.reporting, self.assignment = reporting, assignment
        self.comments, self.votes = comments, votes

    def add_reporting(self):
        """Flag the contributor as the reporter of the issue."""
        self.reporting = 1

    def add_assignment(self):
        """Flag the contributor as the assignee of the issue."""
        self.assignment = 1

    def add_comment(self):
        """Count one more comment."""
        self.comments = self.comments + 1

    def add_vote(self):
        """Count one more vote."""
        self.votes = self.votes + 1

    def get_issue_key(self):
        """Return the key of the issue."""
        return self.key


class CommitLevelContribution:
    """What one contributor did on one commit of one issue.

    Tracks how often the commit was authored ("worked") or committed
    ("integrated"), plus file counts and churn split into source and test files.
    """

    def __init__(
        self,
        issue_key=None,
        hash=None,
        worked_commits=0.0,
        integrated_commits=0.0,
        number_of_src_files_changed=0.0,
        number_of_test_files_changed=0.0,
        code_churn_size_of_src_files_changed=0.0,
        code_churn_size_of_test_files_changed=0.0,
    ) -> None:
        """Store the commit identity and the initial counter values as given."""
        self.issue_key, self.hash = issue_key, hash
        self.worked_commits, self.integrated_commits = worked_commits, integrated_commits
        self.number_of_src_files_changed = number_of_src_files_changed
        self.number_of_test_files_changed = number_of_test_files_changed
        self.code_churn_size_of_src_files_changed = code_churn_size_of_src_files_changed
        self.code_churn_size_of_test_files_changed = code_churn_size_of_test_files_changed

    def add_worked_commit(self):
        """Count one more authored commit."""
        self.worked_commits += 1

    def add_integrated_commit(self):
        """Count one more committed (integrated) commit."""
        self.integrated_commits += 1

    def add_src_file_changed(self, filecount=1, code_churn_size=0):
        """Add ``filecount`` source files and ``code_churn_size`` churn to the totals."""
        self.number_of_src_files_changed += filecount
        self.code_churn_size_of_src_files_changed += code_churn_size

    def add_test_file_changed(self, filecount=1, code_churn_size=0):
        """Add ``filecount`` test files and ``code_churn_size`` churn to the totals."""
        self.number_of_test_files_changed += filecount
        self.code_churn_size_of_test_files_changed += code_churn_size

    def get_issue_key(self):
        """Return the key of the issue the commit belongs to."""
        return self.issue_key

    def get_commit_hash(self):
        """Return the commit hash."""
        return self.hash

class Contributor:

    def __init__(self, contributor) -> None:
        self.contributor = contributor.id
        self.name_git = contributor.name_git
        self.username_jira	= contributor.username_jira
        self.name_jira = contributor.name_jira
        self.official = contributor.official
        self.pmc = contributor.pmc
        self.organization = contributor.organization
        self.committer = contributor.committer
        self.issue_contributions = {}
        self.commit_contributions = {}

    def add_issue_level_contribution(self, contribution: IssueLevelContribution):
        if self.issue_contributions.get(contribution.key) is None:
            self.issue_contributions[contribution.key] = IssueLevelContribution(contribution.component, 
                                                                                contribution.key, 
                                                                                contribution.priority, 
                                                                                contribution.creation_date, 
                                                                                contribution.resolution_date,
                                                                                contribution.summary_token_number,
                                                                                contribution.description_token_number,
                                                                                contribution.summary_char_number,
                                                                                contribution.description_char_number)

    def get_issue_level_contribution(self, key):
        return self.issue_contributions.get(key)
    
    def add_commit_level_contribution(self, contribution: CommitLevelContribution):
        key_hash = str(contribution.issue_key) + str(contribution.hash)
        if self.commit_contributions.get(key_hash) is None:
            self.commit_contributions[key_hash] = CommitLevelContribution(contribution.issue_key, contribution.hash)

    def get_commit_level_contribution(self, key, hash):
        return self.commit_contributions.get(str(key) + str(hash))

    def get_commits_contributions_of_issue(self, key):
        commit_level_contributions = []
        for commit_level_contribution in self.commit_contributions.values():
            if commit_level_contribution.issue_key == key:
                commit_level_contributions.append(commit_level_contribution)

        return commit_level_contributions


    def to_list(self, key, hash):
        issue_contribution = self.get_issue_level_contribution(key)
        if issue_contribution is None:
            return None
        
        commit_contribution = self.get_commit_level_contribution(key, hash)

        if commit_contribution is None:
            commit_contribution = CommitLevelContribution()

        contribution_list = [self.contributor,
                            self.pmc,
                            self.official,
                            self.organization, 
                            issue_contribution.component, 
                            issue_contribution.key,
                            issue_contribution.priority, 
                            issue_contribution.creation_date,
                            issue_contribution.resolution_date,
                            issue_contribution.summary_token_number,
                            issue_contribution.description_token_number,
                            issue_contribution.summary_char_number,
                            issue_contribution.description_char_number,
                            issue_contribution.reporting,
                            issue_contribution.assignment,
                            issue_contribution.comments,
                            issue_contribution.votes,
                            commit_contribution.worked_commits,
                            commit_contribution.integrated_commits,
                            commit_contribution.number_of_src_files_changed,
                            commit_contribution.number_of_test_files_changed,
                            commit_contribution.code_churn_size_of_src_files_changed,
                            commit_contribution.code_churn_size_of_test_files_changed]
        
        return contribution_list

    def to_data_frame(self):
        column_headers = ['Contributor', 'PMC', 'Official', 'Organization', 'Component', 'Key', 'Priority', 
                          'CreationDate', 'ResolutionDate', 'SummaryTokenNumber', 'DescriptionTokenNumber',
                          'SummaryCharNumber', 'DescriptionCharNumber', 'Reporter', 'Assignee', 'Comments', 
                          'Votes', 'WorkedCommits', 'IntegratedCommits', 'NoSrcFilesChanged', 'NoTestFilesChanged', 
                          'CodeChurnSizeSrcFilesChanged', 'CodeChurnSizeTestFilesChanged']
        
        contributor_summary_dataset = None
        
        for issue_contribution in self.issue_contributions.values():
            if len(self.get_commits_contributions_of_issue(issue_contribution.key)) == 0:
                data_tuple = self.to_list(issue_contribution.key, '')
                if contributor_summary_dataset is None:
                    contributor_summary_dataset = pd.DataFrame([data_tuple], columns=column_headers)
        
                else:
                    new_row = pd.DataFrame([data_tuple], columns=column_headers)
                    contributor_summary_dataset = pd.concat([contributor_summary_dataset, new_row], ignore_index=True)
            else:
                for commit_contribution in self.get_commits_contributions_of_issue(issue_contribution.key):
                    data_tuple = self.to_list(issue_contribution.key, commit_contribution.hash)
                    if contributor_summary_dataset is None:
                        contributor_summary_dataset = pd.DataFrame([data_tuple], columns=column_headers)
            
                    else:
                        new_row = pd.DataFrame([data_tuple], columns=column_headers)
                        contributor_summary_dataset = pd.concat([contributor_summary_dataset, new_row], ignore_index=True)
        
        return contributor_summary_dataset

In [17]:
def process_dataset(with_bots=False):
    """Build the per-contributor contribution dataset and write it to CSV.

    Reads the notebook-level frames ``contributors_df``, ``snapshots_df``,
    ``comments_df`` and ``commits_df``. Every contributor present in
    ``contributors_df`` is credited for the issues they reported or were
    assigned to, for their comments and votes, and for the commits they
    authored or committed. Ids that are not in ``contributors_df`` are ignored.

    Args:
        with_bots: when False (the default), contributors whose ``id`` is in
            ``CONTRIBUTORS_BOTS`` are dropped and the result is written to
            ``../datasets/contributors_summary_dataset_full.csv``; when True
            they are kept and the result is written to
            ``../datasets/contributors_summary_dataset_full_bots.csv``.

    Returns:
        A tuple ``(contributors_summary, issue_contributions)``: a dict from
        contributor id to ``Contributor`` and a dict from issue key to the
        ``IssueLevelContribution`` holding the issue metadata.

    Raises:
        ValueError: if no contributor has any contribution, so there is
            nothing to write.
    """
    contributors_dataset = contributors_df
    if not with_bots:
        # removing bots
        contributors_dataset = contributors_dataset[~contributors_dataset['id'].isin(CONTRIBUTORS_BOTS)]
    contributors_summary = {row.id: Contributor(row) for _, row in contributors_dataset.iterrows()}

    # Issue metadata plus reporter / assignee credit.
    issue_contributions = {}
    for _, row in snapshots_df.iterrows():
        issue = IssueLevelContribution(row.Project,
                                       row.key,
                                       row.Priority,
                                       row.CreationDate,
                                       row.ResolutionDate,
                                       row.summary_token_number,
                                       row.description_token_number,
                                       row.summary_char_number,
                                       row.description_char_number)
        issue_contributions[row.key] = issue

        reporter = contributors_summary.get(row.Reporter)
        if reporter is not None:
            reporter.add_issue_level_contribution(issue)
            reporter.get_issue_level_contribution(row.key).add_reporting()

        assignee = contributors_summary.get(row.Assignee)
        if assignee is not None:
            assignee.add_issue_level_contribution(issue)
            assignee.get_issue_level_contribution(row.key).add_assignment()

    # Comment and vote credit: a comment with voting > 0 also counts as a vote.
    for _, row in comments_df.iterrows():
        commenter = contributors_summary.get(row.Author)
        if commenter is None:
            continue
        commenter.add_issue_level_contribution(issue_contributions.get(row.key))
        commented_issue = commenter.get_issue_level_contribution(row.key)
        commented_issue.add_comment()
        if row.voting > 0:
            commented_issue.add_vote()

    # Commit credit. commits_df has one row per
    # (issue, commit, IsSrcFile, IsTestFile) group, so one commit can span
    # several rows.
    commit_contributions = {}
    for _, row in commits_df.iterrows():
        key_hash = str(row.key) + str(row.CommitHash)
        if commit_contributions.get(key_hash) is None:
            commit_contributions[key_hash] = CommitLevelContribution(row.key, row.CommitHash)
        commit = commit_contributions.get(key_hash)
        issue = issue_contributions.get(row.key)

        # The author is credited with a "worked" commit, the committer with an
        # "integrated" one; both get the file counts and churn of the row.
        for contributor_id, authored in ((row.Author, True), (row.Committer, False)):
            person = contributors_summary.get(contributor_id)
            if person is None:
                continue
            person.add_issue_level_contribution(issue)
            person.add_commit_level_contribution(commit)
            credited_commit = person.get_commit_level_contribution(row.key, row.CommitHash)

            # NOTE(review): the counter is bumped once per row, so a commit
            # that spans several rows is counted more than once - confirm
            # this is intended.
            if authored:
                credited_commit.add_worked_commit()
            else:
                credited_commit.add_integrated_commit()

            # NOTE(review): when author and committer are the same
            # contributor, the files and churn of the row are added twice to
            # the same record - confirm this is intended.
            if row.IsSrcFile > 0:
                credited_commit.add_src_file_changed(row.FileCount, row.AddLinesSum + row.DelLinesSum)

            if row.IsTestFile > 0:
                credited_commit.add_test_file_changed(row.FileCount, row.AddLinesSum + row.DelLinesSum)

    # Contributors without any contribution yield None and are left out.
    frames = []
    for contributor in contributors_summary.values():
        frame = contributor.to_data_frame()
        if frame is not None:
            frames.append(frame)

    if not frames:
        raise ValueError("no contributions found: nothing to write")

    # Concatenate once instead of pairwise, which copies the growing frame on
    # every step.
    contributors_summary_dataset = pd.concat(frames, ignore_index=True)

    # The file name follows the argument, not the INCLUDE_BOTS global, so the
    # output always matches the data that was actually processed.
    if not with_bots:
        contributors_summary_dataset.to_csv("../datasets/contributors_summary_dataset_full.csv", sep=';', encoding='utf-8', index=False)
    else:
        contributors_summary_dataset.to_csv("../datasets/contributors_summary_dataset_full_bots.csv", sep=';', encoding='utf-8', index=False)

    return (contributors_summary, issue_contributions)

In [18]:
# Build the dataset (process_dataset also writes the CSV file) and keep the
# per-contributor and per-issue objects for the checks in the cells below.
(summary, issues_summary) = process_dataset(INCLUDE_BOTS)

In [19]:
# Sanity check: collect every issue key that is credited to at least one
# contributor, through an issue-level or a commit-level contribution.
issues_from_contributors = set()
for key in summary:
    issues_from_contributors.update(summary.get(key).issue_contributions.keys())
    commits_from_contributors = summary.get(key).commit_contributions.values()
    for commit in commits_from_contributors:
        issues_from_contributors.add(commit.issue_key)
# Issues of the snapshot that no contributor was credited for.
diff = set(issues_summary.keys()).difference(issues_from_contributors)
# Prints: number of credited issues, number of snapshot issues, number of
# uncredited issues, and the uncredited keys themselves.
print(len(issues_from_contributors), len(issues_summary.keys()), len(diff), diff)

10356 10356 0 set()


In [22]:
# Compare against the snapshot CSV file: which of its issue keys are not
# credited to any contributor in the dataset built above?
old_snapshot = pd.read_csv('../datasets/new_snapshot_file.csv', sep=';')
# Issue keys found in the `Key` column of that file.
old_snapshot_keys = set(old_snapshot['Key'])
# NOTE: this overwrites the `diff` computed in the previous cell.
diff = set(old_snapshot_keys).difference(issues_from_contributors)
# Prints: number of keys in the file, number of credited issues, number of
# keys missing from the dataset, and the missing keys themselves.
print(len(old_snapshot_keys), len(issues_from_contributors), len(diff), diff)

10375 10356 19 {'MAPREDUCE-7096', 'HADOOP-7321', 'HADOOP-6753', 'YARN-2736', 'YARN-7808', 'MAPREDUCE-1142', 'HADOOP-10873', 'MAPREDUCE-4935', 'HDFS-6118', 'HDFS-9247', 'HADOOP-14232', 'MAPREDUCE-5057', 'HADOOP-9744', 'YARN-5847', 'HADOOP-6765', 'HADOOP-7203', 'HADOOP-13803', 'HDFS-888', 'HADOOP-9228'}
