This script generates the csv file `../datasets/bug_report_collab_dataset.csv` that summarizes collaboration in bug-fixing tasks in the Hadoop project.

It loads data stored in DuckDB and in the following CSV files:
 - `../datasets/contributors_summary_dataset_full.csv`
 - `../datasets/comment_author_groupby_bug.csv`

In [1]:
import csv
import copy
from datetime import datetime
import duckdb
import pandas as pd
import numpy as np
from statistics import median, mean, variance, stdev
%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

In [2]:
%sql duckdb:///:memory:

In [3]:
%%sql
IMPORT DATABASE '../datasets/duckdb/';

Unnamed: 0,Count
0,1779


In [4]:
%%sql
snapshots_df <<  SELECT Key, Project, Priority, CreationDate, ResolutionDate, Reporter, Assignee, NoComments, NoAuthors, NoCommits, NoCommitters, SrcAddFiles, SrcDelFiles, SrcModFiles, SrcAddLines, SrcDelLines, TestAddFiles, TestDelFiles, TestModFiles, TestAddLines, TestDelLines, summary_token_number, description_token_number, summary_char_number, description_char_number
FROM new_snapshot;

In [5]:
%%sql
comments_df << SELECT snap.Key, cmment.Author, cmment.CreationDate, cmment.voting, cmment.votes, cmment.commen_token_number, cmment.comment_char_number
FROM new_snapshot AS snap, new_comment AS cmment
WHERE snap.Key = cmment.Key
GROUP BY snap.Key, cmment.Author, cmment.CreationDate, cmment.voting, cmment.votes, cmment.commen_token_number, cmment.comment_char_number;

In [6]:
comments_df

Unnamed: 0,key,Author,CreationDate,voting,votes,commen_token_number,comment_char_number
0,MAPREDUCE-6765,id1665,2016-11-02 03:51:48.957,0.0,,6.0,36.0
1,MAPREDUCE-6767,id734,2016-08-24 22:54:47.724,0.0,,4.0,22.0
2,MAPREDUCE-6767,id263,2016-08-25 01:57:24.364,0.0,,14.0,69.0
3,MAPREDUCE-6768,id792,2016-08-26 19:03:53.033,0.0,,31.0,152.0
4,MAPREDUCE-6771,id792,2016-08-27 01:09:43.092,0.0,,12.0,70.0
...,...,...,...,...,...,...,...
159733,MAPREDUCE-5775,id159,2014-04-16 08:49:15.242,,n,,
159734,MAPREDUCE-6439,id579,2015-08-05 22:16:54.469,0.0,,43.0,267.0
159735,MAPREDUCE-6454,id545,2015-08-21 00:04:20.922,1.0,-1,127.0,706.0
159736,MAPREDUCE-6670,id315,2016-04-05 16:33:02.003,1.0,+1,10.0,59.0


In [7]:
# When False, the contributors listed in CONTRIBUTORS_BOTS are left out of the
# collaboration data and the non-bot summary CSVs are used.
INCLUDE_BOTS = False

# Anonymised ids of the contributors that are treated as bots.
CONTRIBUTORS_BOTS = [
    "id159",
    "id733",
    "id1122",
    "id1661",
    "id1706",
    "id1745",
]

In [8]:
# Load the per-bug comment summary and the per-contributor summary that match
# the INCLUDE_BOTS setting; the two date columns are parsed into datetimes.
if INCLUDE_BOTS:
    comments_summary_bots_df = pd.read_csv('../datasets/comment_author_groupby_bug_bots.csv', sep=';')
    contributors_summary_bots_df = pd.read_csv('../datasets/contributors_summary_dataset_full_bots.csv', sep=';')
    contributors_summary_bots_df = contributors_summary_bots_df.assign(
        CreationDate=lambda frame: pd.to_datetime(frame['CreationDate']),
        ResolutionDate=lambda frame: pd.to_datetime(frame['ResolutionDate']),
    )
else:
    comments_summary_df = pd.read_csv('../datasets/comment_author_groupby_bug.csv', sep=';')
    contributors_summary_df = pd.read_csv('../datasets/contributors_summary_dataset_full.csv', sep=';')
    contributors_summary_df = contributors_summary_df.assign(
        CreationDate=lambda frame: pd.to_datetime(frame['CreationDate']),
        ResolutionDate=lambda frame: pd.to_datetime(frame['ResolutionDate']),
    )

In [71]:
class Role:
    """One role of a given type held by a single author."""

    def __init__(self, role_type, authorId):
        # Who holds the role, and which kind of role it is.
        self.authorId = authorId
        self.type = role_type

    def __repr__(self):
        """Short human-readable description used when printing role lists."""
        return f'authorId: {self.authorId}, type: {self.type}'

class RoleList:
    """Registry of the roles played in one collaboration.

    Roles are indexed twice: by role type (who plays this role?) and by
    author id (which roles does this person play?). Both indexes reject
    duplicates, so a given (type, author) pair is stored at most once.
    """

    def __init__(self):
        # role type -> list of Role objects, at most one per author
        self.roles = {}
        # author id -> list of Role objects, at most one per role type
        self.roles_by_authorId = {}

    def add_role(self, role):
        """Register `role` in both indexes, ignoring an already-known pair."""
        # A role type may be held by several people, but each person only once.
        roles_by_type = self.roles.setdefault(role.type, [])
        if role.authorId not in [r.authorId for r in roles_by_type]:
            roles_by_type.append(role)

        # A person may hold several role types, but each type only once.
        roles_by_authorId = self.roles_by_authorId.setdefault(role.authorId, [])
        if role.type not in [r.type for r in roles_by_authorId]:
            roles_by_authorId.append(role)

    def get_authors_by_role_type(self, role_type):
        """Return the author ids holding `role_type` (empty list if none)."""
        return [role.authorId for role in self.roles.get(role_type, [])]

    def get_roles_by_authorId(self, authorId):
        """Return the role types held by `authorId` (empty list if unknown).

        An unknown author yields an empty list, matching
        `get_authors_by_role_type`, instead of an implicit None.
        """
        return [role.type for role in self.roles_by_authorId.get(authorId, [])]

    def get_set_of_authors(self):
        """Return the set of distinct author ids that hold at least one role."""
        authors = set()
        for role_type in self.roles.keys():
            authors.update(self.get_authors_by_role_type(role_type))
        return authors

    def get_diversity_index(self):
        """Return the string "(<number of role types>,<number of distinct authors>)"."""
        role_count = len(self.roles)
        # getting the unique authors playing the roles
        author_count = len(self.get_set_of_authors())
        return "(%s,%s)" % (role_count, author_count)

    def get_unique_roles(self):
        """Count the role types that bring in at least one not-yet-seen author.

        Role types are visited in the order reporter, assignee, fixer,
        commenter, voter. The count starts at 1 for the reporter
        (NOTE(review): this holds even when no reporter role is registered -
        confirm that a reporter is always present).
        """
        participated_authors = set(self.get_authors_by_role_type('reporter'))
        # process reporter and assignee first
        unique_roles_count = 1
        assignee = self.get_authors_by_role_type('assignee')
        diff = set(assignee).difference(participated_authors)
        if len(diff) > 0:
            unique_roles_count += 1
        participated_authors.update(assignee)

        # then process fixers
        fixers = self.get_authors_by_role_type('fixer')
        diff = set(fixers).difference(participated_authors)
        if len(diff) > 0:
            unique_roles_count += 1
        participated_authors.update(fixers)

        # commenters and voters are a special case because voters are also commenters,
        # so both are compared against the same set of reporter/assignee/fixer authors
        commenters = self.get_authors_by_role_type('commenter')
        voters = self.get_authors_by_role_type('voter')
        only_commenters = set(commenters).difference(participated_authors)
        if len(only_commenters) > 0:
            unique_roles_count += 1

        # voters count as one more role only when, besides the new voters,
        # there are also new commenters that did not vote
        only_voters = set(voters).difference(participated_authors)
        if len(only_voters) > 0:
            diff = only_commenters.difference(only_voters)
            if len(diff) > 0:
                unique_roles_count += 1
        return unique_roles_count

    def __repr__(self):
        return f'roles: {self.roles}'

class Comment:
    """Plain record holding the data of one comment posted on a bug report."""

    def __init__(self, key, authorId, creationDate, voting=0, voting_value=None, char_number=0, token_number=0):
        # where, by whom and when the comment was posted
        self.key, self.authorId, self.creationDate = key, authorId, creationDate
        # vote information: a `voting` value greater than 0 is counted as a vote
        self.voting, self.voting_value = voting, voting_value
        # size of the comment text, in characters and in tokens
        self.char_number, self.token_number = char_number, token_number

class CollaborationParticipant:
    """Activity of one contributor, broken down by bug report key.

    For every report the participant touched, the class tracks the comments
    written, the number of votes, fixes, assignments and reportings. Running
    totals over all reports are kept in the `n_*` attributes.
    """

    def __init__(self, authorId):
        self.authorId = authorId
        # Each *_by_report dict maps a bug report key to this participant's
        # activity on that report; each n_* is the total over all reports.
        self.reportings_by_report = {}
        self.n_reportings = 0
        self.assignments_by_report = {}
        self.n_assignments = 0
        self.comments_by_report = {}  # key -> list of Comment objects
        self.n_comments = 0
        self.fixes_by_report = {}
        self.n_fixes = 0
        self.votes_by_report = {}
        self.n_votes = 0

    def add_bug_report(self, key):
        """Make sure every per-report counter has an entry for `key`.

        Existing entries are never reset, so this is safe to call at any
        time, including after activity has already been recorded for `key`.
        """
        self.comments_by_report.setdefault(key, [])
        self.fixes_by_report.setdefault(key, 0)
        self.votes_by_report.setdefault(key, 0)
        self.reportings_by_report.setdefault(key, 0)
        self.assignments_by_report.setdefault(key, 0)

    def add_comment(self, comment):
        """Record `comment`; it also counts as a vote when `comment.voting > 0`."""
        # Register the report first so the vote counter below always exists,
        # even for a report that was never passed to add_bug_report.
        self.add_bug_report(comment.key)
        self.comments_by_report[comment.key].append(comment)
        self.n_comments += 1
        # a NaN `voting` value compares False and is therefore not a vote
        if comment.voting > 0:
            self.votes_by_report[comment.key] += 1
            self.n_votes += 1

    def add_reporting(self, key):
        """Record that this participant reported bug `key`."""
        self.add_bug_report(key)
        self.reportings_by_report[key] += 1
        self.n_reportings += 1

    def add_assignment(self, key):
        """Record that this participant was assigned to bug `key`."""
        self.add_bug_report(key)
        self.assignments_by_report[key] += 1
        self.n_assignments += 1

    def get_comments_by_report(self, key):
        """Return the comments written on `key` (empty list if none)."""
        return self.comments_by_report.get(key, [])

    def get_number_of_comments_by_report(self, key):
        """Return how many comments were written on `key`."""
        return len(self.comments_by_report.get(key, []))

    def set_fix(self, key, fixnum ):
        """Set the number of fixes on `key` to `fixnum`.

        The total is adjusted by the difference with the previous value, so
        setting the same report twice does not count its fixes twice.
        """
        previous = self.fixes_by_report.get(key, 0)
        self.add_bug_report(key)
        self.fixes_by_report[key] = fixnum
        self.n_fixes += fixnum - previous

    def get_number_of_fixes_by_report(self, key):
        """Return the number of fixes recorded on `key` (0 if none)."""
        return self.fixes_by_report.get(key, 0)

    def get_number_of_votes_by_report(self, key):
        """Return the number of votes cast on `key` (0 if none)."""
        return self.votes_by_report.get(key, 0)

    def get_assignment_by_report(self, key):
        """Return the number of assignments recorded on `key` (0 if none)."""
        return self.assignments_by_report.get(key, 0)

    def get_reporting_by_report(self, key):
        """Return the number of reportings recorded on `key` (0 if none)."""
        return self.reportings_by_report.get(key, 0)

    def get_number_of_interactions_by_report(self, key):
        """Return comments + votes + assignments + reportings + fixes on `key`."""
        return (
            self.get_number_of_comments_by_report(key)
            + self.get_number_of_votes_by_report(key)
            + self.get_assignment_by_report(key)
            + self.get_reporting_by_report(key)
            + self.get_number_of_fixes_by_report(key)
        )

    def __repr__(self):
        return f'id: {self.authorId}, n_comments: {self.n_comments}, n_fixes: {self.n_fixes}, n_votes: {self.n_votes}, n_assignments: {self.n_assignments}, n_reportings: {self.n_reportings}'
    

class BugReportCollaboration:
    """Collaboration around one bug report: participants, roles and metrics.

    Participants are attached with `add_participant`; the `compute_*`
    methods then derive time, size and diversity metrics from their comments
    and roles, and `to_list` flattens everything into one dataset row.
    The `compute_*` methods expect at least one comment to be present.
    """

    def __init__(self, key, priority, component, creation_date, resolution_date, summary_token_number, 
                 description_token_number, summary_char_number, description_char_number, reporterId):
        self.key = key
        self.priority = priority
        self.component = component
        self.bugReportCreationDate = creation_date
        self.bugReportResolutionDate = resolution_date
        self.summaryTokenNumber = summary_token_number
        self.descriptionTokenNumber = description_token_number
        self.summaryCharNumber = summary_char_number
        self.descriptionCharNumber = description_char_number
        self.reporterId = reporterId
        self.assigneeId = None
        self.n_comments = 0
        self.n_fixes = 0
        self.n_votes = 0
        self.comments = []
        self.participants = {}  # author id -> participant object

        # time metrics
        # NOTE: firstCommentDate / lastCommentDate hold the earliest / latest
        # Comment *object*; the actual dates are their `creationDate`.
        self.firstCommentDate = None
        self.lastCommentDate = None
        self.discussionEndDate = None
        self.discussionStartDate = None
        self.discussionDuration = None
        self.bugFixingTime = None
        self.delayBeforeResponse = None 
        self.delayEndDiscussionCloseBug = None

        # size metrics
        self.averageCommentSize = None
        self.maxCommentSize = None
        self.averageTokenNumber = None
        self.maxTokenNumber = None

        # comments metrics
        self.averageCommentsPerParticipant = None
        self.medianCommentsPerParticipant = None

        # roles
        self.roles = RoleList()
        self.possibleRoleTypes = ['reporter', 'assignee', 'commenter', 'fixer', 'voter']

        # diversity metrics
        self.npc = 0  # number of participants in the collaboration 
        self.npi = 0 # normalized number of participants in the collaboration      
        self.nrc = 0 # number of roles in the collaboration
        self.nri = 0 # normalized number of roles in the collaboration 
        self.urc = 0 # number of unique roles in the collaboration
        self.urn = 0 # legacy name, kept for backward compatibility; the class uses `urc`
        self.uri = 0 # normalized number of unique roles in the collaboration
        self.interEquality = None
        self.roleDiversityIndex = None
        self.roleDiversity = None  # legacy name, kept for backward compatibility; the class uses `roleDiversityIndex`
        
    def add_participant(self, participant):
        """Attach `participant` once and register roles, comments, fixes and votes."""
        if participant.authorId not in self.participants:
            self.participants[participant.authorId] = participant

            n_comments = participant.get_number_of_comments_by_report(self.key)
            n_fixes = participant.get_number_of_fixes_by_report(self.key)
            n_votes = participant.get_number_of_votes_by_report(self.key)

            # one role per kind of activity the participant has on this report
            if participant.get_assignment_by_report(self.key) > 0:
                role = Role('assignee', participant.authorId)
                self.roles.add_role(role)
            if participant.get_reporting_by_report(self.key) > 0:
                role = Role('reporter', participant.authorId)
                self.roles.add_role(role)
            if n_comments > 0:
                role = Role('commenter', participant.authorId)
                self.roles.add_role(role)
            if n_fixes > 0:
                role = Role('fixer', participant.authorId)
                self.roles.add_role(role)
            if n_votes > 0:
                role = Role('voter', participant.authorId)
                self.roles.add_role(role)

            self.add_comments(participant.get_comments_by_report(self.key))
            
            self.n_fixes += n_fixes
            self.n_votes += n_votes

    def get_number_of_participants(self):
        """Return the number of attached participants."""
        # it includes the reporter, assignee and fixers as participants in the discussion
        return len(self.participants)
    
    def get_number_of_roles_by_report(self, key):
        """Return the number of distinct role types present in this report.

        The previous body called a `get_roles_by_report` method that does not
        exist. `key` is kept in the signature for backward compatibility and
        is not used, because the instance already represents a single report.
        NOTE(review): confirm this is the intended meaning.
        """
        return len(self.roles.roles)
    
    def get_number_of_commenters(self):
        """Return the number of distinct authors holding the commenter role."""
        commenters = self.roles.get_authors_by_role_type('commenter')
        return len(commenters)
        
    def get_all_distinct_fixers(self):
        """Return the author ids holding the fixer role."""
        return self.roles.get_authors_by_role_type('fixer')
    
    def get_all_distinct_voters(self):
        """Return the author ids holding the voter role."""
        return self.roles.get_authors_by_role_type('voter')

    def get_assignee(self):
        """Return the first registered assignee id, or -1 when there is none."""
        assignee_id = -1
        role_assignee = self.roles.get_authors_by_role_type('assignee')
        if len(role_assignee) > 0:
            assignee_id = role_assignee[0]
        return assignee_id

    def get_reporter(self):
        """Return the first registered reporter id, or -1 when there is none."""
        reporter_id = -1
        role_reporter = self.roles.get_authors_by_role_type('reporter')
        if len(role_reporter) > 0:
            reporter_id = role_reporter[0]
        return reporter_id
    
    def add_comments(self, comments):
        """Append `comments` to the thread and refresh the comment counter."""
        self.comments.extend(comments)
        self.n_comments = len(self.comments)

    def compute_discussion_duration(self):
        """Set first/last comment, discussion start/end and duration in hours."""
        self.firstCommentDate = min(self.comments, key=lambda x: x.creationDate)
        self.lastCommentDate = max(self.comments, key=lambda x: x.creationDate)
        
        self.discussionStartDate = self.firstCommentDate.creationDate
        self.discussionEndDate = self.lastCommentDate.creationDate
        
        time_in_seconds = (self.discussionEndDate - self.discussionStartDate).total_seconds()
        
        # computing discussion duration in hours
        self.discussionDuration = round(time_in_seconds / 3600, 2)

    def compute_other_time_metrics(self):
        """Set fixing time and response/closing delays, all in hours.

        Must run after `compute_discussion_duration`, which sets the first
        and last comment used here.
        """
        #computing bug fixing time in hours
        bft_time_in_seconds = (self.bugReportResolutionDate - self.bugReportCreationDate).total_seconds()
        self.bugFixingTime = round(bft_time_in_seconds / 3600, 2)

        # computing delay before response in hours
        dbr_time_in_seconds = (self.firstCommentDate.creationDate - self.bugReportCreationDate).total_seconds()
        self.delayBeforeResponse = round(dbr_time_in_seconds / 3600, 2)
        
        # computing delay to close the bug after the end of the discussion in hours
        # (negative when comments were still posted after the resolution date)
        dec_time_in_seconds = (self.bugReportResolutionDate - self.lastCommentDate.creationDate).total_seconds()
        self.delayEndDiscussionCloseBug = round(dec_time_in_seconds / 3600, 2)

    @staticmethod
    def _max_ignoring_missing(values):
        """Return the largest non-missing value, or NaN when all are missing."""
        present = [value for value in values if not pd.isna(value)]
        return max(present) if present else float('nan')

    def compute_discussion_size_metrics(self):
        """Set average and maximum comment size, in characters and in tokens."""
        char_numbers = [comment.char_number for comment in self.comments]
        token_numbers = [comment.token_number for comment in self.comments]

        # computing the average comment size
        # (a missing size still propagates and makes the average NaN)
        self.averageCommentSize = round(sum(char_numbers) / self.n_comments, 1)
        self.averageTokenNumber = round(sum(token_numbers) / self.n_comments, 1)

        # computing the maximum comment size; missing sizes are skipped because
        # comparisons against NaN are always False, which made the plain
        # max() result depend on the order of the comments
        self.maxCommentSize = self._max_ignoring_missing(char_numbers)
        self.maxTokenNumber = self._max_ignoring_missing(token_numbers)

    def compute_list_of_interactions(self):
        """Return each participant's number of interactions on this report."""
        return [participant.get_number_of_interactions_by_report(self.key) for participant in self.participants.values()]

    def compute_collab_diversity_metrics(self):      
        """Set the per-participant comment statistics and the diversity metrics."""
        self.averageCommentsPerParticipant = round(len(self.comments) / self.get_number_of_commenters(), 1)

        comments = [participant.get_number_of_comments_by_report(self.key) for participant in self.participants.values()]
        self.medianCommentsPerParticipant = round(median(comments), 1)

        self.npc = len(self.participants)
        
        self.nrc = len(self.roles.roles)
        self.nri = round(self.nrc / len(self.possibleRoleTypes), 2)

        # sanity check: every participant is expected to hold at least one role
        if self.npc != len(self.roles.get_set_of_authors()):
            print("number of participants is different from the number of authors playing roles-> " + str(self))
        
        self.urc = self.roles.get_unique_roles()
        self.uri = round(self.urc / len(self.possibleRoleTypes), 2)

        interactions = self.compute_list_of_interactions()
        self.interEquality = self.compute_collab_equality(interactions)
        self.roleDiversityIndex = self.compute_role_diversity_index()        

    def compute_and_set_npi(self, denominator):
        """Set `npi` to `npc / denominator`, rounded to 2 digits and capped at 1."""
        self.npi = min(round(self.npc / denominator, 2), 1.00)

    def compute_role_diversity_index(self):
        """Return the role diversity index string produced by the role list."""
        return self.roles.get_diversity_index()

    def compute_collab_equality(self, interactions):
        """Return the equality score (1 - Gini) of the `interactions` counts."""
        inter_array = np.array(interactions, dtype=np.float64)
        return self.compute_equality(inter_array)

    def compute_equality(self, array):
        """ Calculate the diversity based on the Gini coefficient of a numpy array.

        Returns 1 - Gini, rounded to 2 digits: 1.0 means all values are equal.
        """
        # based on bottom eq:
        # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
        # from:
        # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
        # All values are treated equally, arrays must be 1d:
        array = array.flatten()
        if np.amin(array) < 0:
            # Values cannot be negative:
            array -= np.amin(array)
        # Values cannot be 0:
        array += 0.0000001
        # Values must be sorted:
        array = np.sort(array)
        # Index per array element:
        index = np.arange(1,array.shape[0]+1)
        # Number of array elements:
        n = array.shape[0]
        gini = round((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array)), 2)
        # returning 1  - Gini coefficient to obtain equality instead of inequality;
        # rounded again because the subtraction can introduce floating point
        # noise (e.g. 1.0 - 0.67 gives 0.32999999999999996)
        return round(1.0 - gini, 2)

    def to_list(self):
        """Return the report as one flat row, in the column order of the dataset."""
        bug_report_comment_thread = [
            self.key,
            self.get_reporter(),
            self.get_assignee(),
            len(self.comments),
            self.npc,
            self.nrc,
            self.urc,
            self.npi,
            self.nri,
            self.uri,
            self.get_number_of_commenters(),
            len(self.get_all_distinct_voters()),
            len(self.get_all_distinct_fixers()),
            self.firstCommentDate.creationDate,
            self.lastCommentDate.creationDate,
            self.discussionDuration,
            self.bugReportCreationDate,
            self.bugReportResolutionDate,
            self.priority,
            self.component,
            self.summaryTokenNumber,
            self.descriptionTokenNumber,
            self.summaryCharNumber,
            self.descriptionCharNumber,
            self.averageCommentSize,
            self.maxCommentSize,
            self.averageTokenNumber,
            self.maxTokenNumber,
            self.interEquality,
            self.roleDiversityIndex,
            self.delayBeforeResponse,
            self.delayEndDiscussionCloseBug,
            self.averageCommentsPerParticipant,
            self.medianCommentsPerParticipant,
            self.bugFixingTime
        ]
        return bug_report_comment_thread
    
    def __repr__(self):
        return f'key: {self.key}, n_comments: {len(self.comments)}, n_commenters: {self.get_number_of_commenters()}, n_participants: {len(self.participants)}'

In [72]:
def build_bug_reports_other_data(contributors_summary_df):
    """Collect the report-level attributes of every bug report.

    Args:
        contributors_summary_df: frame with one row per (bug report,
            contributor), holding the report attributes plus the
            'Reporter' flag and the 'Contributor' id.

    Returns:
        dict mapping each report key to a 9-tuple
        (priority, component, creation date, resolution date,
        summary token number, description token number,
        summary char number, description char number, reporter id).
        The reporter id is None when no row of the report is flagged as
        reporter, so the tuple always has the 9 fields callers unpack.
    """
    report_fields = ['Priority', 'Component', 'CreationDate', 'ResolutionDate',
                     'SummaryTokenNumber', 'DescriptionTokenNumber',
                     'SummaryCharNumber', 'DescriptionCharNumber']
    bug_reports_data = {}
    for _, row in contributors_summary_df.iterrows():
        key = row['Key']
        is_reporter = row['Reporter'] == 1.0
        # The first row seen for a report provides its attributes; a row
        # flagged as reporter replaces the entry and adds the reporter id.
        if key not in bug_reports_data or is_reporter:
            reporter_id = row['Contributor'] if is_reporter else None
            bug_reports_data[key] = tuple(row[field] for field in report_fields) + (reporter_id,)
    return bug_reports_data

In [73]:
def build_participants_contrib_list(contributors_summary_df, comments_df, with_bots=False):
    """Build one CollaborationParticipant per contributor and comment author.

    Args:
        contributors_summary_df: frame with one row per (bug report,
            contributor) and the columns 'Key', 'Contributor', 'Reporter',
            'Assignee', 'WorkedCommits' and 'IntegratedCommits'.
        comments_df: frame with one row per comment and the columns 'key',
            'Author', 'CreationDate', 'voting', 'votes',
            'comment_char_number' and 'commen_token_number'.
        with_bots: when False, the ids listed in CONTRIBUTORS_BOTS are
            skipped both as contributors and as comment authors.

    Returns:
        dict mapping an author id to its CollaborationParticipant.
    """
    # ids to leave out; empty when bots are included
    excluded = set() if with_bots else set(CONTRIBUTORS_BOTS)
    # a set keeps the per-comment membership test constant-time
    valid_keys = set()
    participants = {}

    def get_or_create(author_id):
        # one shared participant object per author id
        if author_id not in participants:
            participants[author_id] = CollaborationParticipant(author_id)
        return participants[author_id]

    for _, row in contributors_summary_df.iterrows():
        key = row['Key']
        # the report stays valid even when this particular contributor is skipped
        valid_keys.add(key)
        contributor = row['Contributor']
        if contributor in excluded:
            continue
        participant = get_or_create(contributor)
        participant.add_bug_report(key)
        if row['Reporter'] > 0:
            participant.add_reporting(key)
        if row['Assignee'] > 0:
            participant.add_assignment(key)
        participant.set_fix(key, row['WorkedCommits'] + row['IntegratedCommits'])

    # only comments on reports present in the contributors summary are kept
    for _, row in comments_df.iterrows():
        key = row['key']
        author = row['Author']
        if key not in valid_keys or author in excluded:
            continue
        participant = get_or_create(author)
        # Register the report before adding the comment: an author who has no
        # contributor row for this report would otherwise have no vote
        # counter for it when the comment is a vote.
        participant.add_bug_report(key)
        comment = Comment(key, author, row['CreationDate'], row['voting'],
                          row['votes'], row['comment_char_number'], row['commen_token_number'])
        participant.add_comment(comment)

    return participants

In [74]:
def aggregate_by_bug_report(participants, comments_summary_df, contributors_summary_df, out_filename):
    """Aggregate the participants per bug report and write the dataset CSV.

    Args:
        participants: dict mapping author id to CollaborationParticipant.
        comments_summary_df: not used; kept so existing calls keep working.
        contributors_summary_df: frame with one row per (bug report,
            contributor), read through its 'Key' and 'Contributor' columns.
        out_filename: path of the ';'-separated CSV file to write.

    Returns:
        (bug_report_df, bug_reports): the frame written to disk, with one
        row per report that has at least one comment, and the dict of all
        BugReportCollaboration objects by key. Returns -1 instead, after
        printing a message, when a contributor has no entry in `participants`.
    """
    column_names = ['Key', 'ReporterId', 'AssigneeId', 'NComments', 'NPC', 'NRC', 'URC', 'NPI', 'NRI', 'URI', 'NCommenters', 'NVoters', 'NFixers',
                    'FirstCommentDate', 'LastCommentDate', 'DiscussionDuration',
                    'BugReportCreationDate', 'BugReportResolutionDate', 'Priority', 'Component',
                    'SummaryTokenNumber', 'DescriptionTokenNumber', 'SummaryCharNumber', 'DescriptionCharNumber',
                    'AvgCommentSize', 'MaxCommentSize', 'AvgTokenNumber', 'MaxTokenNumber',
                    'IEI', 'RDI', 'DBR', 'DEC', 'AverageCommentsPerParticipant', 'MedianCommentsPerParticipant', 'BFT']

    bug_reports = {}
    bug_reports_data = build_bug_reports_other_data(contributors_summary_df)

    # adding participants' interactions to bug reports
    for _, row in contributors_summary_df.iterrows():
        if row.Key not in bug_reports:
            details = tuple(bug_reports_data[row.Key])
            if len(details) == 8:
                # no reporter row was found for this report: no reporter id
                details += (None,)
            bug_reports[row.Key] = BugReportCollaboration(row.Key, *details)
        if row.Contributor not in participants:
            print('Contributor not found: ' + row.Contributor)
            return -1
        bug_reports[row.Key].add_participant(participants[row.Contributor])

    # computing threshold of number of participants for normalization:
    # the upper outlier fence (Q3 + 1.5 * IQR) of the participants-per-report counts
    part_array = np.array([bug_report.get_number_of_participants() for bug_report in bug_reports.values()],
                          dtype=np.float64)
    upper_bound = None
    if part_array.size > 0:
        q3, q1 = np.percentile(part_array, [75, 25])
        upper_bound = q3 + (1.5 * (q3 - q1))

    # Rows are collected in a list and turned into a frame once; concatenating
    # a one-row frame per report copies the whole frame at every step.
    rows = []
    for bug_report in bug_reports.values():
        # reports without comments have no discussion to measure
        if bug_report.n_comments > 0:
            bug_report.compute_discussion_duration()
            bug_report.compute_other_time_metrics()
            bug_report.compute_discussion_size_metrics()
            bug_report.compute_collab_diversity_metrics()
            bug_report.compute_and_set_npi(upper_bound)
            rows.append(bug_report.to_list())

    # with no commented report this is an empty frame that still has the header
    bug_report_df = pd.DataFrame(rows, columns=column_names)

    #saving to csv
    bug_report_df.to_csv(out_filename, sep=';', encoding='utf-8', index=False)
    return (bug_report_df, bug_reports)

In [75]:
# Build the collaboration dataset for the configuration selected by INCLUDE_BOTS.
if not INCLUDE_BOTS:
    outfile = '../datasets/bug_report_collab_dataset.csv'
    participants = build_participants_contrib_list(contributors_summary_df, comments_df)
    (bug_report_df, bug_reports) = aggregate_by_bug_report(participants, comments_summary_df, contributors_summary_df, outfile)
else:
    outfile = '../datasets/bug_report_collab_dataset_bots.csv'
    # `comments_df` is the unfiltered comment table loaded from DuckDB, so it
    # already includes the bot comments; no `comments_bots_df` is defined
    # anywhere in this notebook.
    participants = build_participants_contrib_list(contributors_summary_bots_df, comments_df, with_bots=True)
    (bug_report_df, bug_reports) = aggregate_by_bug_report(participants, comments_summary_bots_df, contributors_summary_bots_df, outfile)

In [76]:
bug_report_df.describe()

Unnamed: 0,NComments,NPC,NRC,URC,NPI,NRI,URI,NCommenters,NVoters,NFixers,...,AvgCommentSize,MaxCommentSize,AvgTokenNumber,MaxTokenNumber,IEI,DBR,DEC,AverageCommentsPerParticipant,MedianCommentsPerParticipant,BFT
count,10356.0,10356.0,10356.0,10356.0,10356.0,10356.0,10356.0,10356.0,10356.0,10356.0,...,10353.0,10354.0,10353.0,10354.0,10356.0,10356.0,10356.0,10356.0,10356.0,10356.0
mean,9.810641,3.361723,4.52443,2.836327,0.474798,0.904886,0.567265,3.139533,1.186752,0.699981,...,104.562011,375.931427,18.404675,63.919645,0.740849,487.4765,-885.641079,2.938808,2.466783,2077.063202
std,9.133981,1.54885,0.695354,0.826386,0.194513,0.139071,0.165277,1.535335,0.761889,0.49652,...,95.072245,814.986316,16.249347,161.059246,0.138729,3341.83648,3995.190859,1.774835,1.697431,6359.903674
min,1.0,1.0,2.0,1.0,0.14,0.4,0.2,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.31,0.0,-67291.44,1.0,0.0,0.0
25%,4.0,2.0,4.0,2.0,0.29,0.8,0.4,2.0,1.0,0.0,...,48.0,93.0,9.0,17.0,0.64,0.08,-20.2275,1.8,1.5,40.645
50%,7.0,3.0,5.0,3.0,0.43,1.0,0.6,3.0,1.0,1.0,...,81.7,211.0,14.6,36.0,0.73,0.62,-0.0,2.5,2.0,190.065
75%,12.0,4.0,5.0,3.0,0.57,1.0,0.6,4.0,2.0,1.0,...,132.5,450.0,23.2,75.0,0.83,21.455,0.0,3.5,3.0,934.4025
max,157.0,17.0,5.0,5.0,1.0,1.0,1.0,16.0,6.0,3.0,...,2693.0,49850.0,527.5,11891.0,1.0,47781.62,48138.66,27.0,27.0,66861.57


In [77]:
bug_reports['HADOOP-15355'].participants


{'id18': id: id18, n_comments: 39, n_fixes: 0.0, n_votes: 1, n_assignments: 11, n_reportings: 9,
 'id77': id: id77, n_comments: 1094, n_fixes: 306.0, n_votes: 176, n_assignments: 78, n_reportings: 58,
 'id920': id: id920, n_comments: 1, n_fixes: 0.0, n_votes: 0, n_assignments: 0, n_reportings: 0,
 'id1189': id: id1189, n_comments: 1159, n_fixes: 206.0, n_votes: 173, n_assignments: 41, n_reportings: 102}