In [1]:
import pandas
#from jira import JIRA
import shutil
import warnings

# Suppress every warning category for the rest of the notebook run.
# NOTE(review): this also hides pandas deprecation/future warnings raised by the cells below —
# consider narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

Variables

In [2]:
# Project prefixes; each one selects a "<project>-bug-fix-dataset.csv" input file.
PROJECTS = ['hadoop', 'hdfs', 'yarn', 'mapreduce']
# Root folder of the original dataset; it holds one sub-folder per file type
# (snapshot, comment-log, changelog, commit-log).
PATH_ORIGINAL_DATASET_FILES = "original-dataset/"
# Folder holding the social-dataset input files and the intermediate CSVs written by this notebook.
PATH_SOCIAL_DATASET_FILES = "social-dataset/"
# Folder receiving the generated contributor lists and the rewritten dataset files.
PATH_GENERATED_FILES = PATH_SOCIAL_DATASET_FILES + "generated_dataset/"
# JIRA usernames that are presumably automated accounts (going by the constant's name).
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm it is still needed.
BOTS_USERNAMES_JIRA = ['githubbot', 'genericqa', 'HadoopDev', 'hadoopqa', 'hudson', 'jiraposter@reviews.apache.org']

---------------------------------------- Utilized classes --------------------------------------------------------------


In [3]:
class ApacheMember:
    """Plain record describing one project member.

    Attributes:
        username: member username (used as the key when members are merged).
        name: display name of the member.
        organization: organization the member is listed under.
        roles: roles listed for the member.
        timezone: timezone listed for the member.
        pmc: True when the member comes from the PMC list.
        committer: True when the member appears in the committers list.
    """

    def __init__(self, username, name, organization, roles, timezone, pmc, committer):
        self.username = username
        self.name = name
        self.organization = organization
        self.roles = roles
        self.timezone = timezone
        self.pmc = pmc
        self.committer = committer

    def csv_line(self):
        """Return the member as one ';'-separated CSV line terminated by a newline.

        The seven fields are emitted in the order
        username;name;organization;roles;timezone;pmc;committer. Each field is
        written exactly once, and every value goes through ``str`` so that
        missing (NaN) usernames or names do not raise a ``TypeError``.
        """
        fields = (self.username, self.name, self.organization, self.roles,
                  self.timezone, self.pmc, self.committer)
        return ';'.join(str(field) for field in fields) + '\n'

def remove_nan_str_pandas(string):
    """Return ``str(string)``, or an empty string when pandas reports the value as missing."""
    if pandas.isnull(string):
        return ""
    return str(string)

class ContributorInfo:
    """Unified contributor record linking a git name to a JIRA username/name under one id."""

    def __init__(self, id, name_git, username_jira, name_jira):
        self.id, self.name_git = id, name_git
        self.username_jira, self.name_jira = username_jira, name_jira

    def to_csv_line(self):
        """Return 'id;name_git;username_jira;name_jira' plus a newline; missing values become ''."""
        columns = (self.id, self.name_git, self.username_jira, self.name_jira)
        return ';'.join(remove_nan_str_pandas(value) for value in columns) + '\n'

--------------------------------------------------------- Support functions --------------------------------------------------------------


In [4]:
# Read the file and return a pandas dataset
# type -> snapshot, comment-log, changelog or commit-log
def get_info_project_in_dataset(type):
    """Load the '<project>-bug-fix-dataset.csv' file of every project and stack them.

    Args:
        type: sub-folder of the original dataset to read from
            (snapshot, comment-log, changelog or commit-log).

    Returns:
        One DataFrame holding the rows of all projects, re-indexed from 0.
    """
    frames = [
        pandas.read_csv(PATH_ORIGINAL_DATASET_FILES + type + "/" + project + "-bug-fix-dataset.csv",
                        index_col=None, header=0, delimiter=';')
        for project in PROJECTS
    ]
    return pandas.concat(frames, ignore_index=True)

In [5]:
# Generate the files listing the names of all contributors (JIRA and git)
def do_mine_contributors_names():
    """Collect the distinct contributor names found in the JIRA and git datasets.

    JIRA names come from the snapshot (Reporter, Assignee), comment-log (Author)
    and changelog (Author) files; git names come from the commit-log
    (Author, Committer). Both lists are written to CSV files under
    PATH_SOCIAL_DATASET_FILES and also returned.

    Names are de-duplicated with pandas in first-seen order, so the generated
    files are identical from run to run (iterating a Python set of strings is
    not), a missing value appears at most once, and empty inputs yield an
    empty 'Name' column instead of raising.

    Returns:
        (jira_team_dataset, git_team_dataset): two DataFrames with a single
        'Name' column each.
    """
    snapshot_dataset = get_info_project_in_dataset("snapshot")
    commentlog_dataset = get_info_project_in_dataset("comment-log")
    changelog_dataset = get_info_project_in_dataset("changelog")
    commitlog_dataset = get_info_project_in_dataset("commit-log")

    jira_names = pandas.concat(
        [snapshot_dataset['Reporter'], snapshot_dataset['Assignee'],
         commentlog_dataset['Author'], changelog_dataset['Author']],
        ignore_index=True)
    git_names = pandas.concat(
        [commitlog_dataset['Author'], commitlog_dataset['Committer']],
        ignore_index=True)

    # drop_duplicates keeps the first occurrence and treats all missing values as equal.
    jira_team_dataset = jira_names.drop_duplicates().to_frame(name='Name').reset_index(drop=True)
    git_team_dataset = git_names.drop_duplicates().to_frame(name='Name').reset_index(drop=True)

    jira_team_dataset.to_csv(PATH_SOCIAL_DATASET_FILES + "jira-community-contributors-names.csv", sep=';',
                             encoding='utf-8', index=False, na_rep='NULL')
    git_team_dataset.to_csv(PATH_SOCIAL_DATASET_FILES + "git-community-contributors-names.csv", sep=';',
                            encoding='utf-8', index=False, na_rep='NULL')

    return jira_team_dataset, git_team_dataset

In [6]:
# Unify jira records that have the same name (create the association Name -> Usernames).
def group_by_name_usernames_of_jira(df_jira_usernames=None):
    """Merge JIRA user records that share the same display name.

    Usernames belonging to one name are joined with ' | '. Records whose name
    is missing, empty or '--------' are never merged: each gets a unique
    'Without name - <n>' placeholder. The 'bot' flag of a merged record is the
    one of the first record seen for that name.

    The result is written to 'jira_hadoop_users.csv' under PATH_GENERATED_FILES
    (UTF-8, the encoding pandas assumes when the file is read back) and
    returned as a DataFrame.

    Args:
        df_jira_usernames: DataFrame with 'username', 'Name' and 'bot' columns;
            when None it is read from 'database_user_jira.csv'.

    Returns:
        DataFrame with columns 'Username', 'Name' and 'bot', one row per name.
    """
    if df_jira_usernames is None:
        df_jira_usernames = pandas.read_csv(PATH_SOCIAL_DATASET_FILES + "database_user_jira.csv", index_col=None,
                                            header=0, delimiter=';')

    # display name -> {'usernames': [...], 'bot': flag of the first record}
    grouped = {}
    unnamed_count = 1

    for username, name, bot in zip(df_jira_usernames['username'],
                                   df_jira_usernames['Name'],
                                   df_jira_usernames['bot']):
        if pandas.isnull(name) or not name or name == '--------':
            # Unnamed users cannot be matched to each other, so each keeps its own entry.
            name = 'Without name - ' + str(unnamed_count)
            unnamed_count += 1
            grouped[name] = {'usernames': [username], 'bot': bot}
        elif name in grouped:
            grouped[name]['usernames'].append(username)
        else:
            grouped[name] = {'usernames': [username], 'bot': bot}

    # Build all rows first and create the DataFrame once (no quadratic concat in a loop).
    records = [
        {'Username': ' | '.join(str(user) for user in entry['usernames']),
         'Name': str(name),
         'bot': entry['bot']}
        for name, entry in grouped.items()
    ]

    # The context manager guarantees the file is closed even if a write fails.
    with open(PATH_GENERATED_FILES + 'jira_hadoop_users.csv', 'w', encoding='utf-8') as cons:
        cons.write('username;Name;bot\n')
        for record in records:
            cons.write(record['Username'] + ';' + record['Name'] + ';' + str(record['bot']) + '\n')

    return pandas.DataFrame(records, columns=['Username', 'Name', 'bot'])

In [7]:
# Join the info of the Hadoop member tabs (official contributors): PMC and committers
def union_pmc_and_committers_members_file():
    """Merge the PMC and committers member lists into a single list keyed by username.

    Every PMC row becomes a member with pmc=True / committer=False. A committer
    row either flags an already known member as committer or adds a new member
    with pmc=False / committer=True. The merged list is written to
    'hadoop-members-10-05-2020-full.csv' under PATH_SOCIAL_DATASET_FILES and
    returned as a DataFrame.

    The header line is terminated with a newline so that the first member
    starts on its own line.

    Returns:
        DataFrame with columns username, Name, organization, roles, timezone,
        isPmc, isCommitter (all but 'username' stored as strings).
    """
    # list of contributors taken from the apache website
    pmc = pandas.read_csv(PATH_SOCIAL_DATASET_FILES + "hadoop-members-10-05-2020_aba_pmc.csv", index_col=None, header=0,
                          delimiter=';')
    # manually generated list with information on all contributors who committed to projects
    committers = pandas.read_csv(PATH_SOCIAL_DATASET_FILES + "hadoop-members-10-05-2020_aba_committers.csv", index_col=None,
                                 header=0, delimiter=';')

    columns = ['username', 'Name', 'organization', 'roles', 'timezone', 'isPmc', 'isCommitter']

    # username -> ApacheMember, kept in insertion order (PMC members first).
    members = {}

    for _, row in pmc.iterrows():
        members[row.username] = ApacheMember(row.username, row.Name, row.organization, row.roles,
                                             row.timezone, True, False)

    for _, row in committers.iterrows():
        if row.username in members:
            members[row.username].committer = True
        else:
            members[row.username] = ApacheMember(row.username, row.Name, row.organization, row.roles,
                                                 row.timezone, False, True)

    # The context manager guarantees the file is closed even if a write fails.
    with open(PATH_SOCIAL_DATASET_FILES + 'hadoop-members-10-05-2020-full.csv', 'w') as cons:
        cons.write(';'.join(columns) + '\n')
        for member in members.values():
            cons.write(member.csv_line())

    # Build the DataFrame once instead of concatenating one row at a time.
    records = [
        {'username': member.username, 'Name': str(member.name),
         'organization': str(member.organization), 'roles': str(member.roles),
         'timezone': str(member.timezone), 'isPmc': str(member.pmc),
         'isCommitter': str(member.committer)}
        for member in members.values()
    ]
    return pandas.DataFrame(records, columns=columns)

In [8]:
# Combines the list of contributors taken from the Apache webpage with the git contributors
def union_hadoop_official_members_with_committers_of_dataset(git_team_dataset, official_committers_hadoop_dataset = None):
    """Extend the official member list with git contributors that are not in it.

    'hadoop-members-10-05-2020-full.csv' is copied to
    'committers_and_hadoop_members_git.csv' and one placeholder row is appended
    for every git name absent from the official 'Name' column. The appended
    rows carry seven fields, matching the seven columns
    (username;Name;organization;roles;timezone;isPmc;isCommitter) of the
    returned DataFrame.

    Args:
        git_team_dataset: DataFrame with a 'Name' column of git contributor names.
        official_committers_hadoop_dataset: official member list; when None it is
            read from 'hadoop-members-10-05-2020-full.csv'.

    Returns:
        A copy of the official list followed by the placeholder rows.
    """
    members_path = PATH_SOCIAL_DATASET_FILES + 'hadoop-members-10-05-2020-full.csv'
    merged_path = PATH_SOCIAL_DATASET_FILES + 'committers_and_hadoop_members_git.csv'

    if official_committers_hadoop_dataset is None:
        official_committers_hadoop_dataset = pandas.read_csv(members_path, index_col=None,
                                                             header=0, delimiter=';')

    shutil.copy(members_path, merged_path)

    # Set membership instead of scanning a list for every git name.
    known_names = set(official_committers_hadoop_dataset['Name'].tolist())

    # drop_duplicates keeps first-seen order, so the output is reproducible.
    # Missing names are skipped: they cannot be written as text and identify nobody.
    new_names = [name for name in git_team_dataset['Name'].drop_duplicates()
                 if not pandas.isnull(name) and name not in known_names]

    # The context manager guarantees the file is closed even if a write fails.
    with open(merged_path, 'a') as cons:
        for name in new_names:
            cons.write('----;' + str(name) + ';----;----;----;----;----\n')

    if not new_names:
        return official_committers_hadoop_dataset.copy()

    # Placeholder values in the DataFrame are kept exactly as before for existing consumers.
    placeholder_rows = pandas.DataFrame([
        {'username': '----', 'Name': name, 'organization': '----', 'roles': '----',
         'timezone': '----', 'isPmc': '---', 'isCommitter': '--'}
        for name in new_names
    ])

    return pandas.concat([official_committers_hadoop_dataset, placeholder_rows], ignore_index=True)

In [9]:
# Join information from git and jira, generating a unique id for each
def join_contributors_info():
    """Join the git and JIRA contributor lists and give every contributor a unique id.

    Reads '4-git_team_after_manual_map.csv' (git) and 'jira_hadoop_users.csv'
    (JIRA). Each JIRA record gets the next id and is paired with the first git
    record whose 'Name' contains the JIRA name (case-insensitive). Git records
    whose name was not used as a key by that pass get their own id afterwards.
    The result is written to 'list_of_contributors.csv' under
    PATH_GENERATED_FILES.

    The JIRA name is matched as literal text (regex=False), so names containing
    characters such as '(', ')', '.', '+' or '*' neither raise nor match
    unrelated names, and missing git names are treated as non-matching
    (na=False) — the same options used by the id lookup functions of this
    notebook.
    """
    git = pandas.read_csv(PATH_SOCIAL_DATASET_FILES + "4-git_team_after_manual_map.csv",
                          index_col=None, header=0,
                          delimiter=';')

    jira = pandas.read_csv(PATH_GENERATED_FILES + "jira_hadoop_users.csv",
                           index_col=None, header=0,
                           delimiter=';')

    count = 1
    mapped = {}

    for _, row in jira.iterrows():
        contributor_id = "id" + str(count)

        git_register = git[git['Name'].str.contains(str(row.Name), case=False, regex=False, na=False)]
        if not git_register.empty:
            name = git_register['Name'].iloc[0]
            info = ContributorInfo(contributor_id, name, row.username, row.Name)
        else:
            name = row.Name
            info = ContributorInfo(contributor_id, '', row.username, row.Name)

        # Two JIRA records can resolve to the same key; suffix the id to keep both.
        if name in mapped:
            name = name + contributor_id

        mapped[name] = info
        count += 1

    for _, row in git.iterrows():
        if row.Name not in mapped:
            mapped[row.Name] = ContributorInfo("id" + str(count), row.Name, '', '')
            count += 1

    # UTF-8 is what pandas assumes when this file is read back; the context
    # manager guarantees the file is closed even if a write fails.
    with open(PATH_GENERATED_FILES + 'list_of_contributors.csv', 'w', encoding='utf-8') as arq:
        arq.write("id;name_git;username_jira;name_jira\n")
        for info in mapped.values():
            arq.write(info.to_csv_line())

In [10]:
# Manual mapping done by the authors in the generated dataset, analyzing all records
def get_manual_correction():
    """Return the manual git-name/JIRA-username corrections.

    Returns:
        (git_name, jira_usernames):
        git_name maps a git name (or a git-side username) to a JIRA username;
        jira_usernames is the inverse view, mapping each JIRA username to the
        ', '-joined git names that point to it, plus the usernames flagged as
        wrong associations, which map to ''.
    """
    git_name = {
        "Owen O'Malley": "owen.omalley",
        "Chris Douglas": "chris.douglas",
        "Christopher Douglas": "chris.douglas",
        "Akira Ajisaka": "ajisakaa",
        "Arun C Murthy, Arun C. Murthy": "acmurthy",
        "Arun Murthy": "acmurthy",
        "Anu Engineer": "anu",
        "Bibin A Chundatt": "bibinchundatt",
        "Billie Rinaldi": "billie.rinaldi",
        "Konstantin Boudnik": "cos",
        "Márton Elek": "elek",
        "Eric Payne, Eric E Payne": "eepayne",
        "He Xiaoqiao": "hexiaoqiao",
        "Lohit Vijaya": "renulohit",
        "Nanda kumar": "nandakumar131",
        "Patrick Hunt": "phunt",
        "Prabhu Joseph": "Prabhu Joseph",
        "Scott Chun-Yang Chen": "schen",
        "Subru Krishnan": "subru",
        "Takanobu Asanuma": "tasanuma0829",
        "Andrew Purtell": "apurtell",
        "Thomas Marquardt": "tmarquardt",
        "Vidura Mudalige": "vbmudalige",
        "Devaraj K, Devarajulu K": "devaraj.k",
        'Arpit Agarwal': 'arpitagarwal',
        'sanford ryza': 'sandyr',
        'Bharat Viswanadham': 'bharatviswa',
        # some contributors use username in git
        "bibinchundatt": "bibinchundatt",
        "cnauroth": 'cnauroth',
        #"yufei": "yufei",
        "sidharta s": "sidharta-s",
        "tgraves": "tgraves",
        "drankye": "drankye",
        "yliu": "hitliuyi",
        'arp': 'arpitagarwal',
        'rohithsharmaks':'rohithsharma',
        'mattf': 'mattf',
        'bharat': 'bharatviswa',
        'yufei': 'yufeigu',
        # adjust for conversion of special characters to lower case
        "vinod kumar vavilapalli (i am also known as @tshooter.)".lower(): "vinodkv"
    }

    # Invert the mapping: every JIRA username collects the git names pointing to it.
    jira_usernames = {}
    for git_key, jira_user in git_name.items():
        if jira_user in jira_usernames:
            jira_usernames[jira_user] = jira_usernames[jira_user] + ', ' + git_key
        else:
            jira_usernames[jira_user] = git_key

    # wrong association
    wrong_associations = [
        'rajuvishnu', 'sohu0011', 'jsaraiya', 'azuriel', 'Fan04290', 'mgiri935',
        'hom', 'Xiangyi', 'abalitsky', 'lewuathe', 'rakesh_techie',
    ]
    jira_usernames.update(dict.fromkeys(wrong_associations, ''))

    return git_name, jira_usernames

In [11]:
# Final procedure for generating the list of contributors
def final_processing_info_contributors():
    """Apply the manual corrections to the generated contributor list.

    Reads 'list_of_contributors.csv' from PATH_GENERATED_FILES, adjusts the
    'name_git' column using the two dictionaries returned by
    get_manual_correction(), drops the records collected in drop_index, and
    writes the result to 'final_list_of_contributors.csv'.
    """
    registers = pandas.read_csv(PATH_GENERATED_FILES + "list_of_contributors.csv",
                           index_col=None, header=0,
                           delimiter=';')

    git_names_correction, jira_usernames_correction = get_manual_correction()

    # Index labels of the records to remove once the loop is finished.
    drop_index = []

    for index, row in registers.iterrows():
        # The git name is one of the manually mapped names: a record without a JIRA
        # username is scheduled for removal, otherwise its name_git is blanked.
        if row.name_git in git_names_correction.keys():
            if pandas.isnull(row.username_jira):
                drop_index.append(index)
                print('droping: ' + row.name_git)
            else:
                registers.at[index, 'name_git'] = ''

        # The JIRA username has a manual correction: store it in name_git, either on
        # its own (no git name on the row) or appended to the row's git name.
        # NOTE(review): `row.name_git` is read from the iterrows row, not from `registers`;
        # if that row is a copy it still holds the value from before the blanking above,
        # so the original git name is written back here — confirm this is intended.
        if row.username_jira in jira_usernames_correction.keys():
            if pandas.isnull(row.name_git):
                registers.at[index, 'name_git'] = str(jira_usernames_correction[row.username_jira])
            else:
                # NOTE(review): for the "wrong association" entries the correction is '', which
                # makes this "<git name>, " rather than an empty name_git — confirm intended.
                registers.at[index, 'name_git'] = '{0}, {1}'.format(row.name_git,
                                                                    jira_usernames_correction[row.username_jira])

            if jira_usernames_correction[row.username_jira] != '': # A correct association
                print('replacing "{0}" for "{1}"'.format(row.name_git, jira_usernames_correction[row.username_jira]))

    registers = registers.drop(drop_index)

    # Missing values are written as empty fields.
    registers.to_csv(PATH_GENERATED_FILES + 'final_list_of_contributors.csv', sep=';', encoding='utf-8',
                     index=False, na_rep='')

In [12]:
# retrieves unique user id based on jira username
def get_id_by_username_jira(registers, username):
    """Return the contributor id whose JIRA usernames include ``username``.

    Args:
        registers: DataFrame with 'id' and 'username_jira' columns; one cell may
            hold several usernames separated by ' | '.
        username: JIRA username to look up (compared stripped and lower-cased).

    Returns:
        The matching id, or '-1' when the username is missing or not mapped
        (an unmapped username is also reported on stdout).
    """
    if pandas.isnull(username):
        return '-1'

    wanted = username.strip().lower()

    # Cheap substring pre-filter; the exact comparison happens in the loop below.
    candidate_mask = registers.username_jira.str.contains(str(wanted), na=False, case=False, regex=False)

    for _, candidate in registers.loc[candidate_mask].iterrows():
        # Splitting a cell without ' | ' yields the single username itself.
        aliases = [alias.lower() for alias in candidate.username_jira.split(' | ')]
        if wanted in aliases:
            return candidate.id

    print('Não mapeado: {0}'.format(wanted))
    return '-1'

In [13]:
# retrieves unique user id based on git name
def get_id_by_name_git(registers, name):
    """Return the contributor id whose git names include ``name``.

    Args:
        registers: DataFrame with 'id' and 'name_git' columns; one cell may hold
            several names separated by ', '.
        name: git author/committer name to look up (compared stripped and
            lower-cased).

    Returns:
        The matching id, or '-1' when the name is missing or not mapped
        (an unmapped name is also reported on stdout).
    """
    if pandas.isnull(name):
        return '-1'

    target = name.strip().lower()

    # Cheap substring pre-filter; the exact comparison happens in the loop below.
    prefiltered = registers.loc[
        registers.name_git.str.contains(str(target), na=False, case=False, regex=False)
    ]

    for _, record in prefiltered.iterrows():
        # Splitting a cell without ', ' yields the single name itself.
        if any(target == alias.strip().lower() for alias in record.name_git.split(', ')):
            return record.id

    print('Não mapeado: {0}'.format(target))
    return '-1'


In [14]:
# Creates new datasets based on the original, but using unique ids to identify contributors
def _replace_columns_by_id(dataset, columns, resolver, registers):
    """Replace, in place, the contributor values of ``columns`` by their unique ids.

    ``resolver`` is called as resolver(registers, value) once per distinct
    value; the result is memoized so repeated contributors do not trigger a new
    scan of ``registers``. Missing values map to '-1'.
    """
    cache = {}

    def resolve(value):
        if pandas.isnull(value):
            return '-1'
        if value not in cache:
            cache[value] = resolver(registers, value)
        return cache[value]

    for column in columns:
        # Assigning the whole column avoids writing strings cell by cell into a
        # column that may not have the object dtype (e.g. an all-missing column).
        dataset[column] = dataset[column].map(resolve)


def replace_username_or_name_by_id():
    """Create copies of the original datasets that identify contributors by unique id.

    Reads 'final_list_of_contributors.csv', replaces the JIRA usernames
    (snapshot: Reporter/Assignee; comment-log and changelog: Author) and the git
    names (commit-log: Author/Committer) by the contributor id, and writes the
    four 'new_*_file.csv' files under PATH_GENERATED_FILES. Values that cannot
    be mapped become '-1'.

    Each distinct username/name is resolved once per dataset, so an unmapped
    value is reported once per dataset rather than once per row.
    """
    registers = pandas.read_csv(PATH_GENERATED_FILES + "final_list_of_contributors.csv",
                                index_col=None, header=0,
                                delimiter=';')

    # (dataset type, contributor columns, lookup function, output file name)
    jobs = [
        ("snapshot", ['Reporter', 'Assignee'], get_id_by_username_jira, 'new_snapshot_file.csv'),
        ("comment-log", ['Author'], get_id_by_username_jira, 'new_comment_file.csv'),
        ("changelog", ['Author'], get_id_by_username_jira, 'new_changelog_file.csv'),
        ("commit-log", ['Author', 'Committer'], get_id_by_name_git, 'new_commit_file.csv'),
    ]

    # Load every input before writing anything, so a missing input file fails early.
    datasets = [get_info_project_in_dataset(dataset_type) for dataset_type, _, _, _ in jobs]

    for dataset, (_, columns, resolver, output_name) in zip(datasets, jobs):
        _replace_columns_by_id(dataset, columns, resolver, registers)
        dataset.to_csv(PATH_GENERATED_FILES + output_name, sep=';', encoding='utf-8',
                       index=False, na_rep='')

------------------------------------------------------- Main Flow ----------------------------------------------------


In [15]:
def data_preparation():
    """Run the whole contributor-unification pipeline, printing a banner before each step."""
    # 1 - retrieves the names of git (commit dataset) and jira (commentlog, changelog and snapshot)
    print("**** Step 1 ****")
    _, git_names = do_mine_contributors_names()

    # If step 1 has already been executed once, the two lists can instead be loaded from the
    # files it generated, which reduces processing time:
    #
    #   jira_team_dataset = pandas.read_csv(PATH_SOCIAL_DATASET_FILES + "jira-community-contributors-names.csv",
    #                                       index_col=None, header=0, delimiter=';')
    #   git_team_dataset = pandas.read_csv(PATH_SOCIAL_DATASET_FILES + "git-community-contributors-names.csv",
    #                                      index_col=None, header=0, delimiter=';')

    # 2 - Handles JIRA data
    print("**** Step 2 ****")
    # Previously, Apache provided the Jira API to download information from its users. However,
    # this API is currently no longer available. The records of the file used were obtained when
    # it was still possible to retrieve this information through the API.
    group_by_name_usernames_of_jira()

    # 3 - Handles Git data
    print("**** Step 3 ****")
    official_members = union_pmc_and_committers_members_file()
    union_hadoop_official_members_with_committers_of_dataset(git_names, official_members)

    print("**** Step 4 ****")
    # Step made MANUALLY -> manual verification of the previously generated list, aiming to unify
    # the records related to the same contributor. Of those that were not associated with existing
    # records, only one username could be found on the Internet (Dongming Liang).
    # The file generated was 4-git_team_after_manual_map.csv

    # 5 - Unify the listing of JIRA and Git records to generate unique IDs
    print("**** Step 5 ****")
    join_contributors_info()

    # 6 - Manual review required to verify the mappings performed.
    print("**** Step 6 ****")
    final_processing_info_contributors()

    # 7 - Generation of the new dataset using unique ids
    print("**** Step 7 ****")
    replace_username_or_name_by_id()

In [16]:
# Run the full pipeline; each step prints a "**** Step N ****" banner before it starts.
data_preparation()

**** Step 1 ****


**** Step 2 ****
**** Step 3 ****
**** Step 4 ****
**** Step 5 ****
**** Step 6 ****
replacing "Scott Chun-Yang Chen" for "Scott Chun-Yang Chen"
replacing "Chris Nauroth, cnauroth" for "cnauroth"
replacing "Vidura Mudalige" for "Vidura Mudalige"
replacing "Márton Elek" for "Márton Elek"
replacing "Owen O'Malley" for "Owen O'Malley"
replacing "Akira Ajisaka" for "Akira Ajisaka"
replacing "Devaraj K, Devarajulu K" for "Devaraj K, Devarajulu K"
replacing "Bibin A Chundatt, bibinchundatt" for "Bibin A Chundatt, bibinchundatt"
replacing "Bharat Viswanadham, bharat" for "Bharat Viswanadham, bharat"
replacing "Arun C Murthy, Arun C. Murthy, Arun Murthy" for "Arun C Murthy, Arun C. Murthy, Arun Murthy"
replacing "He Xiaoqiao" for "He Xiaoqiao"
replacing "Vinod Kumar Vavilapalli, Vinod Kumar Vavilapalli (I am also known as @tshooter.), vinod kumar vavilapalli (i am also known as @tshooter.)" for "vinod kumar vavilapalli (i am also known as @tshooter.)"
replacing "Billie Rinaldi" for "Billie Rin