# New Contributor Analysis

In [1]:
import datetime
import os
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
import sqlalchemy as salc
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Schema placed on the connection's search_path (searched left-to-right).
dbschema = 'augur_data'

# The connection URL (which carries the database password) is read from the
# environment instead of being committed with the notebook. Example:
#   export AUGUR_DB_URL='postgresql+psycopg2://USER:PASSWORD@HOST:PORT/DBNAME'
# Use the 'postgresql+psycopg2' dialect name; the short 'postgres' alias is
# not accepted by newer SQLAlchemy releases.
db_url = os.environ.get('AUGUR_DB_URL')
if not db_url:
    raise RuntimeError(
        "Set the AUGUR_DB_URL environment variable to a SQLAlchemy URL such as "
        "'postgresql+psycopg2://USER:PASSWORD@HOST:PORT/DBNAME' before running this notebook."
    )

engine = salc.create_engine(
    db_url,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})
save_files = False

## Repo Filter

In [2]:
# Repository ids that the queries below are run against, one query per id.
repo_set = {
    25502,
    25605,
    25635,
    25563,
    25583,
    25580,
    24607,
    25449,
    25558,
    25561,
    25560,
    25573,
    25632,
    25577,
    25625,
    25503,
}


In [3]:
# Per-repository query: unions seven kinds of contributor actions, ranks each
# contributor's actions by timestamp and keeps only the first and second one.
# The statement is built once and the repository id is supplied as a bind
# parameter (:repo_id) on every execution.
#
# NOTE(review): the 'issue_comment' branch selects issues.reporter_id as the
# contributor id (not message.cntrb_id), so comments are attributed to the
# issue's reporter -- confirm that this is intended.
pr_query = salc.sql.text("""
    SELECT * FROM (
        SELECT ID AS
            cntrb_id,
            A.created_at AS created_at,
            date_part('month', A.created_at::DATE) AS month,
            date_part('year', A.created_at::DATE) AS year,
            A.repo_id,
            repo_name,
        ACTION,
        rank() OVER (
                PARTITION BY id
                ORDER BY A.created_at ASC
            )
        FROM
            (
                (
                SELECT
                    canonical_id AS ID,
                    created_at AS created_at,
                    repo_id,
                    'issue_opened' AS ACTION 
                FROM
                    augur_data.issues
                    LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issues.reporter_id
                    LEFT OUTER JOIN ( SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, cntrb_canonical AS canonical_email, data_collection_date, cntrb_id AS canonical_id FROM augur_data.contributors WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical ) canonical_full_names ON canonical_full_names.canonical_email = contributors.cntrb_canonical 
                WHERE
                    repo_id = :repo_id
                    AND pull_request IS NULL 
                GROUP BY
                    canonical_id,
                    repo_id,
                    issues.created_at 
                ) UNION ALL
                (
                SELECT
                    canonical_id AS ID,
                    TO_TIMESTAMP( cmt_author_date, 'YYYY-MM-DD' ) AS created_at,
                    repo_id,
                    'commit' AS ACTION 
                FROM
                    augur_data.commits
                    LEFT OUTER JOIN augur_data.contributors ON cntrb_email = cmt_author_email
                    LEFT OUTER JOIN ( SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, cntrb_canonical AS canonical_email, data_collection_date, cntrb_id AS canonical_id FROM augur_data.contributors WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical ) canonical_full_names ON canonical_full_names.canonical_email = contributors.cntrb_canonical 
                WHERE
                    repo_id = :repo_id 
                GROUP BY
                    repo_id,
                    canonical_email,
                    canonical_id,
                    commits.cmt_author_date 
                ) UNION ALL
                (
                SELECT
                    message.cntrb_id AS ID,
                    created_at AS created_at,
                    commits.repo_id,
                    'commit_comment' AS ACTION 
                FROM
                    augur_data.commit_comment_ref,
                    augur_data.commits,
                    augur_data.message
                    LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id
                    LEFT OUTER JOIN ( SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, cntrb_canonical AS canonical_email, data_collection_date, cntrb_id AS canonical_id FROM augur_data.contributors WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical ) canonical_full_names ON canonical_full_names.canonical_email = contributors.cntrb_canonical 
                WHERE
                    commits.cmt_id = commit_comment_ref.cmt_id 
                    AND commits.repo_id = :repo_id 
                    AND commit_comment_ref.msg_id = message.msg_id 
                GROUP BY
                    ID,
                    commits.repo_id,
                    commit_comment_ref.created_at 
                ) UNION ALL
                (
                SELECT
                    issue_events.cntrb_id AS ID,
                    issue_events.created_at AS created_at,
                    repo_id,
                    'issue_closed' AS ACTION 
                FROM
                    augur_data.issues,
                    augur_data.issue_events
                    LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id
                    LEFT OUTER JOIN ( SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, cntrb_canonical AS canonical_email, data_collection_date, cntrb_id AS canonical_id FROM augur_data.contributors WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical ) canonical_full_names ON canonical_full_names.canonical_email = contributors.cntrb_canonical 
                WHERE
                    issues.repo_id = :repo_id 
                    AND issues.issue_id = issue_events.issue_id 
                    AND issues.pull_request IS NULL 
                    AND issue_events.cntrb_id IS NOT NULL 
                    AND ACTION = 'closed' 
                GROUP BY
                    issue_events.cntrb_id,
                    repo_id,
                    issue_events.created_at 
                ) UNION ALL
                (
                SELECT
                    pr_augur_contributor_id AS ID,
                    pr_created_at AS created_at,
                    repo_id,
                    'open_pull_request' AS ACTION 
                FROM
                    augur_data.pull_requests
                    LEFT OUTER JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id
                    LEFT OUTER JOIN ( SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, cntrb_canonical AS canonical_email, data_collection_date, cntrb_id AS canonical_id FROM augur_data.contributors WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical ) canonical_full_names ON canonical_full_names.canonical_email = contributors.cntrb_canonical 
                WHERE
                    pull_requests.repo_id = :repo_id 
                GROUP BY
                    pull_requests.pr_augur_contributor_id,
                    pull_requests.repo_id,
                    pull_requests.pr_created_at 
                ) UNION ALL
                (
                SELECT
                    message.cntrb_id AS ID,
                    msg_timestamp AS created_at,
                    repo_id,
                    'pull_request_comment' AS ACTION 
                FROM
                    augur_data.pull_requests,
                    augur_data.pull_request_message_ref,
                    augur_data.message
                    LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id
                    LEFT OUTER JOIN ( SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, cntrb_canonical AS canonical_email, data_collection_date, cntrb_id AS canonical_id FROM augur_data.contributors WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical ) canonical_full_names ON canonical_full_names.canonical_email = contributors.cntrb_canonical 
                WHERE
                    pull_requests.repo_id = :repo_id
                    AND pull_request_message_ref.pull_request_id = pull_requests.pull_request_id 
                    AND pull_request_message_ref.msg_id = message.msg_id 
                GROUP BY
                    message.cntrb_id,
                    pull_requests.repo_id,
                    message.msg_timestamp 
                ) UNION ALL
                (
                SELECT
                    issues.reporter_id AS ID,
                    msg_timestamp AS created_at,
                    repo_id,
                    'issue_comment' AS ACTION 
                FROM
                    issues,
                    issue_message_ref,
                    message
                    LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id
                    LEFT OUTER JOIN ( SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, cntrb_canonical AS canonical_email, data_collection_date, cntrb_id AS canonical_id FROM augur_data.contributors WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical ) canonical_full_names ON canonical_full_names.canonical_email = contributors.cntrb_canonical 
                WHERE
                    issues.repo_id = :repo_id
                    AND issue_message_ref.msg_id = message.msg_id 
                    AND issues.issue_id = issue_message_ref.issue_id 
                GROUP BY
                    issues.reporter_id,
                    issues.repo_id,
                    message.msg_timestamp 
                ) 
            ) A,
            repo 
        WHERE
        ID IS NOT NULL 
            AND A.repo_id = repo.repo_id 
        GROUP BY
            A.ID,
            A.repo_id,
            A.ACTION,
        A.created_at,
        repo.repo_name
        ) b
        WHERE RANK IN (1,2)
""")

# Run the query once per repository and combine the results in a single
# concat. ignore_index gives the combined frame unique row labels instead of
# repeating 0..n for every repository.
repo_frames = [
    pd.read_sql(pr_query, con=engine, params={'repo_id': repo_id})
    for repo_id in repo_set
]
df = pd.concat(repo_frames, ignore_index=True) if repo_frames else pd.DataFrame()

# One row per calendar month from January 2015 up to the current time.
current_time = datetime.datetime.now()

months_query = salc.sql.text(f"""        
  SELECT
            *
        FROM
        (
        SELECT
            date_part( 'year', created_month :: DATE ) AS year,
            date_part( 'month', created_month :: DATE ) AS MONTH
        FROM
            (SELECT * FROM ( SELECT created_month :: DATE FROM generate_series (TIMESTAMP '2015-01-01', TIMESTAMP '{current_time}', INTERVAL '1 month' ) created_month ) d ) x 
        ) y
""")
months_df = pd.read_sql(months_query, con=engine)

display(months_df)
display(df)

Unnamed: 0,year,month
0,2015.0,1.0
1,2015.0,2.0
2,2015.0,3.0
3,2015.0,4.0
4,2015.0,5.0
...,...,...
61,2020.0,2.0
62,2020.0,3.0
63,2020.0,4.0
64,2020.0,5.0


Unnamed: 0,cntrb_id,created_at,month,year,repo_id,repo_name,action,rank
0,280116,2020-01-15 00:00:00+00:00,1.0,2020.0,25632,jenkins-x-statistics,commit,1
1,280116,2020-01-16 00:00:00+00:00,1.0,2020.0,25632,jenkins-x-statistics,commit,2
0,277109,2020-02-01 00:00:00+00:00,2.0,2020.0,25635,jenkins-x-boot-helmfile-config,commit,1
1,277109,2020-02-05 00:00:00+00:00,2.0,2020.0,25635,jenkins-x-boot-helmfile-config,commit,2
2,277116,2020-01-31 00:00:00+00:00,1.0,2020.0,25635,jenkins-x-boot-helmfile-config,commit,1
...,...,...,...,...,...,...,...,...
213,281127,2018-05-11 00:00:00+00:00,5.0,2018.0,25502,jenkins-x-platform,commit,1
214,281128,2018-01-03 00:00:00+00:00,1.0,2018.0,25502,jenkins-x-platform,commit,1
215,281129,2019-12-02 00:00:00+00:00,12.0,2019.0,25502,jenkins-x-platform,commit,1
216,281153,2020-04-24 09:41:36+00:00,4.0,2020.0,25502,jenkins-x-platform,issue_comment,1


In [4]:
# Normalise month/year to integer-valued strings on both frames
# (the query returns them as floats such as 1.0 / 2020.0).
df[['month', 'year']] = df[['month', 'year']].astype(int).astype(str)
months_df[['year','month']] = months_df[['year','month']].astype(float).astype(int).astype(str)

# Derive a 'yearmonth' timestamp column from the "month/year" text.
for frame in (df, months_df):
    frame['yearmonth'] = pd.to_datetime(frame['month'] + '/' + frame['year'])

# Each row counts as one contribution when summed per period.
df['new_contributors'] = 1

months_df

Unnamed: 0,year,month,yearmonth
0,2015,1,2015-01-01
1,2015,2,2015-02-01
2,2015,3,2015-03-01
3,2015,4,2015-04-01
4,2015,5,2015-05-01
...,...,...,...
61,2020,2,2020-02-01
62,2020,3,2020-03-01
63,2020,4,2020-04-01
64,2020,5,2020-05-01


In [5]:
# Date window for the analysis; the chart functions below also read
# begin_date / end_date as globals.
begin_date = '2018-01-01'
end_date = '2020-05-30'

# Keep only the months inside the window and return to a plain 0..n index.
months_df = (
    months_df
    .set_index('yearmonth', drop=False)
    .loc[begin_date:end_date]
    .reset_index(drop=True)
)

# Start Visualization Methods

In [6]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import Label, LabelSet, ColumnDataSource, Legend, TableColumn, DateFormatter, DataTable
from bokeh.palettes import mpl, magma, viridis, Colorblind
from bokeh.transform import dodge

def vertical_bar_chart(input_df, months_df,repo_name='', group_by='month', contributor_type = 'All', y_max=None, y_axis='new_contributors', title="{}: {} {} Time Contributors Per {}", save_file=False, rank = 1):
    """Show four bar charts of contributor counts per period.

    One chart is drawn for each of these combinations, in this order:
    first contributions of all / drive-by / repeat contributors, then second
    contributions of all contributors. The module-level ``begin_date`` and
    ``end_date`` values bound the time window.

    Parameters
    ----------
    input_df : DataFrame with at least the columns 'repo_name', 'cntrb_id',
        'rank', 'yearmonth', 'year' and the ``y_axis`` column.
    months_df : DataFrame with 'year' and 'yearmonth' columns; it is appended
        to the data so that periods without contributions still get a bar.
    repo_name : repository to restrict the data to; when empty, all rows are
        used and the title shows "All repos".
    group_by : 'month' or 'year'.
    contributor_type, rank : accepted for backward compatibility; the function
        always iterates over every combination listed above.
    y_max : optional upper bound of the y axis; by default 1.15 times the
        tallest bar.
    y_axis : name of the column that is summed per period.
    title : format string receiving (repo label, contributor type, ordinal,
        period name).
    save_file : accepted for backward compatibility; not used.

    Raises
    ------
    ValueError : if ``group_by`` is neither 'month' nor 'year'.
    """
    if group_by not in ('year', 'month'):
        raise ValueError("group_by must be 'year' or 'month', got {!r}".format(group_by))

    # (rank, contributor type) pairs in plotting order.
    combinations = [(1, 'All'), (1, 'drive_by'), (1, 'repeat'), (2, 'All')]
    type_labels = {'All': 'All', 'drive_by': 'Drive By', 'repeat': 'repeat'}
    rank_labels = {1: 'First', 2: 'Second'}
    captions = {
        (1, 'All'): 'This graph shows all the first time contributors, whether they contribute once, or contribute multiple times. New contributors are individuals who make their first contribution in the specified time period.',
        (1, 'drive_by'): 'This graph shows drive by contributors in the specified time period. Drive by contributors are contributors who make one, and only one contribution to the repository, and that contribution is in the specified time period. New contributors are individuals who make their first contribution in the specified time period. Of course, then, “All drive-by’s are by definition first time contributors”. However, not all first time contributors are drive-by’s.',
        (1, 'repeat'): 'This graph shows repeat contributors in the specified time period. Repeat contributors are contributors who have made more than one contribution to the repository and their first contribution is in the specified time period. New contributors are individuals who make their first contribution in the specified time period.',
        (2, 'All'): 'This graph shows the second contribution of all contributors in the specified time period. These contributors are also repeat contributors, since it is their second contribution. Repeat contributors are contributors who have made more than one contribution to the repository and their first contribution is in the specified time period.',
    }

    output_notebook()

    base_df = input_df.copy()
    if repo_name:
        base_df = base_df.loc[base_df['repo_name'] == repo_name]
    # Keep the display label separate from the filter value: overwriting
    # repo_name here would make later charts filter on "All repos".
    repo_label = repo_name if repo_name else "All repos"

    # Drop every contributor who already has a row before begin_date.
    earlier_ids = base_df.loc[base_df['yearmonth'] < begin_date, 'cntrb_id']
    base_df = base_df[~base_df['cntrb_id'].isin(earlier_ids)]

    # Contributors that have a second contribution.
    repeat_ids = base_df.loc[base_df['rank'] == 2, 'cntrb_id']

    group_column = 'yearmonth' if group_by == 'month' else group_by
    group_by_format_string = "Month" if group_by == 'month' else "Year"

    for current_rank, current_type in combinations:
        if current_type == 'drive_by':
            driver_df = base_df[~base_df['cntrb_id'].isin(repeat_ids)]
        elif current_type == 'repeat':
            driver_df = base_df[base_df['cntrb_id'].isin(repeat_ids)]
        else:
            driver_df = base_df

        in_window = driver_df['yearmonth'] < end_date
        driver_df = driver_df.loc[in_window & (driver_df['rank'] == current_rank)]

        # Append all months so periods without contributions get a zero bar.
        driver_df = pd.concat([driver_df, months_df])

        # Sum only the counted column; labels and counts both come from this
        # one grouped result so they cannot get out of step.
        counts = driver_df.groupby(group_column)[y_axis].sum()
        if group_by == 'month':
            dates = list(counts.index.strftime('%Y-%m'))
        else:
            dates = [str(value) for value in counts.index]
        count_values = counts.tolist()

        plot_width = 46 * len(count_values) if len(count_values) >= 15 else 670

        if y_max is not None:
            y_upper = y_max
        else:
            peak = max(count_values) if count_values else 0
            # Keep a non-degenerate axis when every bar is zero.
            y_upper = max(peak, 1) * 1.15

        source = ColumnDataSource(data=dict(dates=dates, new_contributor_counts=count_values))

        p = figure(x_range=dates, plot_height=400, plot_width = plot_width,
                   title=title.format(repo_label, type_labels[current_type].capitalize(), rank_labels[current_rank], group_by_format_string),
                   toolbar_location=None, y_range=(0, y_upper), margin = (0, 0, 200, 0))

        p.vbar(x='dates', top='new_contributor_counts', width=0.8, source=source)

        # Print the count above each bar.
        labels = LabelSet(x='dates', y='new_contributor_counts', text='new_contributor_counts', y_offset=4,
                  text_font_size="13pt", text_color="black",
                  source=source, text_align='center')
        p.add_layout(labels)

        caption_label = Label(x=-10, y=-120, x_units='screen', y_units='screen',
                text='{}'.format(captions[(current_rank, current_type)]), render_mode='css',
                background_fill_color='white', text_font_size = '15pt')
        p.add_layout(caption_label)

        p.xgrid.grid_line_color = None
        p.y_range.start = 0
        p.axis.minor_tick_line_color = None
        p.outline_line_color = None

        p.title.align = "center"
        p.title.text_font_size = "18px"

        p.yaxis.axis_label = 'Second Time Contributors' if current_rank == 2 else 'New Contributors'
        p.xaxis.axis_label = group_by_format_string

        p.xaxis.axis_label_text_font_size = "18px"
        p.yaxis.axis_label_text_font_size = "16px"

        p.xaxis.major_label_text_font_size = "16px"
        p.xaxis.major_label_orientation = 45.0

        p.yaxis.major_label_text_font_size = "16px"

        show(p)

In [7]:
# Monthly bar charts restricted to one repository.
vertical_bar_chart(
    df,
    months_df,
    repo_name='jenkins-x-platform',
    group_by='month',
)

In [8]:
def vertical_stacked_bar_chart(input_df, months_df, contributor_type = 'All',repo_name=None, group_by='month', y_axis='new_contributors', title = "{}: {} {} Time Contributors Per {}", save_file=False, rank = 1):
    """Show four stacked bar charts of contributor counts per period, split by action.

    One chart is drawn for each of these combinations, in this order:
    first contributions of all / drive-by / repeat contributors, then second
    contributions of all contributors. The module-level ``begin_date`` and
    ``end_date`` values bound the time window.

    Parameters
    ----------
    input_df : DataFrame with at least the columns 'repo_name', 'cntrb_id',
        'rank', 'action', 'yearmonth', 'year' and the ``y_axis`` column.
    months_df : DataFrame with 'year' and 'yearmonth' columns; it is appended
        to the data so that periods without contributions still appear.
    contributor_type, rank : accepted for backward compatibility; the function
        always iterates over every combination listed above.
    repo_name : repository to restrict the data to; if no repo_name is
        specified, data for all repos is shown and the title says "All repos".
    group_by : 'month' to group by month, 'year' to group by year.
    y_axis : name of the column that is summed per period.
    title : format string receiving (repo label, contributor type, ordinal,
        period name).
    save_file : accepted for backward compatibility; not used.

    Raises
    ------
    ValueError : if ``group_by`` is neither 'month' nor 'year'.
    """
    if group_by not in ('year', 'month'):
        raise ValueError("group_by must be 'year' or 'month', got {!r}".format(group_by))

    # (rank, contributor type) pairs in plotting order.
    combinations = [(1, 'All'), (1, 'drive_by'), (1, 'repeat'), (2, 'All')]
    type_labels = {'All': 'All', 'drive_by': 'Drive By', 'repeat': 'repeat'}
    rank_labels = {1: 'First', 2: 'Second'}
    captions = {
        (1, 'All'): 'This graph shows all first time contributors in the specified time period, and indicates the kind of contribution they made. New contributors are individuals who make their first contribution in the specified time period.',
        (1, 'drive_by'): 'This graph shows drive by contributors in the specified time period, and indicates the kind of contribution they made. Drive by contributors are contributors who make one, and only one contribution to the repository, and that contribution is in the specified time period. New contributors are individuals who make their first contribution in the specified time period. Of course, then, “All drive-by’s are by definition first time contributors”. However, not all first time contributors are drive-by’s.',
        (1, 'repeat'): 'This graph shows repeat contributors in the specified time period, and indicates the kind of contribution they made. Repeat contributors are contributors who have made more than one contribution to the repository and their first contribution is in the specified time period. New contributors are individuals who make their first contribution in the specified time period.',
        (2, 'All'): 'This graph shows the second contribution of all contributors in the specified time period, and indicates the kind of contribution they made. This graph shows the second contribution of all contributors in the specified time period. These contributors are also repeat contributors, since it is their second contribution. Repeat contributors are contributors who have made more than one contribution to the repository and their first contribution is in the specified time period. Second time contributors are individuals who make their first and second contribution in the specified time period.',
    }

    # Stack segment label -> value of the 'action' column, in stacking order.
    # NOTE(review): rows with any other action value are included in the
    # total printed above each bar but get no stack segment -- confirm.
    action_columns = [
        ("Issue Opened", 'issue_opened'),
        ("Issue Closed", 'issue_closed'),
        ("Issue Comment", 'issue_comment'),
        ("Commit", 'commit'),
        ("Pull Request Opened", 'open_pull_request'),
        ("Pull Request Comment", 'pull_request_comment'),
    ]
    actions = [label for label, _ in action_columns]
    colors = Colorblind[len(actions)]

    output_notebook()

    base_df = input_df.copy()
    if repo_name:
        base_df = base_df.loc[base_df['repo_name'] == repo_name]
    # Keep the display label separate from the filter value: overwriting
    # repo_name here would make later charts filter on "All repos".
    repo_label = repo_name if repo_name else "All repos"

    # Drop every contributor who already has a row before begin_date.
    earlier_ids = base_df.loc[base_df['yearmonth'] < begin_date, 'cntrb_id']
    base_df = base_df[~base_df['cntrb_id'].isin(earlier_ids)]

    # Contributors that have a second contribution.
    repeat_ids = base_df.loc[base_df['rank'] == 2, 'cntrb_id']

    group_column = 'yearmonth' if group_by == 'month' else group_by
    group_by_format_string = "Month" if group_by == 'month' else "Year"

    for current_rank, current_type in combinations:
        if current_type == 'drive_by':
            driver_df = base_df[~base_df['cntrb_id'].isin(repeat_ids)]
        elif current_type == 'repeat':
            driver_df = base_df[base_df['cntrb_id'].isin(repeat_ids)]
        else:
            driver_df = base_df

        in_window = driver_df['yearmonth'] < end_date
        driver_df = driver_df.loc[in_window & (driver_df['rank'] == current_rank)]

        # Append all months so periods without contributions still appear.
        driver_df = pd.concat([driver_df, months_df])

        # Totals per period; their index defines the x axis.
        totals = driver_df.groupby(group_column)[y_axis].sum()
        if group_by == 'month':
            dates = list(totals.index.strftime('%Y-%m'))
        else:
            dates = [str(value) for value in totals.index]

        data_source = {'dates': dates}
        for label, action in action_columns:
            per_action = driver_df.loc[driver_df['action'] == action].groupby(group_column)[y_axis].sum()
            # Align on the period key so each count lands under its own date.
            data_source[label] = per_action.reindex(totals.index, fill_value=0).tolist()
        data_source['New Contributor Counts'] = totals.tolist()

        if len(totals) >= 15:
            plot_width = 46 * len(totals) + 200
        else:
            plot_width = 870

        peak = max(data_source['New Contributor Counts']) if len(totals) else 0
        # Keep a non-degenerate axis when every bar is zero.
        y_upper = max(peak, 1) * 1.15

        source = ColumnDataSource(data=data_source)

        p = figure(x_range=dates, plot_height=400, plot_width = plot_width,
                   title=title.format(repo_label, type_labels[current_type].capitalize(), rank_labels[current_rank], group_by_format_string),
                   toolbar_location=None, y_range=(0, y_upper), margin = (0, 0, 250, 0))

        renderers = p.vbar_stack(actions, x='dates', width=0.8, color=colors, source=source)

        # Print the total above each stacked bar.
        labels = LabelSet(x='dates', y='New Contributor Counts', text='New Contributor Counts', y_offset=4, text_font_size="14pt",
                          text_color="black", source=source, text_align='center')
        p.add_layout(labels)

        legend = Legend(items=[(action_label, [renderer]) for (action_label, renderer) in zip(actions, renderers)], location=(0, 120), label_text_font_size = "16px")
        p.add_layout(legend, 'right')

        caption_label = Label(x=-10, y=-120, x_units='screen', y_units='screen',
                     text='{}'.format(captions[(current_rank, current_type)]), render_mode='css',
                     background_fill_color='white', text_font_size = '15pt')
        p.add_layout(caption_label)

        p.xgrid.grid_line_color = None
        p.y_range.start = 0
        p.axis.minor_tick_line_color = None
        p.outline_line_color = None

        p.title.align = "center"
        p.title.text_font_size = "18px"

        p.yaxis.axis_label = 'Second Time Contributors' if current_rank == 2 else 'New Contributors'
        p.xaxis.axis_label = group_by_format_string

        p.xaxis.axis_label_text_font_size = "18px"
        p.yaxis.axis_label_text_font_size = "16px"

        p.xaxis.major_label_text_font_size = "16px"
        p.xaxis.major_label_orientation = 45.0

        p.yaxis.major_label_text_font_size = "16px"
        show(p)

In [9]:
# Monthly stacked charts (split by action type) for one repository.
vertical_stacked_bar_chart(
    df,
    months_df,
    group_by='month',
    repo_name='jenkins-x-platform',
)

In [10]:
from math import pi
from bokeh.transform import cumsum

def pie_chart(input_df, repo_name=None):
    """Show a pie chart splitting new contributors into drive-by and repeat.

    Contributors with any row before the module-level ``begin_date`` are
    removed, and only rows before ``end_date`` are counted. "Repeat" is the
    number of remaining rows with rank 2; "Drive_By" is the number of rows
    with rank 1 minus that.

    Parameters
    ----------
    input_df : DataFrame with at least the columns 'repo_name', 'cntrb_id',
        'rank', 'yearmonth' and 'new_contributors'.
    repo_name : repository to restrict the data to; when omitted, all rows
        are used and the title shows "All repos".
    """
    output_notebook()

    driver_df = input_df.copy()

    if repo_name:
        driver_df = driver_df.loc[driver_df['repo_name'] == repo_name]
    # Same fallback label as the bar charts, instead of printing "None".
    repo_label = repo_name if repo_name else "All repos"

    # Drop every contributor who already has a row before begin_date.
    earlier_ids = driver_df.loc[driver_df['yearmonth'] < begin_date, 'cntrb_id']
    driver_df = driver_df[~driver_df['cntrb_id'].isin(earlier_ids)]

    driver_df = driver_df.loc[driver_df['yearmonth'] < end_date]

    first_time_contributors = driver_df.loc[driver_df['rank'] == 1, 'new_contributors'].count()
    second_time_contributors = driver_df.loc[driver_df['rank'] == 2, 'new_contributors'].count()

    data = pd.DataFrame({
        'description': ['Drive_By', 'Repeat'],
        'value': [first_time_contributors - second_time_contributors, second_time_contributors],
    })

    # Guard against an empty selection: with no contributors both shares are 0.
    total = data['value'].sum()
    share = data['value'] / total if total else data['value'] * 0.0
    data['angle'] = share * 2*pi
    data['color'] = ('#0072B2', '#E69F00')
    data['percentage'] = ((data['angle']/(2*pi))*100).round(2)

    source = ColumnDataSource(data)
    p = figure(plot_height=350, plot_width = 650, title=" {}: Number of Returning Contributors out of {}".format(repo_label, first_time_contributors),
               toolbar_location=None, x_range=(-0.5, 1.3), tools = 'hover', tooltips = "@description", margin = (0, 0, 250, 0))

    p.wedge(x=0.87, y=1, radius=0.4, start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color=None, fill_color='color', legend_field='description', source=source)

    # Hand-placed table next to the pie: one row per category, rows laid out
    # top-down, with a header line above them.
    start_point = 0.88
    row_count = len(data)
    for i in range(row_count):
        row_y = start_point + 0.13*(row_count - 1 - i)

        #percentages
        p.add_layout(Label(x=-0.10, y=row_y, text='{}%'.format(data['percentage'].iloc[i]),
                           render_mode='css', text_font_size = '15px', text_font_style= 'bold'))

        #contributors
        p.add_layout(Label(x=0.27, y=row_y, text='{}'.format(data['value'].iloc[i]),
                           render_mode='css', text_font_size = '15px', text_font_style= 'bold'))

    header_y = start_point + 0.13*row_count
    # x position and text of the three column headers
    for header_x, header_text in ((-0.17, 'Percentages'), (-0.43, 'Category'), (0.15, '# Contributors')):
        p.add_layout(Label(x=header_x, y=header_y, text=header_text, render_mode='css',
                           text_font_size = '15px', text_font_style= 'bold'))

    caption_text = 'This pie chart shows the percentage of new contributors who were drive-by or repeat contributors. Drive by contributors are contributors who make one, and only one contribution to the repository, and that contribution is in the specified time period.  Repeat contributors are contributors who have made more than one contribution to the repository and their first contribution is in the specified time period.'
    caption_label = Label(x=0, y=-27, x_units='screen', y_units='screen',
             text='{}'.format(caption_text), render_mode='css',
             background_fill_color='white', text_font_size = '14pt')
    p.add_layout(caption_label)

    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color = None

    p.title.align = "center"
    p.title.text_font_size = "18px"

    p.legend.location = "center_left"
    p.legend.border_line_color = None
    p.legend.label_text_font_style = 'bold'
    p.legend.label_text_font_size = "15px"

    show(p)

In [11]:
# Drive-by vs. repeat split for one repository.
pie_chart(
    df,
    repo_name='jenkins-x-platform',
)

In [12]:
def vertical_stacked_bar_chart_2(input_df, months_df,repo_name=None, group_by='month', y_axis='new_contributors', title = "{}: Drive By and Repeat Contributor Counts per {}", save_file=False):
    """Show a stacked bar chart of drive-by vs. repeat new contributors per period.

    Contributors with any activity before ``begin_date`` are discarded, the
    remaining activity is cut off at ``end_date``, and the survivors are split
    into "drive by" (no row with rank 2) and "repeat" (has a row with rank 2).
    One stacked bar is drawn per month or per year.

    NOTE: ``begin_date`` and ``end_date`` are not parameters; they are read
    from the notebook's global namespace and must be defined before calling.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Contribution rows. The columns 'repo_name', 'yearmonth', 'cntrb_id',
        'rank', ``y_axis`` and (for yearly grouping) 'year' are read.
    months_df : pandas.DataFrame
        Filler rows appended before grouping so that periods without any new
        contributor still get a bar.
    repo_name : str, optional
        Only rows with this 'repo_name' are used. When falsy, all rows are
        used and the title says "All repos".
    group_by : {'month', 'year'}
        Period represented by each bar.
    y_axis : str
        Column that is summed per period to obtain the counts.
    title : str
        Format string receiving the repo name and "Month"/"Year".
    save_file : bool
        Accepted for signature compatibility; not used by this function.

    Raises
    ------
    ValueError
        If ``group_by`` is neither 'month' nor 'year'.
    """
    if group_by not in ('month', 'year'):
        raise ValueError("group_by must be 'month' or 'year', got {!r}".format(group_by))

    output_notebook()

    driver_df = input_df.copy()
    if repo_name:
        driver_df = driver_df.loc[driver_df['repo_name'] == repo_name]
    else:
        repo_name = "All repos"

    # drop every contributor that was already active before the window opens
    active_earlier = driver_df['yearmonth'] < begin_date
    known_ids = driver_df.loc[active_earlier, 'cntrb_id']
    driver_df = driver_df[~driver_df['cntrb_id'].isin(known_ids)]

    # ignore everything that happened after the window closes
    driver_df = driver_df.loc[driver_df['yearmonth'] < end_date]

    # a contributor is "repeat" as soon as a second (rank == 2) action exists
    repeat_ids = driver_df.loc[driver_df['rank'] == 2, 'cntrb_id']
    is_repeat = driver_df['cntrb_id'].isin(repeat_ids)
    drive_by_df = driver_df[~is_repeat]
    repeat_df = driver_df[is_repeat]

    # only the first action of a repeat contributor counts as "new"
    repeat_first_df = repeat_df.loc[repeat_df['rank'] == 1]

    #adds all months to driver_df so the lists of dates will include all months and years
    driver_df = pd.concat([driver_df, months_df])

    def counts_per_period(frame, key):
        # Select y_axis *before* summing: summing the whole frame aggregates
        # every column and fails on datetime columns with pandas >= 2.0.
        padded = pd.concat([frame, months_df])
        return padded.groupby(key)[y_axis].sum().reset_index(drop=True)

    def year_label(value):
        # categorical Bokeh ranges need string factors; 2019.0 -> '2019'
        if isinstance(value, (float, np.floating)) and float(value).is_integer():
            return str(int(value))
        return str(value)

    data = pd.DataFrame()
    if group_by == 'year':
        group_key = group_by

        # groupby() returns its groups sorted, so the labels must be sorted
        # as well or the bars end up under the wrong year
        data['dates'] = [year_label(year) for year in sorted(driver_df[group_by].dropna().unique())]

        #used to format x-axis and graph title
        group_by_format_string = "Year"

        #font size of drive by and repeat labels
        label_text_font_size = "14pt"
    else:
        group_key = 'yearmonth'

        # np.unique also sorts, which keeps the labels aligned with groupby()
        dates = np.datetime_as_string(driver_df['yearmonth'], unit='M')
        data['dates'] = np.unique(dates)

        #used to format x-axis and graph title
        group_by_format_string = "Month"

        #font size of drive by and repeat labels
        label_text_font_size = "13pt"

    #new contributor counts for each type of contributor and for both together
    data['drive_by_counts'] = counts_per_period(drive_by_df, group_key)
    data['repeat_counts'] = counts_per_period(repeat_first_df, group_key)
    data['total_counts'] = data['drive_by_counts'] + data['repeat_counts']

    # leave head room above the tallest bar for its label; fall back to a
    # unit range when there is nothing (or only zeros) to plot
    tallest = data['total_counts'].max()
    y_top = tallest * 1.15 if pd.notna(tallest) and tallest > 0 else 1

    # widen the figure once there are too many bars for the default width
    bar_count = len(data['total_counts'])
    plot_width = 46 * bar_count + 210 if bar_count >= 13 else 780

    data_source = {'Dates' : data['dates'],
            'Drive By'     : data['drive_by_counts'],
            'Repeat'       : data['repeat_counts'],
            'All'          : data['total_counts']}

    groups = ["Drive By", "Repeat"]

    colors = ['#56B4E9', '#E69F00']

    source = ColumnDataSource(data=data_source)

    p = figure(x_range=data['dates'], plot_height=500, plot_width = plot_width, title=title.format(repo_name, group_by_format_string),
               toolbar_location=None, y_range=(0, y_top), margin = (0, 0, 250, 0))

    vbar = p.vbar_stack(groups, x='Dates', width=0.8, color=colors, source=source)

    #add total counts above bars
    labels = LabelSet(x='Dates', y='All', text='All', y_offset=8, text_font_size="14pt",
                      text_color="black", source=source, text_align='center')
    p.add_layout(labels)

    #add drive by count labels
    labels = LabelSet(x='Dates', y='Drive By', text='Drive By', y_offset=-22, text_font_size=label_text_font_size,
              text_color="black", source=source, text_align='center')
    p.add_layout(labels)

    #add repeat count labels (anchored at the top of the stack, i.e. at 'All')
    labels = LabelSet(x='Dates', y='All', text='Repeat', y_offset=-22, text_font_size=label_text_font_size,
              text_color="black", source=source, text_align='center')
    p.add_layout(labels)

    #add legend: one entry per stacked renderer
    legend = Legend(items=[(group_name, [renderer]) for (group_name, renderer) in zip(groups, vbar)], location=(0, 200), label_text_font_size = "16px")
    p.add_layout(legend, 'right')

    #add caption showing what drive by and repeat contributors are
    caption = 'This graph shows the number of new contributors in the specified time period, and indicates how many were drive-by and repeat contributors. Drive by contributors are contributors who make one, and only one contribution to the repository, and that contribution is in the specified time period. A repeat contributor is someone who contributed for the first time and atleast one more time in the specified time period.'

    caption = Label(x=-10, y=-120, x_units='screen', y_units='screen',
                 text='{}'.format(caption), render_mode='css',
                 background_fill_color='white', text_font_size = '15pt')
    p.add_layout(caption)

    p.xgrid.grid_line_color = None
    p.y_range.start = 0
    p.axis.minor_tick_line_color = None
    p.outline_line_color = None

    p.title.align = "center"
    p.title.text_font_size = "18px"

    p.yaxis.axis_label = '# Contributors'
    p.xaxis.axis_label = group_by_format_string

    p.xaxis.axis_label_text_font_size = "18px"
    p.yaxis.axis_label_text_font_size = "16px"

    p.xaxis.major_label_text_font_size = "16px"
    p.xaxis.major_label_orientation = 45.0

    p.yaxis.major_label_text_font_size = "16px"

    p.legend.label_text_font_size = "20px"
    show(p)

In [13]:
# Draw the monthly drive-by vs. repeat stacked bar chart for the
# 'jenkins-x-platform' repository; months_df supplies the filler rows.
vertical_stacked_bar_chart_2(df, months_df,repo_name = 'jenkins-x-platform', group_by = 'month')