In [1]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from github import Github
from IPython.display import display, HTML
from plotly.offline import plot

In [2]:
# Set up GitHub access and collect the members of an organization.
# Show every row when a dataframe is displayed.
pd.set_option('display.max_rows', None)
# Read the access token from the GITHUB_TOKEN environment variable so the
# secret is never stored in the notebook. A KeyError here means the variable
# is not set in the kernel's environment.
g = Github(os.environ["GITHUB_TOKEN"])
# Logins of every member of the organization
memberList = [member.login for member in g.get_organization("PyGithub").get_members()]

In [3]:
# Collect one record per repository of every member, then build the dataframe
# in a single step. Growing a dataframe row by row copies it on every
# iteration, and DataFrame.append no longer exists in pandas 2.0+.
repoColumns = ['Username', 'Repository', 'DateCreated', 'Language', 'Size', 'Contributors']
repoRecords = []
for member in memberList:
    currentUser = g.get_user(member)
    for repo in currentUser.get_repos():
        repoRecords.append({
            'Username': member.strip(),
            'Repository': repo.name.strip(),
            # Only the creation year is kept; it is later turned into years of experience
            'DateCreated': repo.created_at.year,
            'Language': repo.language,
            'Size': repo.size,
            'Contributors': repo.get_contributors().totalCount,
        })
# Passing the column list keeps the expected schema even when no repository was found
df = pd.DataFrame(repoRecords, columns=repoColumns)

In [45]:
# Later cells still assign through chained indexing, so keep the copy warning suppressed
pd.options.mode.chained_assignment = None
# Sort the dataframe by user and creation year
df = df.sort_values(by=['Username', 'DateCreated'])
# Drop repositories whose size or contributor count is zero
df = df.loc[(df[['Size', 'Contributors']] != 0).all(axis=1)]
# Drop repositories without a language. A missing language is a real
# None/NaN value, which a comparison against the string 'None' never
# matches; the string check is kept as well in case the value was stringified.
df = df.loc[df['Language'].notna() & (df['Language'] != 'None')].copy()
# Shift the creation year to "years of experience": subtract each user's
# earliest creation year. A grouped transform does this for every user at
# once and avoids chained-indexing assignment.
creationYear = df['DateCreated'].astype(int)
df['DateCreated'] = creationYear - creationYear.groupby(df['Username']).transform('min')
# Largest number of years of experience; 0 when no repository survived the filters
maxYear = int(df['DateCreated'].max()) if not df.empty else 0

In [46]:
# Display numbers without decimals. The fully qualified option name is
# required: the bare 'precision' pattern is ambiguous or unknown in
# current pandas versions.
pd.set_option('display.precision', 0)
# Every language present in the cleaned data; each yearly table has one row
# per language, in this order, so the tables line up row by row.
uniqueValues = df['Language'].unique()
# Create one summary dataframe per year of experience
dataframe_collection = []
# NOTE(review): range(maxYear) stops before the final year (maxYear itself);
# confirm whether the last year should be included here and in the animation.
for year in range(maxYear):
    tempArray = df.loc[df['DateCreated'] == year, ['Language', 'Size', 'Contributors']]
    # Force numeric dtypes so the group means are well defined even when the
    # columns arrive with object dtype
    tempArray = tempArray.assign(
        Size=tempArray['Size'].astype(float),
        Contributors=tempArray['Contributors'].astype(float),
    )
    byLanguage = tempArray.groupby('Language')
    # Per language: number of repositories, mean repository size and mean
    # contributor count among the repositories created in this year
    yearArray = pd.DataFrame({
        'Repositories': byLanguage.size(),
        'Average Size': byLanguage['Size'].mean(),
        'Contributors': byLanguage['Contributors'].mean(),
    })
    # Languages with no repository this year get a row of zeros
    yearArray = (
        yearArray.reindex(uniqueValues)
        .fillna(0)
        .rename_axis('Language')
        .reset_index()
    )
    dataframe_collection.append(yearArray)

In [47]:
# Carry each language's values forward from one year to the next so every
# yearly table also reflects the years before it.
# NOTE: this updates dataframe_collection in place, so running the cell twice
# accumulates twice; re-run the cell that builds dataframe_collection first.
for dataIndex in range(1, len(dataframe_collection)):
    previousYear = dataframe_collection[dataIndex - 1]
    currentYear = dataframe_collection[dataIndex]
    # Running total of repositories
    currentYear['Repositories'] = currentYear['Repositories'] + previousYear['Repositories']
    # Average of this year's value and the carried-over value. The division
    # must apply to the sum; previously only the carried-over term was halved.
    currentYear['Average Size'] = (currentYear['Average Size'] + previousYear['Average Size']) / 2
    # Running total of the yearly contributor means
    currentYear['Contributors'] = currentYear['Contributors'] + previousYear['Contributors']

In [62]:
# Remove the languages treated as outliers and tag every yearly table with its year
outliers = ['None', 'Erlang', 'HCL', 'VimL','Go','Roff','Jupyter Notebook','Python']
for dataIndex in range(len(dataframe_collection)):
    yearFrame = dataframe_collection[dataIndex]
    # One membership test replaces a replace/dropna pass per outlier; dropna
    # still removes rows that have a missing value (e.g. a missing language)
    yearFrame = yearFrame.loc[~yearFrame['Language'].isin(outliers)].dropna().copy()
    yearFrame['Year'] = dataIndex
    dataframe_collection[dataIndex] = yearFrame
# Stack the yearly tables into one long table, ordered by language
finalDataBase = pd.concat(dataframe_collection)
finalDataBase = finalDataBase.sort_values(by=['Language'])
languages = finalDataBase.Language.unique()
# Year labels derived from the data instead of a hard-coded list of nine years
years = [str(year) for year in range(len(dataframe_collection))]

In [63]:
# Empty figure specification: traces, layout and animation frames are filled
# in by the following cells
fig_dict = dict(data=[], layout={}, frames=[])

In [64]:
# One entry per year of experience, 0 .. maxYear - 1
years = np.arange(maxYear)
# Axis titles with hard-coded ranges, the hover mode, and a placeholder
# slider description (the last cell assigns layout["sliders"] again before
# the figure is built)
fig_dict["layout"].update({
    "xaxis": {"range": [0, 45], "title": "Number of Active Repositories"},
    "yaxis": {"title": "Number of Contributors", "range": [0, 400]},
    "hovermode": "closest",
    "sliders": {
        "args": [
            "transition",
            {"duration": 400, "easing": "cubic-in-out"},
        ],
        "initialValue": "0",
        "plotlycommand": "animate",
        "values": years,
        "visible": True,
    },
})

In [65]:
# Play / Pause buttons that drive the animation
fig_dict["layout"]["updatemenus"] = [
    dict(
        buttons=[
            # Play: animate through all frames, starting from the current one
            dict(
                args=[
                    None,
                    dict(
                        frame=dict(duration=500, redraw=False),
                        fromcurrent=True,
                        transition=dict(duration=300, easing="quadratic-in-out"),
                    ),
                ],
                label="Play",
                method="animate",
            ),
            # Pause: animate to "no frame" immediately, which halts playback
            dict(
                args=[
                    [None],
                    dict(
                        frame=dict(duration=0, redraw=False),
                        mode="immediate",
                        transition=dict(duration=0),
                    ),
                ],
                label="Pause",
                method="animate",
            ),
        ],
        direction="left",
        pad=dict(r=10, t=87),
        showactive=False,
        type="buttons",
        x=0.1,
        xanchor="right",
        y=0,
        yanchor="top",
    )
]

# Slider specification; one step per year is appended to "steps" when the
# frames are built
sliders_dict = dict(
    active=0,
    yanchor="top",
    xanchor="left",
    # Label showing the currently selected value
    currentvalue=dict(
        font=dict(size=20),
        prefix="Years of Experience:",
        visible=True,
        xanchor="right",
    ),
    transition=dict(duration=300, easing="cubic-in-out"),
    pad=dict(b=10, t=50),
    len=0.9,
    x=0.1,
    y=0,
    steps=[],
)

In [66]:
# Initial traces shown before the animation starts: one marker trace per
# language, built from the data of year 0.
year = 0
# Start from an empty trace list so re-running this cell does not duplicate traces
fig_dict["data"] = []
# The year filter does not depend on the language, so compute it once
dataset_by_year = finalDataBase[finalDataBase["Year"] == year]
for language in languages:
    dataset_by_year_and_language = dataset_by_year[dataset_by_year["Language"] == language]
    data_dict = {
        "x": list(dataset_by_year_and_language["Repositories"]),
        "y": list(dataset_by_year_and_language["Contributors"]),
        "mode": "markers",
        "text": list(dataset_by_year_and_language["Language"]),
        # Marker area is driven by the "Average Size" column
        "marker": {
            "sizemode": "area",
            "sizeref": 5,
            "size": list(dataset_by_year_and_language["Average Size"])
        },
        "name": language
    }
    fig_dict["data"].append(data_dict)

In [67]:
# Make one animation frame and one slider step per year.
# Start from empty lists so re-running this cell does not duplicate frames or steps.
fig_dict["frames"] = []
sliders_dict["steps"] = []
for year in years:
    frame = {"data": [], "name": str(year)}
    # The year filter does not depend on the language, so compute it once per frame
    dataset_by_year = finalDataBase[finalDataBase["Year"] == int(year)]
    for language in languages:
        dataset_by_year_and_language = dataset_by_year[dataset_by_year["Language"] == language]
        data_dict = {
            "x": list(dataset_by_year_and_language["Repositories"]),
            "y": list(dataset_by_year_and_language["Contributors"]),
            "mode": "markers",
            "text": list(dataset_by_year_and_language["Language"]),
            # Marker area is driven by the "Average Size" column
            "marker": {
                "sizemode": "area",
                "sizeref": 5,
                "size": list(dataset_by_year_and_language["Average Size"])
            },
            "name": language
        }
        frame["data"].append(data_dict)

    fig_dict["frames"].append(frame)
    # The step targets the frame by name; frame names are strings, so the
    # argument is the same str(year) used for the frame above
    slider_step = {"args": [
        [str(year)],
        {"frame": {"duration": 300, "redraw": False},
         "mode": "immediate",
         "transition": {"duration": 300}}
    ],
        "label": str(year),
        "method": "animate"}
    sliders_dict["steps"].append(slider_step)

In [68]:
# Attach the finished slider (a list with one slider) to the layout
fig_dict["layout"].update(sliders=[sliders_dict])
# Build the figure from the assembled specification
fig = go.Figure(fig_dict)
# Render to an HTML file; the returned file name is the cell's output
plot(fig)

'temp-plot.html'