### Data Umbrella PyMC Open Source Working Sessions
#### July to August, 2022

#### Notes
- Input data: `'../data/data_derived/pymc_2022_derived.csv'`
- Output data: None

---

In [1]:
from datetime import date

today = date.today()

print("Today's date:", today)

Today's date: 2022-08-26


In [2]:
import pandas as pd
import numpy as np
from dateutil import parser
import watermark
import feather
from pathlib import Path

In [3]:
import plotly.express as px
import plotly.graph_objects as go

In [4]:
%load_ext watermark
%watermark -n -v -m -g -iv

Python implementation: CPython
Python version       : 3.9.7
IPython version      : 7.25.0

Compiler    : Clang 11.1.0 
OS          : Darwin
Release     : 19.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit

Git hash: 5e5c7748f53923df7c844b6c8dda0adbf61fbded

numpy    : 1.20.2
pandas   : 1.3.3
dateutil : 2.8.2
plotly   : 5.4.0
watermark: 2.2.0
feather  : 0.1.2



# Plotly version

In [5]:
#!conda install -c plotly plotly=5.1.0
#!conda install -c plotly plotly=5.2.0
#conda install -c plotly plotly=5.4.0

In [6]:
# v 5.1.0 released Jun 28, 2021
# v 5.4.0 released Nov 15, 2021
#!pip freeze | grep plotly

---

## Read in data

In [7]:
dfall = pd.read_csv('../data/data_derived/pymc_2022_derived.csv')

In [8]:
df_use=dfall.copy()

In [9]:
dfall.columns

Index(['location', 'continent_o', 'country_o', 'iso_alpha', 'iso_num', 'lat',
       'lng', 'timestamp', 'city', 'state-province', 'country', 'gender',
       'pronouns', 'pre_oh', 'session_1', 'session_2', 'session_3', 'post_oh',
       'total_events', 'submitted_pr', 'joined_discord', 'role', 'urp',
       'sessions', 'learn_of_sprint', 'prior_sprints', 'os_experience',
       'prior_os_pr', 'git', 'prog_langs', 'python_experience', 'used_pymc',
       'pymc_repo_familiar', 'why_in_sprint', 'prep_work',
       'primary_spoken_language', 'language_translation', 'count_rows',
       'count', 'status_n', 'status_c'],
      dtype='object')

In [10]:
dfall.head(2)

Unnamed: 0,location,continent_o,country_o,iso_alpha,iso_num,lat,lng,timestamp,city,state-province,...,used_pymc,pymc_repo_familiar,why_in_sprint,prep_work,primary_spoken_language,language_translation,count_rows,count,status_n,status_c
0,Rochester USA,North America,United States,USD,840,43.157285,-77.615214,6/13/2022 18:54:55,Rochester,New York,...,I am not at all familiar with PyMC.,"I am familiar, I have looked at the codebase.","Looking to grow my skills as a developer, and ...",,English,No,1,1,0,Did Not Attend
1,Rochester USA,North America,United States,USD,840,43.157285,-77.615214,7/7/2022 10:47:41,Rochester,New York,...,I am not at all familiar with PyMC.,I am not at all familiar with this library.,Open source data is very valuable and I'd like...,Not sure,"English, ASL",No,1,1,0,Did Not Attend


In [11]:
#dfall.groupby(level="Attended")
dfall['status_c'].value_counts()

Did Not Attend    38
Attended          38
Name: status_c, dtype: int64

In [12]:
dfall.groupby(["status_c", "gender"]).size().reset_index(name="status")

Unnamed: 0,status_c,gender,status
0,Attended,Man,24
1,Attended,Woman,14
2,Did Not Attend,Man,20
3,Did Not Attend,Prefer not to say,2
4,Did Not Attend,Woman,16


In [13]:
dfall.groupby(["gender"]).size().reset_index()

Unnamed: 0,gender,0
0,Man,44
1,Prefer not to say,2
2,Woman,30


## Dictionary of Plotly Express chart functions

In [14]:
import plotly.express as px

# Dictionary mapping a chart-type name to the Plotly Express function that
# draws it, so a chart can be picked by string key,
# e.g. plot_dict['funnel'](df, x=..., y=...).
plot_dict = {'box': px.box, 
             'violin': px.violin,
             'scatter': px.scatter,
             'line': px.line,
             'pie': px.pie,
             'bar': px.bar,
             'scatter_geo': px.scatter_geo,
             'funnel': px.funnel,
            }

In [15]:
dfall.groupby(["gender"]).size().reset_index()

Unnamed: 0,gender,0
0,Man,44
1,Prefer not to say,2
2,Woman,30


In [16]:
dfall.groupby(["status_c"]).size().reset_index()

Unnamed: 0,status_c,0
0,Attended,38
1,Did Not Attend,38


In [17]:
dfall.groupby(["submitted_pr"]).size().reset_index()

Unnamed: 0,submitted_pr,0
0,0.0,52
1,1.0,24


In [18]:
dfall.groupby(["gender","status_c"]).size().reset_index()

Unnamed: 0,gender,status_c,0
0,Man,Attended,24
1,Man,Did Not Attend,20
2,Prefer not to say,Did Not Attend,2
3,Woman,Attended,14
4,Woman,Did Not Attend,16


In [19]:
dfall.groupby(["gender","status_c", "submitted_pr"]).size().reset_index()

Unnamed: 0,gender,status_c,submitted_pr,0
0,Man,Attended,0.0,9
1,Man,Attended,1.0,15
2,Man,Did Not Attend,0.0,20
3,Prefer not to say,Did Not Attend,0.0,2
4,Woman,Attended,0.0,5
5,Woman,Attended,1.0,9
6,Woman,Did Not Attend,0.0,16


In [20]:
# get data ready
# import pandas as pd
# stages = ["Registered", "Attended", "Submitted PR"]
# df_group1 = pd.DataFrame(dict(number=[30, 14, 9], stage=stages))
# df_group1['Gender'] = 'Woman'

# df_group2 = pd.DataFrame(dict(number=[44, 24, 15], stage=stages))
# df_group2['Gender'] = 'Man'

# df_group3 = pd.DataFrame(dict(number=[2, 0, 0], stage=stages))
# df_group3['Gender'] = 'Unspecified'

# df = pd.concat([df_group1, df_group2, df_group3], axis=0)

# # REFACTORED code

# fig = plot_dict['funnel'](df, x='number', y='stage', color='Gender')
# #fig = plot_dict['box'](df, x='number', y='stage', color='Gender')

# fig.show()

In [44]:
# refactored code
import plotly.express as px
from plotly import graph_objects as go

# Registration funnel, one trace per gender.
# The counts are hard-coded; they match the groupby tables displayed earlier
# in this notebook (by gender, and by gender x status_c x submitted_pr).
# Update them by hand if the input data changes.
funnel_stages = ["Registered", "Attended", "Submitted Pull Request"]
funnel_counts = {
    'Women': [30, 14, 9],
    'Men': [44, 24, 15],
    'Prefer not to say': [2, 0, 0],
}

fig = go.Figure()

for gender_label, stage_counts in funnel_counts.items():
    fig.add_trace(go.Funnel(
        name=gender_label,
        y=funnel_stages,
        x=stage_counts,
        # Use the same percentage base for every trace so that the labels
        # are comparable across genders.
        textinfo="value+percent initial"))

#fig.write_html(f'../graphs/1_funnel.html', include_plotlyjs="cdn")
fig.show()

In [22]:
df_use=dfall.copy()
df_use.head(3)

Unnamed: 0,location,continent_o,country_o,iso_alpha,iso_num,lat,lng,timestamp,city,state-province,...,used_pymc,pymc_repo_familiar,why_in_sprint,prep_work,primary_spoken_language,language_translation,count_rows,count,status_n,status_c
0,Rochester USA,North America,United States,USD,840,43.157285,-77.615214,6/13/2022 18:54:55,Rochester,New York,...,I am not at all familiar with PyMC.,"I am familiar, I have looked at the codebase.","Looking to grow my skills as a developer, and ...",,English,No,1,1,0,Did Not Attend
1,Rochester USA,North America,United States,USD,840,43.157285,-77.615214,7/7/2022 10:47:41,Rochester,New York,...,I am not at all familiar with PyMC.,I am not at all familiar with this library.,Open source data is very valuable and I'd like...,Not sure,"English, ASL",No,1,1,0,Did Not Attend
2,New York USA,North America,United States,USD,840,40.712728,-74.006015,6/13/2022 19:56:59,New York,New York,...,I am not at all familiar with PyMC.,I am not at all familiar with this library.,gain experience and learn,Yes,English,No,1,1,1,Attended


In [23]:
#df_use.groupby(["Attended"]).size().reset_index(name="status")

#df_use['Attended'].mask(df_use['Attended'] != '1', '0', inplace=True)
##df_use['status']=df['Attended']
#df_use['status'].mask(df_use['Attended'] == '0', 'Did Not Attend', inplace=True)
#df_use['status'].mask(df_use['Attended'] == '1', 'Attended', inplace=True)

##df_use.groupby(["Attended", "country"]).size().reset_index(name="status")
#df_use.groupby(["status", "country"]).size().reset_index()


In [24]:
import plotly.express as px

In [46]:
# Pie chart of one demographic column, driven by variables.
# Change `demographic` / `demographic_label` to chart a different column.

df_use = dfall.copy()

# Other settings that were tried:
#   demographic, demographic_label = "gender", "Gender"
#   demographic, demographic_label = 'contributor_status', 'New or Returning Contributor'
#     (NOTE: 'contributor_status' is not among the columns shown by dfall.columns)
demographic = 'role'
demographic_label = 'Role'

# Keep only rows whose status_n is 1 (in the rows displayed above these are
# the ones with status_c == 'Attended').
filter_var = "status_n"
filtered_list = [1]
subset = df_use[df_use[filter_var].isin(filtered_list)]

fig = px.pie(subset, values='count', names=demographic, title=f"{demographic_label}")

#fig.write_html(f'../graphs/2_pie_gender.html', include_plotlyjs="cdn")

fig.show()


In [26]:
#import plotly.express as px

# Gender split among the rows whose status_n is in `filtered_list`.
filter_var = "status_n"
filtered_list = [1]
subset = df_use.loc[df_use[filter_var].isin(filtered_list)]

fig = px.pie(
    subset,
    values='count',
    names='gender',
    title='Gender',
)

#fig.write_html(f'../graphs/2_pie_gender.html', include_plotlyjs="cdn")

fig.show()

In [27]:
import plotly.express as px

# Intended chart: new vs. returning contributors.
# The plotting call is disabled because it needs a 'contributor_status'
# column, which is not among the columns shown by dfall.columns.
filter_var = "status_n"
filtered_list = [1]
subset = df_use[df_use[filter_var].isin(filtered_list)]

#fig = px.pie(subset, values='count', names='contributor_status', title='New or Returning Contributor')
#fig.write_html(f'../graphs/3_pie_returning.html', include_plotlyjs="cdn")

# fig.show() is disabled as well: with the px.pie call commented out, `fig`
# still refers to a figure built by an earlier cell, so showing it here would
# re-display that unrelated chart under this cell.
#fig.show()

In [28]:
import plotly.express as px

# Role split among the rows whose status_n is in `filtered_list`.
filtered_list = [1]
subset = df_use.loc[df_use["status_n"].isin(filtered_list)]

fig = px.pie(
    subset,
    values='count',
    names='role',
    title='Role',
)

fig.show()

## Refactoring Code Here (Pie Charts)

In [29]:
import plotly.express as px
import pandas as pd

# Re-read the derived CSV so this cell does not depend on earlier cells' state.
df_use = pd.read_csv('../data/data_derived/pymc_2022_derived.csv')

# Pie chart of status_c, weighting each row by its 'count_rows' value.
fig = px.pie(df_use, values='count_rows', names='status_c', title='Status',
            labels={'count_rows':'Count'})
# Draw value, percentage and label inside each slice.
fig.update_traces(textposition='inside', textinfo='value+percent+label')

fig.show()

## Refactoring Code Here (Barplots)

In [30]:
import plotly.express as px

# All rows of dfall: count per country, split by attendance status.
df_use = dfall.copy()

grouped_yr_status = df_use.groupby(['country', 'status_c']).count().reset_index()
df4 = grouped_yr_status[['country', 'status_c', 'count']]

fig = px.bar(
    df4,
    y="country",
    x="count",
    color="status_c",
    text='count',
    orientation='h',
)

# One layout call. The y-axis category order is set exactly once
# ('total ascending'); a second, conflicting value would just be overridden.
fig.update_layout(
    title="by Country: Attendee Status",
    xaxis_title="Count",
    yaxis_title="Country",
    legend_title="Status",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='stack',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
#fig.write_html(f'../graphs/bar_all_country.html', include_plotlyjs="cdn")

fig.show()

In [31]:
import plotly.express as px

# All rows of dfall: count per country, split by attendance status.
# NOTE(review): this cell draws the same chart as the cell before it;
# consider keeping only one of the two.
df_use = dfall.copy()

grouped_yr_status = df_use.groupby(['country', 'status_c']).count().reset_index()
df4 = grouped_yr_status[['country', 'status_c', 'count']]

fig = px.bar(
    df4,
    y="country",
    x="count",
    color="status_c",
    text='count',
    orientation='h',
)

# One layout call; the y-axis category order is set exactly once.
fig.update_layout(
    title="by Country: Attendee Status",
    xaxis_title="Count",
    yaxis_title="Country",
    legend_title="Status",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='stack',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
#fig.write_html(f'../graphs/bar_all_country.html', include_plotlyjs="cdn")

fig.show()

In [32]:
# Select the ones you want
# Start again from an untouched copy of the full data set.
df_use=dfall.copy()

# status_n values to keep; in the rows displayed in this notebook,
# status_n == 1 goes with status_c == 'Attended'.
filtered_list = [1]

# `subset` is reused by the bar-chart cells that follow, so this cell must be
# run before them.
subset = df_use[df_use["status_n"].isin(filtered_list)]
subset.head(4)


Unnamed: 0,location,continent_o,country_o,iso_alpha,iso_num,lat,lng,timestamp,city,state-province,...,used_pymc,pymc_repo_familiar,why_in_sprint,prep_work,primary_spoken_language,language_translation,count_rows,count,status_n,status_c
2,New York USA,North America,United States,USD,840,40.712728,-74.006015,6/13/2022 19:56:59,New York,New York,...,I am not at all familiar with PyMC.,I am not at all familiar with this library.,gain experience and learn,Yes,English,No,1,1,1,Attended
3,New York USA,North America,United States,USD,840,40.712728,-74.006015,6/20/2022 20:25:26,New York,New York,...,I have used some PyMC for machine learning ana...,I have never looked at the source code.,To collaborate in building upon the data scien...,Yes,English,No,1,1,1,Attended
5,New York USA,North America,United States,USD,840,40.712728,-74.006015,6/23/2022 9:25:35,New York,New York,...,I am not at all familiar with PyMC.,"I am familiar, I have looked at the codebase.",I am interested in getting started in open sou...,Yes,English,No,1,1,1,Attended
6,New York USA,North America,United States,USD,840,40.712728,-74.006015,6/28/2022 12:35:09,New York,New York,...,I am not at all familiar with PyMC.,I am not at all familiar with this library.,To expend a bit my skills and knowledge on dat...,Yes,English,No,1,1,1,Attended


In [33]:
import plotly.express as px

# Rows in `subset` (built in an earlier cell): count per country, split by status.
grouped_yr_status = subset.groupby(['country', 'status_c']).count().reset_index()
df4 = grouped_yr_status[['country', 'status_c', 'count']]

fig = px.bar(
    df4,
    y="country",
    x="count",
    color="status_c",
    text='count',
    orientation='h',
)

# One layout call; the y-axis category order is set exactly once.
fig.update_layout(
    title="by Country: Attendee Status",
    xaxis_title="Count",
    yaxis_title="Country",
    legend_title="Status",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='stack',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
#fig.write_html(f'../graphs/barplot_country.html', include_plotlyjs="cdn")

fig.show()

In [34]:
subset.head(3)

Unnamed: 0,location,continent_o,country_o,iso_alpha,iso_num,lat,lng,timestamp,city,state-province,...,used_pymc,pymc_repo_familiar,why_in_sprint,prep_work,primary_spoken_language,language_translation,count_rows,count,status_n,status_c
2,New York USA,North America,United States,USD,840,40.712728,-74.006015,6/13/2022 19:56:59,New York,New York,...,I am not at all familiar with PyMC.,I am not at all familiar with this library.,gain experience and learn,Yes,English,No,1,1,1,Attended
3,New York USA,North America,United States,USD,840,40.712728,-74.006015,6/20/2022 20:25:26,New York,New York,...,I have used some PyMC for machine learning ana...,I have never looked at the source code.,To collaborate in building upon the data scien...,Yes,English,No,1,1,1,Attended
5,New York USA,North America,United States,USD,840,40.712728,-74.006015,6/23/2022 9:25:35,New York,New York,...,I am not at all familiar with PyMC.,"I am familiar, I have looked at the codebase.",I am interested in getting started in open sou...,Yes,English,No,1,1,1,Attended


In [35]:
# do by primary spoken language

import plotly.express as px

# Rows in `subset`: count per primary spoken language, coloured by country.
group_cols = ['country', 'status_c', 'primary_spoken_language']
grouped_yr_status = subset.groupby(group_cols).count().reset_index()
df4 = grouped_yr_status[group_cols + ['count']]

fig = px.bar(
    df4,
    y="primary_spoken_language",
    x="count",
    color="country",
    text='count',
    orientation='h',
)

fig.update_layout(
    title="by Primary Spoken Language(s)",
    xaxis_title="Count",
    yaxis_title="Language",
    # The colour encodes country, so the legend is labelled accordingly.
    legend_title="Country",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='stack',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
#fig.write_html(f'../graphs/spoken_languages.html', include_plotlyjs="cdn")

fig.show()

In [36]:
# do contributor status

import plotly.express as px

# Rows in `subset`: count per submitted_pr value, coloured by country.
# The column plotted here is 'submitted_pr' (shown as 0.0 / 1.0 in the groupby
# output earlier in the notebook), so the labels name that column rather than
# "contributor status".
group_cols = ['country', 'status_c', 'submitted_pr']
grouped_yr_status = subset.groupby(group_cols).count().reset_index()
df4 = grouped_yr_status[group_cols + ['count']]

fig = px.bar(
    df4,
    y="submitted_pr",
    x="count",
    color="country",
    text='count',
    orientation='h',
)

fig.update_layout(
    title="by Submitted Pull Request",
    xaxis_title="Count",
    yaxis_title="Submitted PR (0 = no, 1 = yes)",
    # The colour encodes country, so the legend is labelled accordingly.
    legend_title="Country",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='stack',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
#fig.write_html(f'../graphs/contr_status_country.html', include_plotlyjs="cdn")

fig.show()

In [37]:
# do role

import plotly.express as px

# Rows in `subset`: count per role, split by status.
grouped_yr_status = subset.groupby(['role', 'status_c']).count().reset_index()
df4 = grouped_yr_status[['role', 'status_c', 'count']]

fig = px.bar(
    df4,
    y="role",
    x="count",
    color="status_c",
    text='count',
    orientation='h',
)

# One layout call; the y-axis category order is set exactly once.
fig.update_layout(
    title="by Role",
    xaxis_title="Count",
    yaxis_title="Role",
    legend_title="Status",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='stack',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
#fig.write_html(f'../graphs/work_role.html', include_plotlyjs="cdn")

fig.show()

In [38]:
# How did people find out about the sprint?

# grouped by outreach channel (learn_of_sprint)

import plotly.express as px


# Rows in `subset`: count per outreach channel (learn_of_sprint), split by status.
grouped_yr_status = subset.groupby(['learn_of_sprint', 'status_c']).count().reset_index()
df4 = grouped_yr_status[['learn_of_sprint', 'status_c', 'count']]

fig = px.bar(
    df4,
    y="learn_of_sprint",
    x="count",
    color="status_c",
    text='count',
    orientation='h',
)

# One layout call; the y-axis category order is set exactly once.
fig.update_layout(
    title="How did you learn of sprint?",
    xaxis_title="Count",
    yaxis_title="Avenue",
    legend_title="Status",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='stack',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
# Export path uses '../graphs/' like the other cells in this notebook.
#fig.write_html(f'../graphs/learn_of_sprint.html', include_plotlyjs="cdn")

fig.show()

In [39]:
import plotly.express as px

# All rows of dfall: count per outreach channel, one bar per status (grouped bars).
df_use = dfall.copy()

# Column used both for the grouping and for the bar colour.
variable = "status_c"

grouped_yr_status = df_use.groupby(['learn_of_sprint', variable]).count().reset_index()
df4 = grouped_yr_status[['learn_of_sprint', variable, 'count']]

fig = px.bar(
    df4,
    y="learn_of_sprint",
    x="count",
    color=variable,
    text='count',
    orientation='h',
)

fig.update_layout(
    # The chart is grouped by outreach channel, not by country, and the raw
    # column name is not interpolated into the title.
    title="by Outreach: Attendee Status",
    xaxis_title="Count",
    yaxis_title="Outreach",
    legend_title="Status",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='group',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
#fig.write_html(f'../graphs/learn_all.html', include_plotlyjs="cdn")

fig.show()

---

In [40]:
import plotly.express as px

# All rows of dfall: count per country, coloured by outreach channel.
df_use = dfall.copy()

group_cols = ['country', 'learn_of_sprint', 'status_c']
grouped_yr_status = df_use.groupby(group_cols).count().reset_index()
df4 = grouped_yr_status[group_cols + ['count']]

fig = px.bar(
    df4,
    x="count",
    y="country",
    color="learn_of_sprint",
    text='count',
    orientation='h',
)

fig.update_layout(
    title="All Applicants (Sprint Outreach)",
    xaxis_title="Count",
    yaxis_title="Country",
    # The colour encodes learn_of_sprint, so the legend is labelled accordingly.
    legend_title="Outreach",
    font=dict(family="Courier New, monospace", size=12, color="RebeccaPurple"),
    width=700,
    height=700,
    barmode='stack',
    yaxis_categoryorder='total ascending',
    showlegend=True,
)
#fig.write_html(f'../graphs/learn_all_country.html', include_plotlyjs="cdn")

fig.show()

---

In [41]:
import pandas as pd

dfall = pd.read_csv('../data/data_derived/pymc_2022_derived.csv')


def group_data(df_use, byvar1, byvar2 ):
    """
    Count rows per (byvar1, byvar2, status_c) combination.

    Parameters
    ----------
        df_use: (dataframe object) must contain byvar1, byvar2, 'status_c' and 'count_rows'
        byvar1: (str) first grouping column
        byvar2: (str) second grouping column

    Returns:
    --------
        dataframe with columns [byvar1, byvar2, 'status_c', 'count_rows'], where
        'count_rows' is the number of non-null 'count_rows' values in each group
    """
    group_keys = [byvar1, byvar2, 'status_c']
    counts = df_use.groupby(group_keys).count().reset_index()
    return counts[group_keys + ['count_rows']]

# Alternative grouping (disabled: its result would be overwritten by the call below).
#df_use = group_data(dfall, 'country','learn_of_sprint')
df_use = group_data(dfall, 'country','role')


df_use.head(3)

Unnamed: 0,country,role,status_c,count_rows
0,Brasil,Data Scientist,Attended,1
1,Brasil,"Data Scientist, Statistician, Student",Attended,1
2,Canada,"Data Scientist, Student",Attended,1


In [42]:
import pandas as pd

def group_data(df_use, byvar):
    """
    Count rows per combination of the `byvar` columns and 'status_c'.

    Parameters
    ----------
        df_use: (dataframe object) data to group; must contain the `byvar`
            columns, 'status_c' and 'count_rows'. If None, the derived CSV is
            downloaded from the project's GitHub repository instead.
        byvar: (str or list of str) grouping column(s); not modified

    Returns:
    --------
        dataframe with columns [*byvar, 'status_c', 'count_rows'], where
        'count_rows' is the number of non-null 'count_rows' values in each group
    """
    # Use the frame the caller passed in; only fall back to the download when
    # no frame is given.
    if df_use is None:
        data_url = 'https://raw.githubusercontent.com/data-umbrella/data-umbrella-sprints-dashboard/main/data/data_derived/pymc_2022_derived.csv'
        df_use = pd.read_csv(data_url)

    # Build a new list so the caller's `byvar` is never mutated.
    by_columns = [byvar] if isinstance(byvar, str) else list(byvar)
    group_cols = by_columns + ["status_c"]

    df_grouped = df_use.groupby(group_cols).count().reset_index()
    return df_grouped[group_cols + ["count_rows"]]

df_use = group_data(dfall, ['country','learn_of_sprint'])
#df_use = group_data(dfall, ['country','role'])
#df_use = group_data(dfall, ['country'])


df_use.head(3)

Unnamed: 0,country,learn_of_sprint,status_c,count_rows
0,Brasil,"LinkedIn, Meetup",Attended,1
1,Brasil,Meetup,Attended,1
2,Canada,Slack,Attended,1


In [43]:
import pandas as pd
import plotly.express as px


def group_data(df_use, byvar):
    """
    Count rows per combination of the `byvar` columns and 'status_c'.

    Parameters
    ----------
        df_use: (dataframe object) must contain the `byvar` columns, 'status_c' and 'count_rows'
        byvar: (str or list of str) grouping column(s); not modified

    Returns:
    --------
        dataframe with columns [*byvar, 'status_c', 'count_rows'], where
        'count_rows' is the number of non-null 'count_rows' values in each group
    """
    # Build a new list so the caller's `byvar` is never mutated (appending to
    # it in place would leak 'status_c' / 'count_rows' into the caller's list).
    by_columns = [byvar] if isinstance(byvar, str) else list(byvar)
    group_cols = by_columns + ["status_c"]

    df_grouped = df_use.groupby(group_cols).count().reset_index()
    return df_grouped[group_cols + ["count_rows"]]


def graph_region(region_df, graph_type: str, dimension1: str, dimension2: str, dimension3: str) -> None:
    """
    Draw and display a horizontal Plotly Express chart of `region_df`.

    Parameters
    ----------
        region_df: (dataframe object) data to plot, e.g. the output of `group_data`
        graph_type: (string) "box", "violin", "scatter", "line", "pie", "bar", "funnel", "scatter_geo"
        dimension1: (str) column of region_df used for the x axis and the text labels
        dimension2: (str) column of region_df used for the y axis
        dimension3: (str) column of region_df used for the colour and the hover name

    Returns:
    --------
        None. The figure is displayed; on an unknown `graph_type` or an invalid
        dimension a message is printed instead.

    NOTE(review): every chart function is called with x/y/orientation keyword
    arguments; px.pie and px.scatter_geo presumably do not accept these.
    Confirm against the Plotly Express API before using those graph types.
    """

    # Dictionary of plots
    plot_dict = {'box': px.box,
                 'violin': px.violin,
                 'scatter': px.scatter,
                 'line': px.line,
                 'pie': px.pie,
                 'bar': px.bar,
                 'funnel': px.funnel,
                 'scatter_geo': px.scatter_geo,
                 }

    # Only the lookup is guarded by KeyError, so a KeyError raised while
    # building the figure is not misreported as an unknown graph type.
    try:
        plot_function = plot_dict[graph_type]
    except KeyError:
        print("Key not found. Make sure that 'graph_type' is in ['box','violin', 'scatter', 'line', 'pie', 'bar','funnel', 'scatter_geo']")
        return

    try:
        fig = plot_function(region_df,
                            x=dimension1,
                            y=dimension2,
                            color=dimension3,
                            hover_name=dimension3,
                            text=dimension1,
                            orientation='h',
                            )

        # Format figure
        title_string = f'Chart: {graph_type} plot of {dimension1} and {dimension2} by {dimension3}'
        fig.update_layout(title=title_string,
                          barmode='stack',
                          yaxis={'categoryorder': 'total ascending'})
        fig.show()

    except ValueError as error:
        print("Dimension is not valid. dimension1, dimension2 and dimension3 must be column names of region_df.")
        print(f"Details: {error}")
        
        
if __name__ == '__main__':

    # Load the derived registration data from the GitHub repository
    url = 'https://raw.githubusercontent.com/data-umbrella/data-umbrella-sprints-dashboard/main/data/data_derived/pymc_2022_derived.csv'
    data_use = pd.read_csv(url, index_col=0)

    # One bar chart of row counts per column, split by status_c, in this order.
    # 'role' is deliberately left out (too many rows).
    for plot_column in ('country', 'gender', 'learn_of_sprint', 'submitted_pr'):
        df_use = group_data(data_use, [plot_column])
        graph_region(df_use, 'bar', "count_rows", plot_column, "status_c")
    