In [1]:
import plotly
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import pandas

#from util import ESConnection
#from elasticsearch_dsl import Search, Q

#es_conn = ESConnection()

init_notebook_mode(connected=True)


INIT_DATE_4Y = '2013-10-01'
INIT_DATE_1Y = '2016-10-01'
END_DATE = '2017-10-01'

In [2]:
from plotly.graph_objs import *
trace0 = Scatter(
    x=[1, 2, 3, 4],
    y=[10, 15, 13, 17]
)
trace1 = Scatter(
    x=[1, 2, 3, 4],
    y=[16, 5, 11, 9]
)
data = Data([trace0, trace1])

iplot(data)

## Basic metrics to be in the report:

**Git**
* Evolution and trends over time [per quarter] of commits by gender
  * Commits by gender (columns: hash, gender)
* Evolution and trends [per quarter] of developers over time by gender
  * Developers by gender (columns: name, uuid, gender)
* Evolution and trends of type of contributions (code or others) by gender over time
  * Type of file touched by developers (columns: filetype, gender)

**Gerrit**
* Evolution of code reviews over time by gender
  * Count votes by gender (vote, gender)
* Evolution of code reviews developers over time by gender
  * Count people voting (name, uuid, vote)
* Evolution of core reviews over time by gender
  * Votes +2 or -2 (vote +2/-2 and gender)
* Evolution of core review developers over time by gender
  * People voting +2 or -2 (name, uuid, vote +2/-2, gender)

**Others**
* Evolution of attracted developers over time by gender
  * First commit by gender
* Time working in the community by gender
  * Time difference between the first and last contribution by all developers (so how long developers remain in OpenStack?).

# SOME FUNCTIONS

In [2]:
def query_metric_over_time(index, metric_name, metric_field, filters = []):
    
    s = Search(using=es_conn, index=index)  # Index selection
    for filtering in filters:
        s = s.filter(filtering)
    s.aggs.bucket('gender', 'terms', field='gender')\
          .bucket('time', 'date_histogram', field='date', interval='quarter')\
          .metric(metric_name, 'cardinality', field=metric_field, precision_threshold=10000)
    result = s.execute()

    value = result.to_dict()["aggregations"]['gender']['buckets']
    
    df = pandas.DataFrame()
    for i in value:
        df2 = (pandas.DataFrame.from_dict(i["time"]["buckets"]))
        df2["gender"] = i["key"]
        df2[metric_name] = df2[metric_name].apply(lambda row:row["value"])
        df = pandas.concat([df, df2])
        
    return df

In [3]:
def query_total_piechart(index, metric_name, metric_field, filters = []):
    s = Search(using=es_conn, index=index)  # Index selection
    for filtering in filters:
        s = s.filter(filtering)
    #s = s.filter('range', date={'gt': start_date, 'lt':'now/M'}) # filter date
    s.aggs.bucket('gender', 'terms', field='gender')\
          .metric(metric_name, 'cardinality', field=metric_field, precision_threshold=1000000)
    result = s.execute()
    
    buckets = result.to_dict()["aggregations"]["gender"]["buckets"]
    pie_chart_labels = []
    pie_chart_values = []
    pie_chart_count = []
    for bucket in buckets:
        pie_chart_labels.append(bucket["key"])
        pie_chart_values.append(bucket[metric_name]["value"])
        pie_chart_count.append(bucket["doc_count"])
        
    return pie_chart_labels, pie_chart_values, pie_chart_count

In [4]:
def draw_evolutionary_chart(label, values):
    female_changesets = plotly.graph_objs.Scatter(
    x = df[df["gender"]=="female"][label],
    y = df[df["gender"]=="female"][values],
    name = 'female'
    )

    male_changesets = plotly.graph_objs.Scatter(
        x = df[df["gender"]=="male"][label],
        y = df[df["gender"]=="male"][values],
        name = 'male'
    )

    unknown_changesets = plotly.graph_objs.Scatter(
        x = df[df["gender"]=="NotKnown"][label],
        y = df[df["gender"]=="NotKnown"][values],
        name = 'unknown'
    )

    iplot([female_changesets, male_changesets, unknown_changesets])
    # Using only plot and not iplot creates and HTML page with JS to play with the chart

In [5]:
def draw_piechart(pie_chart_labels, pie_chart_values, title):
            
    fig = {
    'data': [{'labels': pie_chart_labels,
              'values': pie_chart_values,
              'type': 'pie'}],
    'layout': {'title': title}
     }
    iplot(fig)

# GERRIT

In [6]:
INDEX = "gerrit_eventized"
filter_date_4y = Q('range', date={'gte': INIT_DATE_4Y, 'lt': END_DATE})
filter_date_1y = Q('range', date={'gt': INIT_DATE_1Y, 'lt': END_DATE})

## Changeset Submissions by Gender

### Evolution of submissions sent over time by gender


In [7]:
METRIC_NAME = "changesets"
METRIC_FIELD = "id"
filter_changeset_submission = Q('term', eventtype='CHANGESET_SENT') # filter by event: vote a code review
df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, filter_changeset_submission])

In [8]:
draw_evolutionary_chart("key_as_string", METRIC_NAME)

In [9]:
df

Unnamed: 0,changesets,doc_count,key,key_as_string,gender
0,11646,11647,1380585600000,2013-10-01T00:00:00.000Z,male
1,14878,14843,1388534400000,2014-01-01T00:00:00.000Z,male
2,15121,15127,1396310400000,2014-04-01T00:00:00.000Z,male
3,16318,16304,1404172800000,2014-07-01T00:00:00.000Z,male
4,14421,14408,1412121600000,2014-10-01T00:00:00.000Z,male
5,18582,18573,1420070400000,2015-01-01T00:00:00.000Z,male
6,20717,20686,1427846400000,2015-04-01T00:00:00.000Z,male
7,23271,23224,1435708800000,2015-07-01T00:00:00.000Z,male
8,23414,23483,1443657600000,2015-10-01T00:00:00.000Z,male
9,27086,27294,1451606400000,2016-01-01T00:00:00.000Z,male


### Aggregated changeset submissions by gender

In [10]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y,
                                                                                             filter_changeset_submission])

In [11]:
draw_piechart(pie_chart_labels, pie_chart_values, 'Changeset Submissions by Gender (last 4 years)')

In [12]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_1y,
                                                                                             filter_changeset_submission])


In [13]:
draw_piechart(pie_chart_labels, pie_chart_values, 'Changeset Submissions by Gender (last year)')

## Population of people submitting changesets
### Evolution of submitters over time by gender
  * Count people submitting (id, uuid)

In [14]:
METRIC_NAME = "submitters"
METRIC_FIELD = "uuid"
df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, filter_changeset_submission])
draw_evolutionary_chart("key_as_string", METRIC_NAME)

### Aggregated number of submitters


In [15]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y,
                                                                                             filter_changeset_submission])
draw_piechart(pie_chart_labels, pie_chart_values, 'Changeset Submitters by Gender (last 4 years)')

In [15]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_1y,
                                                                                              filter_changeset_submission])
draw_piechart(pie_chart_labels, pie_chart_values, 'Changeset Submitters by Gender (last year)')

* Evolution of code reviews developers over time by gender
  * Count people voting (name, uuid, vote)


## Number of votes by gender

In [16]:
METRIC_NAME = "reviewer"
METRIC_FIELD = "uuid"
filter_vote = Q('term', eventtype='CHANGESET_PATCHSET_APPROVAL_Code-Review') # filter by event: vote a code review

df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, filter_vote])
draw_evolutionary_chart("key_as_string", "doc_count")

In [17]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_4y,
                                                                                              filter_vote])
draw_piechart(pie_chart_labels, pie_chart_count, 'Changeset votes by Gender (last 4 years)')

In [18]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_1y,
                                                                                              filter_vote])
draw_piechart(pie_chart_labels, pie_chart_count, 'Changeset votes by Gender (last year)')

## Number of people voting

In [19]:
draw_evolutionary_chart("key_as_string", METRIC_NAME)

In [20]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_4y,
                                                                                              filter_vote])
draw_piechart(pie_chart_labels, pie_chart_values, 'People voting by Gender (last 4 years)')

In [21]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_1y,
                                                                                              filter_vote])
draw_piechart(pie_chart_labels, pie_chart_values, 'People voting by Gender (last year)')

## Number of core reviews (-2 OR +2) by gender

In [22]:
METRIC_NAME = "core_reviewers"
METRIC_FIELD = "uuid"

filter_core_vote = Q('terms', value=["2", "-2"])
filter_vote = Q('term', eventtype='CHANGESET_PATCHSET_APPROVAL_Code-Review') # filter by event: vote a code review
df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_core_vote, filter_date_4y, filter_vote])
draw_evolutionary_chart("key_as_string", "doc_count")

In [23]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_4y,
                                                                                              filter_vote, filter_core_vote])
draw_piechart(pie_chart_labels, pie_chart_count, 'Core Reviews by Gender (last 4 years)')

In [24]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_1y,
                                                                                              filter_vote, filter_core_vote])
draw_piechart(pie_chart_labels, pie_chart_count, 'Core Reviews by Gender (last year)')

## Number of people acting as core reviewers (-2 OR +2) by gender

In [25]:
draw_evolutionary_chart("key_as_string", METRIC_NAME)

In [26]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_4y,
                                                                                              filter_vote, filter_core_vote])
draw_piechart(pie_chart_labels, pie_chart_values, 'Core Reviewers by Gender (last 4 years)')

In [27]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD,  [filter_date_1y,
                                                                                              filter_vote, filter_core_vote])
draw_piechart(pie_chart_labels, pie_chart_values, 'Core Reviewers by Gender (last year)')

# GIT

In [28]:
INDEX = "git_eventized"
filter_merges_addedlines = Q('range', addedlines={'gt': 0})
filter_merges_removedlines = Q('range', removedlines={'gt': 0})
filter_bots = Q('bool', must_not=[Q('match', gender_analyzed_name='Jenkins')])

In [29]:
git_filter = Q('bool',
    must_not=[Q('match', gender_analyzed_name='Jenkins')],
    should=[filter_merges_addedlines, filter_merges_removedlines]
)

* Evolution and trends over time [per quarter] of commits by gender
  * Commits by gender (columns: hash, gender)

In [30]:
METRIC_NAME = "commits"
METRIC_FIELD = "id.keyword"
#df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, filter_merges_addedlines,
#                                                               filter_merges_removedlines, filter_bots])
df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, git_filter])

In [31]:
draw_evolutionary_chart("key_as_string", METRIC_NAME)

In [32]:
#pie_chart_labels, pie_chart_values = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
#                                                            [filter_date_4y, filter_merges_addedlines,
#                                                             filter_merges_removedlines, filter_bots])

pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
                                                            [filter_date_4y, git_filter])

draw_piechart(pie_chart_labels, pie_chart_values, 'Commits by Gender (last 4 years)')

In [33]:
#pie_chart_labels, pie_chart_values = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
#                                                            [filter_date_1y, filter_merges_addedlines,
#                                                             filter_merges_removedlines, filter_bots])

pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
                                                            [filter_date_1y, git_filter])

draw_piechart(pie_chart_labels, pie_chart_values, 'Commits by Gender (last year)')


* Evolution and trends [per quarter] of developers over time by gender
  * Developers by gender (columns: name, uuid, gender)

In [34]:
METRIC_NAME = "authors"
METRIC_FIELD = "uuid"
filter_merges_addedlines = Q('range', addedlines={'gt': 0})
filter_merges_removedlines = Q('range', removedlines={'gt': 0})
filter_bots = Q('bool', must_not=[Q('match', gender_analyzed_name='Jenkins')])
#df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, filter_merges_addedlines, filter_merges_removedlines, filter_bots])
df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, git_filter])

draw_evolutionary_chart("key_as_string", METRIC_NAME)

In [35]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
                                                            [filter_date_4y, git_filter])
draw_piechart(pie_chart_labels, pie_chart_values, 'Authors by Gender (last 4 years)')

In [36]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
                                                            [filter_date_1y, git_filter])
draw_piechart(pie_chart_labels, pie_chart_values, 'Authors by Gender (last year)')

* Evolution and trends of type of contributions (code or others) by gender over time
  * Type of file touched by developers (columns: filetype, gender)

In [37]:
METRIC_NAME = "code_files_touched"
METRIC_FIELD = "id.keyword"
filter_merges_addedlines = Q('range', addedlines={'gt': 0})
filter_merges_removedlines = Q('range', removedlines={'gt': 0})
filter_bots = Q('bool', must_not=[Q('match', gender_analyzed_name='Jenkins')])
filter_filetype = Q('term', filetype='code')
df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, git_filter,
                                                               filter_filetype])

draw_evolutionary_chart("key_as_string", "doc_count")

In [38]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
                                                            [filter_date_4y, git_filter, filter_filetype])
draw_piechart(pie_chart_labels, pie_chart_count, 'Code Files touched by Gender (last 4 years)')

In [39]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
                                                            [filter_date_1y, git_filter, filter_filetype])
draw_piechart(pie_chart_labels, pie_chart_count, 'Code Files touched by Gender (last 1 year)')

In [40]:
METRIC_NAME = "others_files_touched"
METRIC_FIELD = "id.keyword"
filter_merges_addedlines = Q('range', addedlines={'gt': 0})
filter_merges_removedlines = Q('range', removedlines={'gt': 0})
filter_bots = Q('bool', must_not=[Q('match', gender_analyzed_name='Jenkins')])
filter_filetype = Q('term', filetype='other')
df = query_metric_over_time(INDEX, METRIC_NAME, METRIC_FIELD, [filter_date_4y, git_filter,
                                                               filter_filetype])
draw_evolutionary_chart("key_as_string", "doc_count")


In [41]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
                                                            [filter_date_4y, git_filter, filter_filetype])
draw_piechart(pie_chart_labels, pie_chart_count, 'Non-Code Files touched by Gender (last 4 years)')

In [42]:
pie_chart_labels, pie_chart_values, pie_chart_count = query_total_piechart(INDEX, METRIC_NAME, METRIC_FIELD, 
                                                            [filter_date_1y, git_filter, filter_filetype])
draw_piechart(pie_chart_labels, pie_chart_count, 'Non-Code Files touched by Gender (last year)')

# Top Projects by women activity and women population

In [43]:
METRIC_NAME = "commits"
METRIC_FIELD = "id.keyword"

filters = [filter_date_4y, git_filter]

s = Search(using=es_conn, index=INDEX)  # Index selection
for filtering in filters:
    s = s.filter(filtering)
s.aggs.bucket('project', 'terms', field='projects', size=1000)\
      .bucket('gender', 'terms', field='gender')\
      .metric(METRIC_NAME, 'cardinality', field=METRIC_FIELD, precision_threshold=10000)\
      .metric('authors', 'cardinality', field='uuid')
result = s.execute()

values = result.to_dict()

In [59]:
value = result.to_dict()["aggregations"]['project']['buckets']
    
df = pandas.DataFrame()
df_summary = pandas.DataFrame(columns=["project", "commits", "authors", "ratio_commits", "ratio_authors"])
for i in value:
    df2 = (pandas.DataFrame.from_dict(i["gender"]["buckets"]))
    df2["project"] = i["key"]
    
    df2[METRIC_NAME] = df2[METRIC_NAME].apply(lambda row:row["value"])
    df2['authors'] = df2['authors'].apply(lambda row:row['value'])
    df = pandas.concat([df, df2])
    
    df_summary2 = pandas.DataFrame(columns=["ratio_authors", "ratio_commits", "authors", "commits", "project"])

    df_summary2["project"] = [i["key"]]
    df_summary2["commits"] = [int(df2[df2["key"]=="female"]["commits"])]
    df_summary2["authors"] = [int(df2[df2["key"]=="female"]["authors"])]
    df_summary2["ratio_commits"] = [float(df2[df2["key"]=="female"]["commits"]) / float(df2[df2["key"]=="male"]["commits"])*100]
    df_summary2["ratio_authors"] = [float(df2[df2["key"]=="female"]["authors"]) / float(df2[df2["key"]=="male"]["authors"])*100]


    df_summary = pandas.concat([df_summary, df_summary2])

df_summary = df_summary[["project", "authors", "commits", "ratio_authors", "ratio_commits"]]
#df_summary.columns = ["project", "authors", "commits", "ratio_authors", "ratio_commits"]
    


In [60]:
print(df_summary)

                 project  authors  commits  ratio_authors  ratio_commits
0          Packaging-deb    623.0  13175.0      14.361457       8.536736
0                 murano     31.0   1002.0      21.379310      27.073764
0         Infrastructure    145.0   2303.0      11.214230       4.849442
0                   fuel     48.0   4518.0      14.634146      17.884570
0                neutron     99.0    858.0      15.840000       7.279824
0       OpenStack Charms      9.0    639.0       8.181818       3.438072
0          Documentation    145.0   3080.0      20.393812      37.211550
0                   nova    121.0    891.0      16.285330       6.913944
0              Telemetry     57.0    281.0      21.590909       5.624500
0       OpenStackAnsible     23.0    377.0      13.372093       3.270866
0                tripleo     40.0    663.0      11.204482       4.850746
0                octavia     44.0    312.0      11.956522       8.208366
0                horizon     96.0    949.0      22.

In [61]:
print(df_summary.sort_values("ratio_commits", ascending=False))

                 project  authors  commits  ratio_authors  ratio_commits
0              tricircle      8.0     41.0      72.727273     128.125000
0                    zun     10.0     97.0      38.461538      47.783251
0          Documentation    145.0   3080.0      20.393812      37.211550
0                  kuryr      9.0    111.0      19.565217      33.035714
0                vitrage     11.0    167.0      45.833333      28.842832
0                 murano     31.0   1002.0      21.379310      27.073764
0             dragonflow     13.0    226.0      30.952381      25.799087
0                horizon     96.0    949.0      22.748815      25.333689
0                 ironic     43.0   1172.0      14.478114      24.209874
0               congress     10.0    239.0      12.987013      24.044266
0                 tacker     33.0    180.0      22.758621      22.988506
0                 magnum     26.0    315.0      17.931034      22.292994
0                   I18n      1.0     18.0       7.

In [47]:
df_summary.sort_values("ratio_authors", ascending=False)


Unnamed: 0,project,authors,commits,ratio_authors,ratio_commits
0,8.0,41.0,tricircle,72.727273,128.125
0,11.0,167.0,vitrage,45.833333,28.842832
0,10.0,97.0,zun,38.461538,47.783251
0,8.0,12.0,karbor,38.095238,7.272727
0,16.0,171.0,senlin,32.653061,5.765341
0,13.0,226.0,dragonflow,30.952381,25.799087
0,13.0,46.0,watcher,23.636364,9.368635
0,33.0,180.0,tacker,22.758621,22.988506
0,96.0,949.0,horizon,22.748815,25.333689
0,26.0,117.0,Security,22.033898,7.758621


In [62]:
print(df_summary.sort_values("authors", ascending=False))


                 project  authors  commits  ratio_authors  ratio_commits
0          Packaging-deb    623.0  13175.0      14.361457       8.536736
0          Documentation    145.0   3080.0      20.393812      37.211550
0         Infrastructure    145.0   2303.0      11.214230       4.849442
0                   nova    121.0    891.0      16.285330       6.913944
0      Quality Assurance    101.0    650.0      13.760218       7.747318
0                neutron     99.0    858.0      15.840000       7.279824
0                horizon     96.0    949.0      22.748815      25.333689
0                 cinder     77.0    532.0      13.898917       9.646419
0                   oslo     59.0    416.0      11.706349       4.804250
0              Telemetry     57.0    281.0      21.590909       5.624500
0                 glance     57.0    234.0      18.811881      10.077519
0               keystone     56.0    621.0      15.217391      10.482782
0                   heat     54.0    217.0      15.

In [63]:
print(df_summary.sort_values("commits", ascending=False))


                 project  authors  commits  ratio_authors  ratio_commits
0          Packaging-deb    623.0  13175.0      14.361457       8.536736
0                   fuel     48.0   4518.0      14.634146      17.884570
0          Documentation    145.0   3080.0      20.393812      37.211550
0         Infrastructure    145.0   2303.0      11.214230       4.849442
0                 ironic     43.0   1172.0      14.478114      24.209874
0                 murano     31.0   1002.0      21.379310      27.073764
0                horizon     96.0    949.0      22.748815      25.333689
0       Puppet OpenStack     33.0    894.0       9.455587      11.149913
0                   nova    121.0    891.0      16.285330       6.913944
0                neutron     99.0    858.0      15.840000       7.279824
0                tripleo     40.0    663.0      11.204482       4.850746
0      Quality Assurance    101.0    650.0      13.760218       7.747318
0       OpenStack Charms      9.0    639.0       8.

In [50]:
df_summary.sort_values("project", ascending=True)


Unnamed: 0,project,authors,commits,ratio_authors,ratio_commits
0,1.0,18.0,I18n,7.142857,20.0
0,2.0,22.0,Packaging-rpm,6.896552,2.906209
0,2.0,5.0,ec2-api,13.333333,1.798561
0,3.0,47.0,storlets,16.666667,16.607774
0,4.0,6.0,cloudkitty,10.25641,1.973684
0,5.0,33.0,winstackers,15.151515,7.951807
0,6.0,17.0,Community App Catalog,10.714286,4.632153
0,6.0,15.0,freezer,12.0,1.726122
0,6.0,45.0,solum,10.169492,5.415162
0,7.0,41.0,searchlight,14.285714,8.932462


In [51]:
df_summary.columns

Index(['project', 'authors', 'commits', 'ratio_authors', 'ratio_commits'], dtype='object')

In [52]:
df_summary.columns = ["project", "authors", "commits", "ratio_authors", "ratio_commits"]

In [53]:
df_summary

Unnamed: 0,project,authors,commits,ratio_authors,ratio_commits
0,623.0,13175.0,Packaging-deb,14.361457,8.536736
0,31.0,1002.0,murano,21.37931,27.073764
0,145.0,2303.0,Infrastructure,11.21423,4.849442
0,48.0,4518.0,fuel,14.634146,17.88457
0,99.0,858.0,neutron,15.84,7.279824
0,9.0,639.0,OpenStack Charms,8.181818,3.438072
0,145.0,3080.0,Documentation,20.393812,37.21155
0,121.0,891.0,nova,16.28533,6.913944
0,57.0,281.0,Telemetry,21.590909,5.6245
0,23.0,377.0,OpenStackAnsible,13.372093,3.270866
