# Loading in Data

Load the scraped data into pandas.

In [None]:
import pandas as pd
import glob
import os
import numpy as np

# Directory that holds the scraped CSV files ("" means the current working directory).
path = ""

# glob returns matches in arbitrary order; sort them so the concatenated row
# order (and therefore which duplicate row drop_duplicates keeps) is the same
# on every run.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # pd.concat on an empty input only reports "No objects to concatenate";
    # fail early with a message that names the directory that was searched.
    raise FileNotFoundError(
        "No CSV files found in {!r}".format(os.path.abspath(path))
    )

# Stack every CSV into one frame with a fresh 0..n-1 index.
activity_df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Parse the timestamp columns so the .dt accessor can be used on them later.
activity_df['Created'] = pd.to_datetime(activity_df['Created'])
activity_df['Updated'] = pd.to_datetime(activity_df['Updated'])

# Independent copy restricted to the columns listed below, with exact
# duplicate rows removed.
pr_df = activity_df[[
    "ID", "SrcRepo", "SrcBranch", "DestRepo", "DestBranch",
    "Title", "Description", "State", "Author", "Created", "Updated",
    "FileChanged", "Added", "Removed", "Total",
]].copy().drop_duplicates()

Verify that the data loaded correctly and derive the columns needed for further analysis.

In [None]:
# Year-month label ("YYYY-MM") taken from the 'Updated' timestamp.
activity_df['DateGroup'] = (
    activity_df['Updated'].dt.strftime('%Y-%m')
)

# Keep only the second '/'-separated piece of each repository value.
# NOTE: not idempotent -- re-running this cell on values that no longer
# contain a '/' leaves NaN, because there is no second piece to pick.
activity_df['SrcRepo'] = (
    activity_df['SrcRepo'].str.split('/').str.get(1)
)
activity_df['DestRepo'] = (
    activity_df['DestRepo'].str.split('/').str.get(1)
)

# Show the first rows to check the result.
activity_df.head()

In [None]:
# Year-month label ("YYYY-MM") taken from the 'Updated' timestamp.
pr_df['DateGroup'] = (
    pr_df['Updated'].dt.strftime('%Y-%m')
)

# Keep only the second '/'-separated piece of each repository value.
# NOTE: not idempotent -- re-running this cell on values that no longer
# contain a '/' leaves NaN, because there is no second piece to pick.
pr_df['SrcRepo'] = (
    pr_df['SrcRepo'].str.split('/').str.get(1)
)
pr_df['DestRepo'] = (
    pr_df['DestRepo'].str.split('/').str.get(1)
)

# Show the first rows to check the result.
pr_df.head()

# PR Data Analysis

## Grouped by Repositories

In [None]:
def pr_total_group_by(df, myList=None, *args):
    """Sum the ``Total`` column of *df* within each group defined by *myList*.

    Args:
        df: DataFrame that has a ``Total`` column and the grouping column(s).
        myList: Column name(s) to group by. Leaving it out is the same as
            passing an empty list, which pandas rejects.
        *args: Accepted for backward compatibility; ignored.

    Returns:
        DataFrame indexed by the group keys, with a single ``Total`` column
        containing the per-group sum.

    Raises:
        ValueError: If no grouping keys are supplied (raised by pandas).
    """
    # A None sentinel replaces the original shared mutable ``[]`` default.
    group_keys = [] if myList is None else myList
    return df.groupby(group_keys)[['Total']].sum()

def autopct_total(pct, df):
    """Build a two-line label: the percentage and the matching absolute amount.

    Written to be used through a lambda as the ``autopct`` callback of a pie
    plot.

    Args:
        pct: Percentage (0-100) of the whole that the slice represents.
        df: DataFrame whose ``Total`` column sums to the whole.

    Returns:
        A string of the form ``"12.3%\\n(45)"``.
    """
    total = np.sum(df["Total"].values.tolist())
    # Round instead of truncating: the float product frequently lands just
    # below the true integer (e.g. 29.0 / 100. * 100 == 28.999999999999996),
    # and a bare int() would then report an amount that is one too low.
    absolute = int(np.round(pct / 100. * total))
    return "{:.1f}%\n({:d})".format(pct, absolute)

# Sum of 'Total' for every source repository.
repo_group = pr_total_group_by(pr_df, ['SrcRepo'])

# Pie-chart alternative (swap with the bar plot below if preferred):
# repo_group = repo_group.plot(kind= 'pie', y= 'Total', legend=False, autopct = lambda pct: autopct_total(pct, repo_group))

# From here on repo_group holds the object returned by the plot call, not the
# grouped DataFrame.
repo_group = repo_group.plot.bar(y='Total', legend=False)
repo_group.set_title('Total Changes by repo')
repo_group.set_ylabel('');  # trailing ';' hides the return value in the notebook

# Uncomment the lines below if per-month data is needed
# date_repo_pivot = pr_df.pivot_table( index='DateGroup', columns='SrcRepo', values=['Total'], aggfunc='sum')
# date_repo_group = date_repo_pivot.plot(kind="bar", legend=True)
# date_repo_group.set_title('Total Changes per Month')
# date_repo_group.set_ylabel('');
# date_repo_group.axhline(pr_df["Total"].mean())
# date_repo_group.legend(bbox_to_anchor=(1.0, 1.0))


## Grouped by Authors

In [None]:
# Sum of 'Total' for every author.
author_group = pr_total_group_by(pr_df, ['Author'])

# From here on author_group holds the object returned by the plot call, not
# the grouped DataFrame.
author_group = author_group.plot.bar(y='Total', legend=False)
author_group.set_title('Total Changes by Author')
author_group.set_ylabel('');  # trailing ';' hides the return value in the notebook

# PR Activity Analysis

## Grouped by Repositories

In [None]:
# Count of non-null 'Total' entries for each (SrcRepo, Type) pair:
# repositories become rows, 'Type' values become columns.
activity_repo_pivot = activity_df.pivot_table(
    index='SrcRepo',
    columns='Type',
    values=['Total'],
    aggfunc='count',
)

# From here on the name holds the object returned by the plot call, not the
# pivot table.
activity_repo_pivot = activity_repo_pivot.plot.bar(legend=True)
activity_repo_pivot.set_title('Total Activity per Repo')
activity_repo_pivot.set_ylabel('');
# Anchor the legend at the top-right corner of the axes.
activity_repo_pivot.legend(bbox_to_anchor=(1.0, 1.0))

## Grouped by Users

In [None]:
# Count of non-null 'Total' entries for each (User, Type) pair:
# users become rows, 'Type' values become columns.
activity_user_pivot = activity_df.pivot_table(
    index='User',
    columns='Type',
    values=['Total'],
    aggfunc='count',
)

# From here on the name holds the object returned by the plot call, not the
# pivot table.
activity_user_pivot = activity_user_pivot.plot.bar(legend=True)
activity_user_pivot.set_title('Total Activity per User')
activity_user_pivot.set_ylabel('');
# Anchor the legend at the top-right corner of the axes.
activity_user_pivot.legend(bbox_to_anchor=(1.0, 1.0))