# GitHub repositories analysis

In [1]:
import pandas as pd
import os
import glob

## Read the data

In [2]:
# Recursively collect every CSV file under ./data. glob returns matches in
# arbitrary, filesystem-dependent order, so sort them to make the load order
# (and therefore the row index of the concatenated frame) reproducible.
files = sorted(glob.glob("./data/**/*.csv", recursive=True))
print("Encontrados", len(files), "ficheros")

Encontrados 16 ficheros


In [3]:
# Read every discovered CSV and stack the frames into one, discarding the
# per-file row labels in favour of a fresh 0..n-1 index.
df = pd.concat(map(pd.read_csv, files), ignore_index=True)

In [4]:
# Preview the first three rows after ordering by ts and then views_total,
# both descending.
(
    df
    .sort_values(by=["ts", "views_total"], ascending=[False, False])
    .head(3)
)

Unnamed: 0.1,Unnamed: 0,name,ts,stars_count,views_total,views_uniques,clones_total,clones_uniques
43,3,ml-github,2022-07-13,0,6,1,4,3
41,1,golang-rest-api-cassandra,2022-07-13,2,2,1,42,32
40,0,b0rr3g0-blog,2022-07-13,0,0,0,1,1


## Data analysis

In [5]:
# Per-repository aggregates. Each aggregation is given as a list so the result
# has two-level columns such as ('views_total', 'sum').
aggregations = {
    "stars_count": ["max"],
    "clones_uniques": ["sum"],
    "views_total": ["sum", "max"],
}
df_grouped = df.groupby(["name"]).agg(aggregations)

### Views

In [6]:
# Thresholds on each repository's summed views_total: the 0.75 quantile is the
# "best" cut-off and the 0.25 quantile the "worst" one.
views_best_quantile = 0.75
views_worst_quantile = 0.25
total_views = df_grouped[("views_total", "sum")]
best_views_percentile = total_views.quantile(views_best_quantile)
worst_views_percentile = total_views.quantile(views_worst_quantile)
print("Best views over:", best_views_percentile)
print("Worst views under:", worst_views_percentile)

Best views over: 29.5
Worst views under: 1.0


In [7]:
# Repositories whose summed views are strictly above the best-views threshold,
# listed from most to least viewed.
views = df_grouped["views_total"]
is_top_viewed = views["sum"].gt(best_views_percentile)
views_best = views.loc[is_top_viewed]
views_best.sort_values(by="sum", ascending=False)

Unnamed: 0_level_0,sum,max
name,Unnamed: 1_level_1,Unnamed: 2_level_1
quarkus-kotlin-example,177,85
golang-rest-api-cassandra,91,32
kotlin-ktor,84,25
kustomize-vs-helm,43,24
ta,38,26
ml-github,35,13
pelorus-workshop,34,14


### Clones

In [8]:
# Thresholds on each repository's summed clones_uniques. The 0.75 quantile
# isolates the best 25% of repositories.
# NOTE(review): the "worst" cut-off here is the median (0.5), whereas the views
# and stars cells use 0.25 -- confirm this wider cut-off is intentional.
best_clones_quantile = 0.75
worst_clones_quantile = 0.5
total_unique_clones = df_grouped[("clones_uniques", "sum")]
best_clones_percentile = total_unique_clones.quantile(best_clones_quantile)
worst_clones_percentile = total_unique_clones.quantile(worst_clones_quantile)
print("Best clones over:", best_clones_percentile)
print("Worst clones under:", worst_clones_percentile)

Best clones over: 6.25
Worst clones under: 1.0


In [9]:
# Repositories whose summed unique clones are strictly above the best-clones
# threshold, listed from most to least cloned.
clones = df_grouped["clones_uniques"]
is_top_cloned = clones["sum"].gt(best_clones_percentile)
clones_best = clones.loc[is_top_cloned]
clones_best.sort_values(by="sum", ascending=False)

Unnamed: 0_level_0,sum
name,Unnamed: 1_level_1
ta,320
golang-rest-api-cassandra,297
mqtt-golang-influxdb,149
iot-devices-crud,130
pelorus-workshop,14
ml-github,13
b0rr3g0-blog,10


### Stars

In [10]:
# Thresholds on each repository's maximum stars_count. The 0.75 quantile
# isolates the best 25% of repositories; the 0.25 quantile is the "worst"
# cut-off.
best_stars_quantile = 0.75
worst_stars_quantile = 0.25
peak_stars = df_grouped[("stars_count", "max")]
best_stars_percentile = peak_stars.quantile(best_stars_quantile)
worst_stars_percentile = peak_stars.quantile(worst_stars_quantile)
print("Best stars over:", best_stars_percentile)
print("Worst stars under:", worst_stars_percentile)

Best stars over: 2.0
Worst stars under: 0.0


In [11]:
# Repositories whose maximum star count is strictly above the best-stars
# threshold, listed from most to least starred.
stars = df_grouped["stars_count"]
is_top_starred = stars["max"].gt(best_stars_percentile)
stars_best = stars.loc[is_top_starred]
stars_best.sort_values(by="max", ascending=False)

Unnamed: 0_level_0,max
name,Unnamed: 1_level_1
mqtt-golang-influxdb,9
spring-kotlin-crud,6
kotlin-ktor,5
kops-aws,4
kustomize-vs-helm,3
ta,3


## Red zone

In [12]:
# "Red zone": repositories at or below the worst threshold on all three
# metrics at once (maximum stars, summed unique clones, summed views).
low_stars = df_grouped[("stars_count", "max")] <= worst_stars_percentile
low_clones = df_grouped[("clones_uniques", "sum")] <= worst_clones_percentile
low_views = df_grouped[("views_total", "sum")] <= worst_views_percentile
df_red_zone = df_grouped[low_stars & low_clones & low_views]
df_red_zone.index.tolist()

['golang-azure-eventhub',
 'golang-azure-eventhub-kafka',
 'golang-gin-gonic',
 'ic-gradle',
 'spring-cloud-dataflow']