<a href="https://colab.research.google.com/github/fzanart/GHDomains/blob/main/Automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from google.colab import drive
# Mount Google Drive in the Colab runtime; the CSV loaded below is read from
# a path under /content/drive/.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Load the predicted-domain dataset, keeping only the repository name, its
# workflows entry (missing for some repositories) and the predicted label.
predicted_csv = '/content/drive/MyDrive/GHDomains/Resources/new_popular_df_predicted.csv'
wanted_columns = ['Name', 'Workflows', 'Predicted_labels']
new_popular_df = pd.read_csv(predicted_csv, usecols=wanted_columns)
new_popular_df

Unnamed: 0,Name,Workflows,Predicted_labels
0,vuejs/devtools,['Create Release'],Web libraries and frameworks
1,futurice/android-best-practices,,Documentation
2,microsoft/Web-Dev-For-Beginners,"['Azure Static Web Apps CI/CD', 'Lock closed i...",Documentation
3,airbnb/react-sketchapp,,Web libraries and frameworks
4,eugeneyan/applied-ml,,Documentation
...,...,...,...
888,graphql/dataloader,,Non-web libraries and frameworks
889,junyanz/CycleGAN,,Non-web libraries and frameworks
890,Tencent/wcdb,,Application & System software
891,halfrost/Halfrost-Field,['Deploy Blog'],Non-web libraries and frameworks


In [3]:
# Binarize the workflows data: 1 when the repository has a workflows entry,
# 0 when the value is missing.
# The conversion is guarded so this cell is idempotent: once the column holds
# integers, calling notnull() again would turn every row into 1 and silently
# erase the real flags used by the summary below.
if not pd.api.types.is_integer_dtype(new_popular_df['Workflows']):
    new_popular_df['Workflows'] = new_popular_df['Workflows'].notnull().astype('int')
new_popular_df

Unnamed: 0,Name,Workflows,Predicted_labels
0,vuejs/devtools,1,Web libraries and frameworks
1,futurice/android-best-practices,0,Documentation
2,microsoft/Web-Dev-For-Beginners,1,Documentation
3,airbnb/react-sketchapp,0,Web libraries and frameworks
4,eugeneyan/applied-ml,0,Documentation
...,...,...,...
888,graphql/dataloader,0,Non-web libraries and frameworks
889,junyanz/CycleGAN,0,Non-web libraries and frameworks
890,Tencent/wcdb,0,Application & System software
891,halfrost/Halfrost-Field,1,Non-web libraries and frameworks


In [4]:
# Summarise the data: one row per predicted domain with the number of
# repositories that have / do not have a workflow, plus the domain total.

# Repositories with a workflow per domain. 'Workflows' is selected explicitly
# so the string column 'Name' is never summed: pandas >= 2.0 no longer drops
# non-numeric columns in groupby().sum(), which would leave an extra 'Name'
# column and break the concat / rename steps below.
workflow_counts = new_popular_df.groupby(['Predicted_labels'])['Workflows'].sum()
# Total repositories per domain (count of non-null names)
aux_df = new_popular_df.groupby(['Predicted_labels'])['Name'].count()

# Concatenate data and add the difference of total values and repos that have workflows implemented
df = pd.concat([workflow_counts, aux_df], axis=1)
df['Not Workflow'] = df['Name'] - df['Workflows']

# Rename and reorder columns (positional rename: Workflows, Name, Not Workflow)
columns = ['Has workflow', 'Total', 'Not workflow']
df.columns = columns
df = df[['Has workflow', 'Not workflow', 'Total']]

# Data
df

Unnamed: 0_level_0,Has workflow,Not workflow,Total
Predicted_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Application & System software,55,24,79
Documentation,93,232,325
Non-web libraries and frameworks,97,76,173
Software tools,106,50,156
Web libraries and frameworks,97,63,160


In [5]:
# One-vs-rest chi-square test of independence for every predicted domain:
# each domain's (has workflow, not workflow) counts are compared with the
# pooled counts of all the other domains.
domains = new_popular_df['Predicted_labels'].unique()

# Confidence level and the matching significance level, shared by every test.
prob = 0.95
alpha = 1.0 - prob


def _print_verdict(reject_h0):
    """Print the test decision for a single criterion."""
    if reject_h0:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')


for domain in domains:
    print(domain, ':')

    # 2x2 contingency table: row 0 is this domain, row 1 is every other
    # domain pooled; only the first two columns of df are kept, so the
    # 'Total' column is left out.
    other_domains = [label for label in domains if label != domain]
    in_domain = df.loc[domain].to_numpy()[:2]
    out_domain = df.loc[other_domains].sum().to_numpy()[:2]
    table = np.vstack((in_domain, out_domain))
    print(table)

    stat, p, dof, expected = chi2_contingency(table)
    print('dof=%d' % dof)
    print(expected)

    # interpret test-statistic
    critical = chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
    _print_verdict(abs(stat) >= critical)

    # interpret p-value
    print('significance=%.3f, p=%.3f' % (alpha, p))
    _print_verdict(p <= alpha)
    print('=========================================')

Web libraries and frameworks :
[[ 97  63]
 [351 382]]
dof=1
[[ 80.268757  79.731243]
 [367.731243 365.268757]]
probability=0.950, critical=3.841, stat=8.024
Dependent (reject H0)
significance=0.050, p=0.005
Dependent (reject H0)
Documentation :
[[ 93 232]
 [355 213]]
dof=1
[[163.04591265 161.95408735]
 [284.95408735 283.04591265]]
probability=0.950, critical=3.841, stat=93.590
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Software tools :
[[106  50]
 [342 395]]
dof=1
[[ 78.26203807  77.73796193]
 [369.73796193 367.26203807]]
probability=0.950, critical=3.841, stat=23.050
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Application & System software :
[[ 55  24]
 [393 421]]
dof=1
[[ 39.63269877  39.36730123]
 [408.36730123 405.63269877]]
probability=0.950, critical=3.841, stat=12.278
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Non-web libraries and frameworks :
[[ 97  76]
 [351 369]]
dof=1
[[ 86.79059351  86.20