<a href="https://colab.research.google.com/github/fzanart/GHDomains/blob/main/Automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from google.colab import drive
# Mount Google Drive at /content/drive/ so that files under
# /content/drive/MyDrive/ can be read by the cells below (Colab runtime only).
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
# Load the repositories with their predicted domain labels, keeping only the
# columns needed for the workflow analysis.
predictions_csv = '/content/drive/MyDrive/GHDomains/Resources/new_popular_df_predicted.csv'
wanted_columns = ['Name', 'Workflows', 'Predicted_labels']

new_popular_df = pd.read_csv(predictions_csv, usecols=wanted_columns)
new_popular_df

Unnamed: 0,Name,Workflows,Predicted_labels
0,vuejs/devtools,['Create Release'],Web libraries and frameworks
1,futurice/android-best-practices,,Documentation
2,microsoft/Web-Dev-For-Beginners,"['Azure Static Web Apps CI/CD', 'Lock closed i...",Documentation
3,airbnb/react-sketchapp,,Web libraries and frameworks
4,eugeneyan/applied-ml,,Documentation
...,...,...,...
436,railsware/upterm,,Software tools
437,matryer/xbar,,Software tools
438,thedaviddias/Front-End-Checklist,['pages-build-deployment'],Documentation
439,electronicarts/CnC_Remastered_Collection,,Documentation


In [5]:
# binarize the workflows data
new_popular_df['Workflows'] = new_popular_df['Workflows'].notnull().astype('int')
new_popular_df

Unnamed: 0,Name,Workflows,Predicted_labels
0,vuejs/devtools,1,Web libraries and frameworks
1,futurice/android-best-practices,0,Documentation
2,microsoft/Web-Dev-For-Beginners,1,Documentation
3,airbnb/react-sketchapp,0,Web libraries and frameworks
4,eugeneyan/applied-ml,0,Documentation
...,...,...,...
436,railsware/upterm,0,Software tools
437,matryer/xbar,0,Software tools
438,thedaviddias/Front-End-Checklist,1,Documentation
439,electronicarts/CnC_Remastered_Collection,0,Documentation


In [6]:
# Summarise the data: one row per predicted domain with the number of
# repositories that have workflows, that do not, and the group total.

# Group data by domain
by_domain = new_popular_df.groupby('Predicted_labels')

# Aggregate explicitly selected columns instead of calling .sum() on the whole
# grouped frame. The frame also holds the string column 'Name'; older pandas
# silently dropped it from the sum, but pandas >= 2.0 concatenates the strings,
# which leaves an extra 'Name' column and breaks the subtraction and any
# positional column renaming.
aux_df = by_domain['Name'].count()  # total repositories per domain

df = pd.DataFrame({
    # 'Workflows' holds 0/1 flags here, so the sum counts repos with a workflow
    'Has workflow': by_domain['Workflows'].sum(),
    'Total': aux_df,
})

# Repositories without workflows = total minus those that have them
df['Not workflow'] = df['Total'] - df['Has workflow']

# Reorder columns
df = df[['Has workflow', 'Not workflow', 'Total']]

# Data
df

Unnamed: 0_level_0,Has workflow,Not workflow,Total
Predicted_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Application & System software,28,7,35
Documentation,57,128,185
Non-web libraries and frameworks,39,23,62
Software tools,66,20,86
Web libraries and frameworks,51,22,73


In [7]:
# Chi-square test of independence, one domain versus all other domains pooled,
# on whether repositories have workflows.
# NOTE(review): one test is run per domain at the same significance level with
# no multiple-comparison correction -- confirm this is intended.
domains = new_popular_df['Predicted_labels'].unique()

# Confidence level and the matching significance level (same for every test)
prob = 0.95
alpha = 1.0 - prob

for domain in domains:
    print(domain, ':')

    # 2x2 contingency table:
    #   rows    -> [this domain, every other domain summed]
    #   columns -> first two columns of df (has workflow, no workflow)
    data1 = df.loc[domain]
    data2 = df.loc[domains[domains != domain]].sum()
    aux = np.vstack((data1, data2))[:, :2]
    print(aux)

    # With default arguments scipy applies Yates' continuity correction
    # when dof == 1, which is the case for a 2x2 table.
    stat, p, dof, expected = chi2_contingency(aux)
    print('dof=%d' % dof)
    print(expected)

    # interpret test-statistic against the critical value
    critical = chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
    print('Dependent (reject H0)' if abs(stat) >= critical
          else 'Independent (fail to reject H0)')

    # interpret p-value against the significance level
    print('significance=%.3f, p=%.3f' % (alpha, p))
    print('Dependent (reject H0)' if p <= alpha
          else 'Independent (fail to reject H0)')

    print('=========================================')

Web libraries and frameworks :
[[ 51  22]
 [190 178]]
dof=1
[[ 39.89342404  33.10657596]
 [201.10657596 166.89342404]]
probability=0.950, critical=3.841, stat=7.452
Dependent (reject H0)
significance=0.050, p=0.006
Dependent (reject H0)
Documentation :
[[ 57 128]
 [184  72]]
dof=1
[[101.09977324  83.90022676]
 [139.90022676 116.09977324]]
probability=0.950, critical=3.841, stat=71.421
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Software tools :
[[ 66  20]
 [175 180]]
dof=1
[[ 46.99773243  39.00226757]
 [194.00226757 160.99773243]]
probability=0.950, critical=3.841, stat=19.952
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Application & System software :
[[ 28   7]
 [213 193]]
dof=1
[[ 19.12698413  15.87301587]
 [221.87301587 184.12698413]]
probability=0.950, critical=3.841, stat=8.779
Dependent (reject H0)
significance=0.050, p=0.003
Dependent (reject H0)
Non-web libraries and frameworks :
[[ 39  23]
 [202 177]]
dof=1
[[ 33.88208617