<a href="https://colab.research.google.com/github/fzanart/GHDomains/blob/main/Automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from google.colab import drive
# Mount Google Drive (Colab only) at /content/drive/ so that files under
# /content/drive/MyDrive/ can be read by the cells below.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Read the predicted-labels CSV from the mounted Drive, keeping only the
# three columns this notebook works with.
new_popular_df = pd.read_csv(
    '/content/drive/MyDrive/GHDomains/Resources/new_popular_df_predicted.csv',
    usecols=['Name', 'Workflows', 'Predicted_labels'],
)
new_popular_df

Unnamed: 0,Name,Workflows,Predicted_labels
0,vuejs/devtools,['Create Release'],Web libraries and frameworks
1,futurice/android-best-practices,,Documentation
2,microsoft/Web-Dev-For-Beginners,"['Azure Static Web Apps CI/CD', 'Lock closed i...",Documentation
3,airbnb/react-sketchapp,,Web libraries and frameworks
4,eugeneyan/applied-ml,,Documentation
...,...,...,...
888,graphql/dataloader,,Non-web libraries and frameworks
889,junyanz/CycleGAN,,Non-web libraries and frameworks
890,Tencent/wcdb,,Application & System software
891,halfrost/Halfrost-Field,['Deploy Blog'],Non-web libraries and frameworks


In [3]:
# Boolean mask: True for rows whose repository name is one of the four
# projects left out of the analysis.
excluded_repos = new_popular_df['Name'].isin([
    'chromium/chromium',
    'aosp-mirror/platform_frameworks_base',
    'llvm/llvm-project',
    'Homebrew/homebrew-core',
])
# Keep every row that is not flagged by the mask.
new_popular_df = new_popular_df.loc[~excluded_repos]

In [4]:
# binarize the workflows data
new_popular_df['Workflows'] = new_popular_df['Workflows'].notnull().astype('int')
new_popular_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Name,Workflows,Predicted_labels
0,vuejs/devtools,1,Web libraries and frameworks
1,futurice/android-best-practices,0,Documentation
2,microsoft/Web-Dev-For-Beginners,1,Documentation
3,airbnb/react-sketchapp,0,Web libraries and frameworks
4,eugeneyan/applied-ml,0,Documentation
...,...,...,...
888,graphql/dataloader,0,Non-web libraries and frameworks
889,junyanz/CycleGAN,0,Non-web libraries and frameworks
890,Tencent/wcdb,0,Application & System software
891,halfrost/Halfrost-Field,1,Non-web libraries and frameworks


In [5]:
# Summarise the data: one row per predicted domain.

# Aggregate the two columns explicitly by name:
#   'Has workflow' = sum of the binarized 'Workflows' flag,
#   'Total'        = number of repositories (non-null 'Name') in the domain.
# Calling .sum() on the whole grouped frame would also try to "sum" the
# string column 'Name' (string concatenation on pandas >= 2.0), which breaks
# the positional column renaming that used to follow.
df = new_popular_df.groupby('Predicted_labels').agg(
    **{
        'Has workflow': ('Workflows', 'sum'),
        'Total': ('Name', 'count'),
    }
)

# Repositories without any workflow = total minus those that have one
df['Not workflow'] = df['Total'] - df['Has workflow']

# Sanity check: the flag is 0/1, so a domain cannot have more workflows than repos
assert (df['Has workflow'] <= df['Total']).all(), "'Workflows' must be binarized before summarising"

# Reorder columns
df = df[['Has workflow', 'Not workflow', 'Total']]

# Data
df

Unnamed: 0_level_0,Has workflow,Not workflow,Total
Predicted_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Application & System software,55,24,79
Documentation,93,232,325
Non-web libraries and frameworks,97,75,172
Software tools,104,49,153
Web libraries and frameworks,97,63,160


In [6]:
from math import sqrt

def phi_effect_size(contingency_table, stat=None):
  """Return the phi effect size of a contingency table and its verbal label.

  phi = sqrt(chi2 / N), where N is the total number of observations in the
  table. The labels follow the thresholds for one degree of freedom (2x2
  table) from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5426219/#B2

  Parameters
  ----------
  contingency_table : 2-D array-like of counts (numpy array or nested lists).
  stat : optional chi-square statistic for the table. When omitted it is
    computed here with ``chi2_contingency(contingency_table)`` so the result
    depends only on the argument, not on a global variable left over from
    another cell.

  Returns
  -------
  (effect_size, interpretation) : effect size rounded to 3 decimals and one
    of 'small', 'small to medium', 'medium to large', 'large'.

  Raises
  ------
  ValueError if the table holds no observations or ``stat`` is negative/NaN.
  """
  # Row-wise sum works for both numpy arrays and nested lists
  total = sum(sum(row) for row in contingency_table)
  if total <= 0:
    raise ValueError('contingency_table must contain at least one observation')

  if stat is None:
    # First element of the result is the chi-square test statistic
    stat = chi2_contingency(contingency_table)[0]
  if not stat >= 0:
    # Also rejects NaN, for which every comparison is False
    raise ValueError('stat must be a non-negative number')

  effect_size = sqrt(stat / total)

  if effect_size <= 0.10:
    interpretation = 'small'
  elif effect_size <= 0.30:
    interpretation = 'small to medium'
  elif effect_size <= 0.50:
    interpretation = 'medium to large'
  else:
    interpretation = 'large'

  return round(effect_size, 3), interpretation

In [7]:
# Distinct predicted domains, in order of first appearance in the data.
domains = new_popular_df['Predicted_labels'].unique()

# Printed verdict for each outcome of a "reject H0?" comparison.
_VERDICT = {
    True: 'Dependent (reject H0)',
    False: 'Independent (fail to reject H0)',
}

# One chi-square test of independence per domain: this domain vs. all the
# other domains pooled together.
for domain in domains:
    print(domain, ':')

    # Row of counts for this domain, and the column-wise sum over the others.
    domain_counts = df.loc[domain]
    other_counts = df.loc[domains[domains != domain]].sum()

    # Stack as columns, keep the first two entries (has / not workflow; the
    # 'Total' entry is dropped) and transpose so each group is one row.
    aux = np.column_stack((domain_counts, other_counts))[:2].T
    print(aux)

    stat, p, dof, expected = chi2_contingency(aux)
    print('dof=%d' % dof)
    print(expected)

    # Decision based on the test statistic vs. the critical value
    prob = 0.95
    critical = chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
    print(_VERDICT[bool(abs(stat) >= critical)])

    # Decision based on the p-value vs. the significance level
    alpha = 1.0 - prob
    print('significance=%.3f, p=%.3f' % (alpha, p))
    print(_VERDICT[bool(p <= alpha)])

    print('Effect size: ', phi_effect_size(aux))
    print('=========================================')

Web libraries and frameworks :
[[ 97  63]
 [349 380]]
dof=1
[[ 80.26996625  79.73003375]
 [365.73003375 363.26996625]]
probability=0.950, critical=3.841, stat=8.031
Dependent (reject H0)
significance=0.050, p=0.005
Dependent (reject H0)
Effect size:  (0.095, 'small')
Documentation :
[[ 93 232]
 [353 211]]
dof=1
[[163.04836895 161.95163105]
 [282.95163105 281.04836895]]
probability=0.950, critical=3.841, stat=93.838
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Effect size:  (0.325, 'medium to large')
Software tools :
[[104  49]
 [342 394]]
dof=1
[[ 76.75815523  76.24184477]
 [369.24184477 366.75815523]]
probability=0.950, critical=3.841, stat=22.583
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Effect size:  (0.159, 'small to medium')
Application & System software :
[[ 55  24]
 [391 419]]
dof=1
[[ 39.63329584  39.36670416]
 [406.36670416 403.63329584]]
probability=0.950, critical=3.841, stat=12.282
Dependent (reject H0)
significance=0