<a href="https://colab.research.google.com/github/fzanart/GHDomains/blob/main/Refactoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import os
import pandas as pd
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import download
ss = SnowballStemmer('english')
from spacy import load
from spacy.matcher import Matcher
from tqdm import tqdm
nlp = load("en_core_web_sm", exclude=["tok2vec", "tagger", "parser",'senter', "attribute_ruler", "lemmatizer", 'ner'])
from google.colab import drive
drive.mount('/content/drive/')
download('punkt')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [28]:
directory = '/content/drive/MyDrive/GHDomains/Downloaded_repos/'

In [29]:
word_list = ['refactor', 'restruct', 'clean', 'not used', 'unused', 'reformat', 'import', 'remove', 'replace', 'split', 'reorg', 'rename', 'move']

matcher = Matcher(nlp.vocab)

# Add patterns
patterns= [[{"LOWER": "refactor"}],
            [{"LOWER": "restruct"}],
            [{"LOWER": "clean"}],
            [{"LOWER": "not"},{"LOWER": "use"}],    # used      steam = use
            [{"LOWER": "unus"}],                    # unused    steam = unus
            [{"LOWER": "reformat"}],                ## steam = reform
            [{"LOWER": "import"}],
            [{"LOWER": "remov"}],                   # remove    steam = remov
            [{"LOWER": "replac"}],                  # replace   steam = replac
            [{"LOWER": "split"}],                   ## split     steam = splite / split
            [{"LOWER": "reorg"}],                   # reorg     steam = reorg
            [{"LOWER": "renam"}],                   # rename    steam = renam
            [{"LOWER": "move"}]]

matcher.add("refactoring_pattern", patterns)

In [30]:
def word_matcher(text):
    
    try:
        with nlp.disable_pipes():
        # Tokenize the commit sentence:
            text = word_tokenize(text)
        # apply steam technique
            text = ' '.join([ss.stem(word) for word in text])
        # select matching words
            text = nlp(text)
            words = matcher(text)
    except TypeError:
        return 0

    if len(words)>0:

        return 1

    else:

        return 0

In [35]:
matches = []
to_rescan = []

for filename in tqdm(os.listdir(directory)):
    if not filename.startswith('.'):
        try:
            aux_reader = pd.read_json(directory+filename)
            commits    = aux_reader['Messages:'].size
            match      = aux_reader['Messages:'].apply(word_matcher).sum()
            filename   = filename.replace(':', '/')
            filename   = filename.replace('.json', '')
            matches.extend([(filename, commits, match)])
        except:
            print('None commits found in: ', filename)
            filename   = filename.replace(':', '/')
            filename   = filename.replace('.json', '')
            to_rescan.extend([filename])


df = pd.DataFrame(matches, columns =['Name', 'Commits', 'Matches'])
df

100%|██████████| 441/441 [4:31:48<00:00, 36.98s/it]


Unnamed: 0,Name,Commits,Matches
0,limetext/lime,1104,150
1,pypa/pipenv,7318,709
2,deezer/spleeter,477,70
3,fchollet/deep-learning-with-python-notebooks,42,3
4,microsoft/Web-Dev-For-Beginners,1265,60
...,...,...,...
435,akullpp/awesome-java,1590,174
436,puppeteer/puppeteer,2483,376
437,alibaba/arthas,1716,93
438,atlassian/react-beautiful-dnd,795,86


In [37]:
# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np

new_popular_df = pd.read_csv('/content/drive/MyDrive/GHDomains/Resources/new_popular_df_predicted.csv', usecols=['Name','Predicted_labels'])
new_popular_df

Unnamed: 0,Name,Predicted_labels
0,vuejs/devtools,Web libraries and frameworks
1,futurice/android-best-practices,Documentation
2,microsoft/Web-Dev-For-Beginners,Documentation
3,airbnb/react-sketchapp,Web libraries and frameworks
4,eugeneyan/applied-ml,Documentation
...,...,...
436,railsware/upterm,Software tools
437,matryer/xbar,Software tools
438,thedaviddias/Front-End-Checklist,Documentation
439,electronicarts/CnC_Remastered_Collection,Documentation


In [42]:
join_data = pd.merge(new_popular_df, df, on ='Name', how ='inner')
join_data['Non refactoring commits'] = join_data['Commits'] - join_data['Matches']

columns = ['Name','Domain','Total commits', 'Refactoring commits', 'Non refactoring commits']
join_data.columns = columns
join_data = join_data[['Name','Domain', 'Refactoring commits', 'Non refactoring commits','Total commits']]
join_data

Unnamed: 0,Name,Domain,Refactoring commits,Non refactoring commits,Total commits
0,vuejs/devtools,Web libraries and frameworks,246,1723,1969
1,futurice/android-best-practices,Documentation,31,216,247
2,microsoft/Web-Dev-For-Beginners,Documentation,60,1205,1265
3,airbnb/react-sketchapp,Web libraries and frameworks,123,657,780
4,eugeneyan/applied-ml,Documentation,22,377,399
...,...,...,...,...,...
435,railsware/upterm,Software tools,446,2552,2998
436,matryer/xbar,Software tools,96,935,1031
437,thedaviddias/Front-End-Checklist,Documentation,82,612,694
438,electronicarts/CnC_Remastered_Collection,Documentation,1,11,12


In [46]:
df = join_data.groupby(['Domain']).sum()
df

Unnamed: 0_level_0,Refactoring commits,Non refactoring commits,Total commits
Domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Application & System software,26933,221603,248536
Documentation,14615,171078,185693
Non-web libraries and frameworks,39684,218172,257856
Software tools,95718,684419,780137
Web libraries and frameworks,40484,280542,321026


In [47]:
domains = join_data['Domain'].unique()

for domain in domains:
    print(domain, ':')
    data1 = df.loc[domain]
    data2 = df.loc[domains[domains != domain]].sum()
    aux = np.column_stack((data1, data2))[:2].T
    print(aux)
    stat, p, dof, expected = chi2_contingency(aux)
    print('dof=%d' % dof)
    print(expected)
    # interpret test-statistic
    prob = 0.95
    critical = chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
    if abs(stat) >= critical:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')
    # interpret p-value
    alpha = 1.0 - prob
    print('significance=%.3f, p=%.3f' % (alpha, p))
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')
    print('=========================================')

Web libraries and frameworks :
[[  40484  280542]
 [ 176950 1295272]]
dof=1
[[  38924.88227172  282101.11772828]
 [ 178509.11772828 1293712.88227172]]
probability=0.950, critical=3.841, stat=86.508
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Documentation :
[[  14615  171078]
 [ 202819 1404736]]
dof=1
[[  22515.55376724  163177.44623276]
 [ 194918.44623276 1412636.55376724]]
probability=0.950, critical=3.841, stat=3518.742
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Software tools :
[[ 95718 684419]
 [121716 891395]]
dof=1
[[ 94592.77716077 685544.22283923]
 [122841.22283923 890269.77716077]]
probability=0.950, critical=3.841, stat=26.937
Dependent (reject H0)
significance=0.050, p=0.000
Dependent (reject H0)
Application & System software :
[[  26933  221603]
 [ 190501 1354211]]
dof=1
[[  30135.36143579  218400.63856421]
 [ 187298.63856421 1357413.36143579]]
probability=0.950, critical=3.841, stat=449.425
Dependent (reject H0)
sig