In [1]:
# Standard library
import warnings

# Third-party
import numpy as np
import pandas as pd
# `display` is imported from its public location; the old
# `IPython.core.display` path is deprecated and fails on recent IPython.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

# Project-local helpers
import classifier_utils

# Render every column of wide DataFrames instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Silence all Python warnings raised from this point on.
warnings.filterwarnings("ignore")

In [2]:
# Columns handed to the classifier_utils helpers as "non-feature" columns
# (presumably excluded from the feature set there -- confirm in classifier_utils).
non_features_columns = [
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset and collect the distinct values of its 'project'
# column, in order of first appearance, as a plain Python list.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = selected_dataset['project'].unique().tolist()

## Attributes relevance

In [4]:
import importlib

# Reload classifier_utils so that edits made to the module since it was first
# imported are picked up, then build the attribute-importance table.
classifier_utils = importlib.reload(classifier_utils)
results = classifier_utils.get_attributes_importance(
    projects,
    non_features_columns,
)

- Which attributes are the most relevant (according to the information gain) across all projects?

In [5]:
# Order attributes from highest to lowest average information gain and keep at
# most the first 50 rows. Note that this rebinds `results` to the truncated frame.
results = results.sort_values(by='average_information_gain', ascending=False).iloc[:50]
results

Unnamed: 0,attribute,average_information_gain,average_rank
1,left_lines_removed,0.348419,12.3
0,left_lines_added,0.344164,10.0
2,right_lines_added,0.324572,11.5
38,Changed files 2,0.324027,15.35
37,Changed files 1,0.317652,14.75
5,keyword_fix,0.314918,13.75
28,Branching time,0.311556,17.1
35,Commits 1,0.311012,11.3
3,right_lines_removed,0.30795,14.8
14,keyword_use,0.300825,15.45


In [6]:
# Persist the (sorted, truncated) attribute table; index=False keeps the
# DataFrame index out of the CSV file.
results.to_csv(
    path_or_buf='../../data/results/attributes_importance.csv',
    index=False,
)

- Are there any insightful association rules for the top-10 attributes?

In [7]:
import importlib

classifier_utils = importlib.reload(classifier_utils)

# The first ten rows of `results` are taken as the top-10 attributes; this
# relies on `results` already being sorted by descending information gain.
top_10_attributes = results['attribute'].iloc[:10].tolist()

# Parameters forwarded to the rule miner. The report cells describe
# `threshold` as the minimum increase/decrease in chance and `min_occurences`
# as the minimum number of occurrences of a rule.
target_class_name = 'developerdecision'
threshold = 0.5
min_occurences = 10

df_increase, df_decrease = classifier_utils.process_association_rules(
    top_10_attributes,
    projects,
    target_class_name,
    threshold,
    min_occurences,
)

In [8]:
# Report how many rules ended up in df_increase, echoing the threshold and
# min_occurences values that were passed to process_association_rules, then
# render the rule table itself.
print(f"Mined {len(df_increase)} rules with at least {threshold*100:.2f}% increased chance that occur more than {min_occurences} times:")
display(df_increase)

Mined 278 rules with at least 50.00% increased chance that occur more than 10 times:


Unnamed: 0,Rule,Lift,Confidence,Occurrences
2,left_lines_removed=17 => Manual,10.607873,0.976023,692
0,keyword_fix=0 => None,7.288073,0.014981,12
2,right_lines_added=9 => ConcatenationV2V1,7.236271,0.033730,17
0,keyword_use=12 => Combination,5.842355,0.375000,15
0,Changed files 2=13 => Version 2,5.575764,0.847826,39
...,...,...,...,...
29,keyword_fix=8 => ConcatenationV2V1,1.510804,0.007042,24
43,right_lines_removed=14 => Combination,1.506373,0.096689,438
27,keyword_use=10 => Combination,1.503042,0.096475,52
34,Commits 1=9 => ConcatenationV2V1,1.500822,0.006996,18


In [9]:
# Report how many rules ended up in df_decrease, echoing the threshold and
# min_occurences values that were passed to process_association_rules, then
# render the rule table itself.
print(f"Mined {len(df_decrease)} rules with at least {threshold*100:.2f}% decreased chance that occur more than {min_occurences} times:")
display(df_decrease)

Mined 130 rules with at least 50.00% decreased chance that occur more than 10 times:


Unnamed: 0,Rule,Lift,Confidence,Occurrences
1,left_lines_removed=17 => Version 1,0.025370,0.016925,12
3,left_lines_removed=-1 => Version 2,0.050272,0.007644,33
4,left_lines_removed=-1 => Combination,0.050525,0.003243,14
0,keyword_use=0 => Manual,0.094187,0.008666,74
6,left_lines_removed=-1 => Manual,0.100704,0.009266,40
...,...,...,...,...
24,left_lines_added=8 => Version 1,0.486497,0.324561,444
16,keyword_use=1 => ConcatenationV1V2,0.486705,0.008708,18
27,right_lines_removed=17 => Manual,0.492448,0.045310,185
19,Commits 1=2 => Version 1,0.493321,0.329114,104


## Developers attribute relevance

For each project, this section considers the information gain and the attribute rank (according to the information gain) of the developer with the highest information gain.

In [10]:
import importlib

# Reload classifier_utils to pick up any edits to the module, then build the
# per-project developer relevance table.
classifier_utils = importlib.reload(classifier_utils)
developers_relevance = classifier_utils.get_developers_attribute_importance(
    projects,
    non_features_columns,
)

- Which author is the most relevant for each project?
- What about overall? How important are the authors?

In [11]:
# Render the developer relevance table (one row per project).
# NOTE(review): the `author` column holds developer e-mail addresses, so the
# rendered output exposes them -- consider redacting before sharing the notebook.
developers_relevance

Unnamed: 0,project,author,information_gain,rank
0,Ramblurr__Anki-Android,martin.andre@gmail.com,0.175819,29.0
1,apache__directory-server,akarasulu@apache.org,0.215938,30.0
2,android__platform_frameworks_base,jbq@google.com,0.050371,27.0
3,freenet__fred,saces@freenetproject.org,0.122353,29.0
4,alexo__wro4j,thilo@ginkel.com,0.0106,28.0
5,apache__lucene-solr,markrmiller@apache.org,0.09649,23.0
6,getrailo__railo,michael@getrailo.org,0.092394,31.0
7,atlasapi__atlas,sergio.bossa@gmail.com,0.029947,24.0
8,hibernate__hibernate-orm,jverhaeg@redhat.com,0.086512,27.0
9,CloudStack-extras__CloudStack-archive,deepakgarg.iitg@gmail.com,0.099051,34.0


In [12]:
# Persist the developer relevance table; index=False keeps the DataFrame
# index out of the CSV file.
developers_relevance.to_csv(
    path_or_buf='../../data/results/developers_importance.csv',
    index=False,
)

## Language constructs relevance
- What is the most important construct in each project?
- What is the overall average information gain and rank for language constructs (using the top-1 for each project)?

In [13]:
import importlib

# Reload classifier_utils to pick up any edits to the module, then build the
# per-project language-construct relevance table.
classifier_utils = importlib.reload(classifier_utils)
constructs_relevance = classifier_utils.get_constructs_attribute_importance(
    projects,
    non_features_columns,
)

In [14]:
# Render the construct relevance table (one row per project, with the
# construct reported for that project, its information gain and its rank).
constructs_relevance

Unnamed: 0,project,construct,information_gain,rank
0,Ramblurr__Anki-Android,Import,0.033104,46.0
1,apache__directory-server,Blank,0.216272,29.0
2,android__platform_frameworks_base,If statement,0.013492,39.0
3,freenet__fred,Annotation,0.14455,27.0
4,alexo__wro4j,Import,0.125109,1.0
5,apache__lucene-solr,Import,0.066641,30.0
6,getrailo__railo,Import,0.079876,35.0
7,atlasapi__atlas,Import,0.069572,12.0
8,hibernate__hibernate-orm,Import,0.185363,18.0
9,CloudStack-extras__CloudStack-archive,Blank,0.092971,35.0


In [15]:
# Persist the per-project construct table; index=False keeps the DataFrame
# index out of the CSV file.
constructs_relevance.to_csv(
    path_or_buf='../../data/results/constructs_importance_projects.csv',
    index=False,
)

What are the average information gain and rank for each language construct?

In [16]:
import importlib

# Reload classifier_utils to pick up any edits to the module, then build the
# per-construct information-gain table.
classifier_utils = importlib.reload(classifier_utils)
constructs_ig = classifier_utils.get_constructs_information_gain(
    projects,
    non_features_columns,
)

In [17]:
# Show constructs from highest to lowest average information gain. Only the
# displayed copy is sorted; `constructs_ig` itself is left unchanged.
constructs_ig.sort_values(
    by='avg_information_gain',
    ascending=False,
)

Unnamed: 0,construct,avg_information_gain,avg_rank
37,Import,0.057876,33.85
13,Method invocation,0.033857,42.0
17,Variable,0.026682,40.25
12,Comment,0.026347,50.25
38,Blank,0.022051,65.8
9,Method signature,0.021427,50.0
39,Overall,0.018471,154.7725
20,Annotation,0.017675,61.2
34,If statement,0.017594,47.25
4,Attribute,0.014829,53.55


In [18]:
# Persist the per-construct table (in its original, unsorted row order);
# index=False keeps the DataFrame index out of the CSV file.
constructs_ig.to_csv(
    path_or_buf='../../data/results/constructs_ig.csv',
    index=False,
)