In [1]:
import warnings

import numpy as np
import pandas as pd
# `display` belongs to the public IPython.display module; importing it from
# IPython.core.display is deprecated and raises ImportError on newer IPython.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

import classifier_utils

# Render every column when displaying DataFrames.
pd.set_option('display.max_columns', None)
# NOTE(review): this hides all warnings for the whole session, including
# deprecation notices from the libraries above - consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Columns that are not features: chunk identifiers/locations, project and file
# metadata, and the commit hashes involved in the merge.
non_features_columns = [
    # chunk identification and location
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # project / file metadata and commit hashes
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset used throughout the notebook.
selected_dataset = pd.read_csv(
    "../../data/SELECTED_LABELLED_DATASET.csv"
)
# Distinct values of the 'project' column, as a plain Python list.
projects = [*selected_dataset['project'].unique()]

## Attributes relevance

In [4]:
import importlib
# Re-import classifier_utils so that edits made to the module after the kernel
# started are picked up before calling into it.
importlib.reload(classifier_utils)
# Attribute-importance table computed by classifier_utils for the listed
# projects; non_features_columns is passed along (presumably so those columns
# are excluded from the features - confirm in classifier_utils).
results = classifier_utils.get_attributes_importance(projects, non_features_columns)

- Which attributes are the most relevant (according to the information gain) across all projects?

In [5]:
# Keep the 50 rows with the highest average information gain, best first.
# Note that `results` is rebound to this sorted slice for the cells that follow.
results = (
    results
    .sort_values(by='average_information_gain', ascending=False)
    .iloc[:50]
)
results

Unnamed: 0,attribute,average_information_gain,average_rank
0,left_lines_added,1.361994,11.05
35,Commits 1,1.22226,14.3
1,left_lines_removed,1.207105,13.7
20,chunkAbsSize,1.183627,18.75
3,right_lines_removed,1.140743,16.15
13,keyword_remove,1.060914,19.25
14,keyword_use,1.035883,18.75
37,Changed files 1,1.031389,19.1
2,right_lines_added,1.000893,17.25
5,keyword_fix,0.986123,18.05


In [6]:
# Persist the current `results` frame (the sorted top-50 slice once the cell
# above has run) as CSV, without writing the index column.
results.to_csv('../../data/results/attributes_importance.csv', index=False)

- Are there any insightful association rules for the top-10 attributes?

In [7]:
import importlib

# Pick up any edits made to classifier_utils since it was last loaded.
importlib.reload(classifier_utils)

# Settings handed to the association-rule mining helper.
target_class_name = 'developerdecision'
threshold = 0.5
min_occurences = 10

# Names of the first ten rows of `results`; this relies on `results` already
# being sorted by descending average information gain.
top_10_attributes = list(results['attribute'].iloc[:10])

df_increase, df_decrease = classifier_utils.process_association_rules(
    top_10_attributes,
    projects,
    target_class_name,
    threshold,
    min_occurences,
)

In [8]:
# Report how many rules ended up in df_increase, echoing the threshold (as a
# percentage) and the minimum-occurrence setting, then render the table.
print(f"Mined {len(df_increase)} rules with at least {threshold*100:.2f}% increased chance that occur more than {min_occurences} times:")
display(df_increase)

Mined 266 rules with at least 50.00% increased chance that occur more than 10 times:


Unnamed: 0,Rule,Lift,Confidence,Occurrences
2,left_lines_removed=17 => Manual,10.607873,0.976023,692
2,chunkAbsSize=12 => Combination,8.497970,0.545455,12
3,chunkAbsSize=11 => Combination,8.101398,0.520000,13
0,keyword_fix=0 => None,7.288073,0.014981,12
2,right_lines_added=9 => ConcatenationV2V1,7.236271,0.033730,17
...,...,...,...,...
29,keyword_fix=8 => ConcatenationV2V1,1.510804,0.007042,24
43,right_lines_removed=14 => Combination,1.506373,0.096689,438
27,keyword_use=10 => Combination,1.503042,0.096475,52
34,Commits 1=9 => ConcatenationV2V1,1.500822,0.006996,18


In [9]:
# Report how many rules ended up in df_decrease, echoing the threshold (as a
# percentage) and the minimum-occurrence setting, then render the table.
print(f"Mined {len(df_decrease)} rules with at least {threshold*100:.2f}% decreased chance that occur more than {min_occurences} times:")
display(df_decrease)

Mined 123 rules with at least 50.00% decreased chance that occur more than 10 times:


Unnamed: 0,Rule,Lift,Confidence,Occurrences
1,left_lines_removed=17 => Version 1,0.025370,0.016925,12
3,left_lines_removed=-1 => Version 2,0.050272,0.007644,33
4,left_lines_removed=-1 => Combination,0.050525,0.003243,14
1,keyword_remove=1 => Version 2,0.055210,0.008395,84
0,keyword_use=0 => Manual,0.094187,0.008666,74
...,...,...,...,...
16,keyword_use=1 => ConcatenationV1V2,0.486705,0.008708,18
15,keyword_remove=10 => Manual,0.488893,0.044983,13
27,right_lines_removed=17 => Manual,0.492448,0.045310,185
19,Commits 1=2 => Version 1,0.493321,0.329114,104


## Developers attribute relevance

For each project, this analysis considers the information gain and the attribute rank (according to the information gain) of the developer with the highest information gain.

In [10]:
import importlib
# Re-import classifier_utils so that edits made to the module after the kernel
# started are picked up before calling into it.
importlib.reload(classifier_utils)
# Developer (author) attribute importance computed by classifier_utils for the
# listed projects; non_features_columns is passed along to the helper.
developers_relevance = classifier_utils.get_developers_attribute_importance(projects, non_features_columns)

- Which author is the most relevant for each project?
- What about overall? How important are the authors?

In [11]:
# Render the developer relevance table as the cell output.
developers_relevance

Unnamed: 0,project,author,information_gain,rank
0,Ramblurr__Anki-Android,martin.andre@gmail.com,0.95469,36.0
1,apache__directory-server,elecharny@apache.org,1.052,7.0
2,android__platform_frameworks_base,initial-contribution@android.com,0.289087,34.0
3,freenet__fred,toad@amphibian.dyndns.org,2.007149,2.0
4,alexo__wro4j,alex.objelean@gmail.com,2.091704,1.0
5,apache__lucene-solr,mikemccand@apache.org,1.335695,19.0
6,getrailo__railo,michael@getrailo.org,1.588456,5.0
7,atlasapi__atlas,fred@metabroadcast.com,1.099141,8.0
8,hibernate__hibernate-orm,steve@hibernate.org,2.069798,10.0
9,CloudStack-extras__CloudStack-archive,alena@cloud.com,0.921129,35.0


In [12]:
# Persist the developer relevance table as CSV, without writing the index column.
developers_relevance.to_csv('../../data/results/developers_importance.csv', index=False)

## Language constructs relevance
- What is the most important construct in each project?
- What is the overall average information gain and rank for language constructs (using the top-1 for each project)?

In [13]:
import importlib
# Re-import classifier_utils so that edits made to the module after the kernel
# started are picked up before calling into it.
importlib.reload(classifier_utils)
# Language-construct attribute importance computed by classifier_utils for the
# listed projects; non_features_columns is passed along to the helper.
constructs_relevance = classifier_utils.get_constructs_attribute_importance(projects, non_features_columns)

In [14]:
# Render the per-project construct relevance table as the cell output.
constructs_relevance

Unnamed: 0,project,construct,information_gain,rank
0,Ramblurr__Anki-Android,Method invocation,1.309414,28.0
1,apache__directory-server,Method invocation,0.417061,34.0
2,android__platform_frameworks_base,Method invocation,0.820339,26.0
3,freenet__fred,Method invocation,1.160266,27.0
4,alexo__wro4j,Method invocation,1.061447,13.0
5,apache__lucene-solr,Method invocation,1.397565,18.0
6,getrailo__railo,Method invocation,0.953301,20.0
7,atlasapi__atlas,Method invocation,1.198229,6.0
8,hibernate__hibernate-orm,Import,1.260123,23.0
9,CloudStack-extras__CloudStack-archive,Method invocation,0.936753,34.0


In [15]:
# Persist the construct relevance table as CSV, without writing the index column.
constructs_relevance.to_csv('../../data/results/constructs_importance_projects.csv', index=False)

What is the average information gain and rank for each language construct?

In [16]:
import importlib
# Re-import classifier_utils so that edits made to the module after the kernel
# started are picked up before calling into it.
importlib.reload(classifier_utils)
# Information-gain table for language constructs computed by classifier_utils
# for the listed projects; non_features_columns is passed along to the helper.
constructs_ig = classifier_utils.get_constructs_information_gain(projects, non_features_columns)

In [17]:
# Show the constructs ordered by descending average information gain. The
# sorted copy is only displayed; constructs_ig itself is not reassigned.
constructs_ig.sort_values('avg_information_gain', ascending=False)

Unnamed: 0,construct,avg_information_gain,avg_rank
13,Method invocation,0.746177,26.65
17,Variable,0.424312,34.35
12,Comment,0.382789,33.65
34,If statement,0.33118,38.1
4,Attribute,0.310743,38.8
37,Import,0.280483,40.4
9,Method signature,0.251188,39.5
39,Overall,0.182639,148.5625
35,Method declaration,0.160039,43.9
20,Annotation,0.132506,50.7


In [18]:
# Persist constructs_ig (in its original, unsorted row order) as CSV, without
# writing the index column.
constructs_ig.to_csv('../../data/results/constructs_ig.csv', index=False)