In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from IPython.display import HTML, display
import tabulate
import re
import seaborn as sns
sns.set_theme(style="whitegrid")

In [None]:
api_data = pd.read_csv("../../data/api_data.csv")

print(f"CSV has {len(api_data)} projects.")

In [None]:
api_data

In [None]:
conflicting_merges_per_project = pd.read_csv("../../data/number_conflicting_merges_project_selected.csv")
conflicting_merges_per_project

In [None]:
merges_per_project = pd.read_csv("../../data/number_merges_project_selected.csv")
merges_per_project

In [None]:
# Combine total and conflicting merge counts per project. The merge produces
# `id_x`/`id_y`; only the left-hand one is kept, under its plain name.
merges_csv = (
    merges_per_project
    .merge(conflicting_merges_per_project, on='project', how='inner')
    .drop(columns=['id_y'])
    .rename(columns={'id_x': 'id'})
)
merges_csv

In [None]:
selected_dataset_2 = pd.read_csv("../../data/selected_dataset_2.csv")
selected_dataset_2

In [None]:
selected_dataset_2.columns

In [None]:
# Number of non-null `chunk_id` entries per project in the selected dataset.
chunks_per_project = selected_dataset_2.groupby('project')['chunk_id'].count()
chunks_per_project

In [None]:
# Inner-join the API metadata with the per-project chunk counts so that the
# `chunks` value stored in api_data can be inspected next to the `chunk_id`
# count computed from the selected dataset.
join_api_selected = pd.merge(api_data, chunks_per_project, on='project', how='inner')
join_api_selected[['project_actual_ownername','chunks', 'chunk_id']]

## Characteristics of the Projects

In [None]:
# Keep only the projects whose `chunks` value is at least 1000.
has_enough_chunks = api_data['chunks'] >= 1000
selected_projects = api_data[has_enough_chunks]
selected_projects

In [None]:
# Attach the API metadata of the selected projects to the merge counts.
# The merge produces `id_x`/`id_y`; the one coming from merges_csv is kept as `id`.
join_api_merges_csv = (
    merges_csv
    .merge(selected_projects, on='project', how='inner')
    .drop(columns=['id_y'])
    .rename(columns={'id_x': 'id'})
)

join_api_merges_csv

### Projects that are not forks (from the 29 initial projects)

In [None]:
# Hand-maintained list of (project, flag) pairs; 20 entries carry flag 0 and
# 3 carry flag 1. The flag ends up in a column named 'NA'.
# NOTE(review): the meaning of the flag is not stated in this notebook — confirm
# what 0/1 encode before relying on it.
projs = [("freenet/fred", 0),
("apache/directory-server", 0),
("Ramblurr/Anki-Android", 0),
("alexo/wro4j", 0),
("getrailo/railo", 0),
("atlasapi/atlas", 0),
("hibernate/hibernate-orm", 0),
("TeamDev-Ltd/OpenFaces", 0),
("CloudStack-extras/CloudStack-archive", 0),
("zkoss/zk", 0),
("eucalyptus/eucalyptus", 0),
("alkacon/opencms-core", 0),
("SINTEF-9012/ThingML", 0),
("Unidata/thredds", 0),
("apache/accumulo", 0),
("jgralab/jgralab", 0),
("sebastianbenz/Jnario", 0),
("CCI-MIT/XCoLab", 0),
("apache/lucene-solr", 0),
("android/platform_frameworks_base", 0),
("elastic/elasticsearch", 1),
("eclipse/jetty.project", 1),
("revolsys/com.revolsys.open", 1)]

# One row per pair, with columns 'project' and 'NA'.
new_projs = pd.DataFrame.from_records(projs, columns=['project', 'NA'])
new_projs

In [None]:
projects23 = pd.merge(join_api_merges_csv, new_projs, on='project', how='inner')
projects23

In [None]:
# Source column -> display name, in the order the columns appear in the table.
column_labels = {
    "id": "id",
    "project_actual_ownername": "Project",
    "chunks": "Chunks",
    "nr_merges": "Merges",
    "nr_conflicting_merges": "Conflicting Merges",
    "stargazerCount": "Stars",
    "commits": "Commits",
    "contributors": "Developers",
    "branches": "Branches",
}
# Columns that are cast to int for display.
int_columns = ["Chunks", "Stars", "Commits", "Developers", "Branches"]

selected_projects_table = (
    projects23[list(column_labels)]
    .rename(columns=column_labels)
    .astype({name: int for name in int_columns})
    # Largest projects (by number of chunks) first.
    .sort_values(by=['Chunks'], ascending=False)
)
selected_projects_table

### Fixing columns "Commits" and "Developers" to show data from Ghiotto's database

In [None]:
# Load the per-project statistics used below to replace the Commits,
# Developers and Stars columns.
statistics_bd_project_selected = pd.read_csv("../../data/statistics_bd_project_selected.csv")

# fixing project with wrong value
# Overwrites `commits` for the row labelled 0 with a hard-coded number.
# NOTE(review): the row is addressed by index label, not by project name, so a
# change in the CSV row order would silently patch a different project —
# confirm that row 0 is the intended one.
statistics_bd_project_selected.at[0, 'commits'] = 246462

statistics_bd_project_selected

In [None]:
commits_devs_from_BD = statistics_bd_project_selected[["id", "commits", "developers", "stars"]]
commits_devs_from_BD

In [None]:
# Swap the API-derived Stars/Commits/Developers columns for the values held in
# commits_devs_from_BD (rows matched on `id`); `id` and Branches are dropped.
api_columns_to_discard = ['id', 'Branches', 'Developers', 'Commits', 'Stars']
db_column_labels = {"developers": "Developers",
                    "commits": "Commits",
                    "stars": "Stars"}

table_projects = (
    selected_projects_table
    .merge(commits_devs_from_BD, on='id', how='inner')
    .drop(columns=api_columns_to_discard)
    .rename(columns=db_column_labels)
    # Case-insensitive alphabetical order by project name.
    .sort_values(by=['Project'], key=lambda names: names.str.lower())
)
table_projects

In [None]:
table_projects.describe()

In [None]:
# pd.merge(table_projects, new_projs, on='Project', how='outer')

In [None]:
# Column totals over all projects, displayed as a one-row frame labelled 'Total'.
summed_columns = ['Chunks', 'Merges', 'Conflicting Merges', 'Commits', 'Developers', 'Stars']

projects_stats = table_projects.agg({column: 'sum' for column in summed_columns})
projects_stats['Project'] = 'Total'

projects_stats.to_frame().T

In [None]:
# Add a 'Total' row holding the sum of every numeric column.
# DataFrame.append was removed in pandas 2.0, so the totals are turned into a
# one-row frame (index label 'Total') and concatenated instead.
totals_row = table_projects.sum(numeric_only=True).rename('Total').to_frame().T
table_with_stats = pd.concat([table_projects, totals_row])

# The totals row has no project name; label it explicitly.
table_with_stats.at['Total', 'Project'] = 'Total'

table_with_stats = table_with_stats[['Project', 'Stars', 'Developers', 'Commits', 'Merges', 'Conflicting Merges', 'Chunks']]
table_with_stats

In [None]:
# boxplot = table_projects.boxplot(rot=45, grid=False, column=['Chunks', 'Merges', 'Conflicting Merges', 'Commits', 'Developers', 'Stars'])

### LaTeX output

In [None]:
# Save the table (index not written) and print its LaTeX rendering with floats
# formatted with thousands separators and no decimals.
thousands_no_decimals = "{:,.0f}".format

table_with_stats.to_csv('../../data/selected_projects_statistics.csv', index=None)
with pd.option_context("max_colwidth", None):
    projects_latex = table_with_stats.to_latex(index=False, float_format=thousands_no_decimals)
    print(projects_latex)

## Attributes

In [None]:
attributes = pd.read_csv("../../data/attributes.csv")
attributes

In [None]:
attributes_used = attributes[attributes["Name in the dataset"].notna()]
print(f"{attributes_used.shape[0]} atributes used at the dataset.") 

In [None]:
attributes_used

In [None]:
# Columns of the attributes table, ordered by category and then scope, with
# the two LaTeX-facing columns renamed for the paper.
attrs_columns = ['Category', 'Attribute', 'Type', 'Scope', 'Details', 'Ref_latex']
attrs_table = (
    attributes_used[attrs_columns]
    .sort_values(by=['Category', 'Scope'])
    .rename(columns={'Details': 'Description', 'Ref_latex': 'References'})
)
attrs_table

In [None]:
# Quick check of the citation clean-up used when printing the attributes
# table: replacing `cite\` with `\cite` consumes the backslash that precedes
# the opening brace. Raw strings keep the backslashes literal instead of
# relying on invalid escape sequences (`\{`, `\_`, `\c`), which Python
# reports as deprecated.
t = r"cite\{menezes\_what\_2020\}"
print(t.replace("cite\\", r"\cite"))

In [None]:
with pd.option_context("max_colwidth", None):
    # Raw strings: `\l` is not a valid Python escape sequence. The format is
    # built by literal concatenation rather than a backslash-newline inside the
    # string, which used to drag the source indentation into the value (the
    # printed output is unchanged because runs of blanks are collapsed below).
    column_format = (r"p{0.08\linewidth}p{0.2\linewidth} p{0.15\linewidth}"
                     r" p{0.04\linewidth}p{0.30\linewidth}p{0.1\linewidth}")
    txt = attrs_table.to_latex(index=False,
                               float_format="{:,.0f}".format,
                               column_format=column_format)
    # Post-process the generated LaTeX: collapse runs of blanks, turn the
    # `cite\{...\}` text back into a `\cite{...}` command, unescape
    # underscores, and drop "NaN " and "\textbackslash " fragments.
    x = re.sub("[ \t]+", " ", txt)\
          .replace("cite\\", r"\cite")\
          .replace("\\}", "}")\
          .replace("\\_", "_")\
          .replace("NaN ", "")\
          .replace("\\textbackslash ", "")

    print(x)

# Experiment results summary

In [None]:
summary = pd.read_csv('../../data/results/experiment_results.csv')
def reorder_projects(original_df, second_df):
    """Return the rows of ``second_df`` in the project order of ``original_df``.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Must have a 'Project' column; its order defines the output order.
        Entries equal to 'Total' are ignored (and need not be present).
    second_df : pandas.DataFrame
        Must have a 'project' column and a row whose project is 'Overall'.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``second_df``. Contains one row for every project of
        ``original_df`` that matches exactly one row of ``second_df``
        (projects with no match or several matches are skipped), followed by
        the 'Overall' row. Rows keep their index labels from ``second_df``.

    Raises
    ------
    IndexError
        If ``second_df`` has no 'Overall' row.
    """
    ordered_rows = []
    for project in original_df['Project']:
        # Skip the totals row instead of list.remove('Total'), which raised
        # ValueError whenever the table had no such row.
        if project == 'Total':
            continue
        matches = second_df[second_df['project'] == project]
        if len(matches) == 1:
            ordered_rows.append(matches.iloc[0])
    # The aggregate row always goes last; callers split it off by position.
    ordered_rows.append(second_df[second_df['project'] == 'Overall'].iloc[0])
    return pd.DataFrame(ordered_rows, columns=second_df.columns)

def get_new_project_name(project_name):
    """Map a project name to its `project_new_ownername` from ``api_data``.

    The lookup only applies when exactly one row of the module-level
    ``api_data`` frame has this `project` value and its
    `project_new_ownername` is not null; in every other case the name is
    returned unchanged.
    """
    candidates = api_data[api_data['project'] == project_name]
    if len(candidates) != 1:
        return project_name
    renamed = candidates.iloc[0]['project_new_ownername']
    return project_name if pd.isnull(renamed) else renamed

summary['project'] = summary['project'].apply(get_new_project_name)
summary

In [None]:
# The last row of `summary` is treated as the overall row: it stays at the
# bottom while the remaining rows are sorted by normalized improvement,
# highest first.
summary_columns = ['project', 'norm._improv.', 'accuracy_cv', 'baseline',
                   'accuracy_test', 'precision', 'recall', 'f1-score']

overall_row = summary.iloc[-1:]
summary_wo_overall = summary.iloc[0:-1]
summary_wo_overall_sorted = summary_wo_overall.sort_values('norm._improv.', ascending=False)
summary_concat = pd.concat([summary_wo_overall_sorted, overall_row])

summary = summary_concat[summary_columns]
summary

In [None]:
# Render the summary as LaTeX with floats formatted to three decimals, then
# collapse runs of spaces/tabs so the printed table is compact.
latex_string = summary.to_latex(index=False, float_format="{:,.3f}".format)
x = re.sub("[ \t]+", " ", latex_string)
print(x)

# Experiment results by class

In [None]:
import re
by_class = pd.read_csv('../../data/results/experiment_by_class.csv')
by_class = by_class.round(3)
def fix_model_name(model_name):
    """Return the display label for a raw model name.

    * Names containing 'baseline_' have the text from the first 'baseline'
      up to the end of the line replaced by 'Baseline'.
    * 'random forest' is capitalised as 'Random Forest'.
    * Any other name is returned unchanged. (The function used to fall off
      the end here and return None, blanking the `model` column for such
      rows.)
    """
    if 'baseline_' in model_name:
        return re.sub('baseline_*.*', 'Baseline', model_name)
    if 'random forest' in model_name:
        return model_name.replace('random forest', 'Random Forest')
    return model_name
by_class['model'] = by_class['model'].apply(fix_model_name)
by_class

In [None]:
print(by_class.to_latex(index=False))

# Top-10 attribute importance

In [None]:
attributes_importance = pd.read_csv('../../data/results/attributes_importance.csv').head(10)

In [None]:
# Add the scope of each attribute to the top-10 importance table. The merge
# has no explicit key, so pandas joins on the columns the two frames share.
# NOTE(review): presumably that is only `attribute` — confirm against the CSVs.
attributes_scope = pd.read_csv('../../data/results/attributes_scope.csv')
attributes_importance = attributes_importance.merge(attributes_scope)[['attribute', 'scope', 'average_information_gain', 'average_rank']].round(2)
attributes_importance

In [None]:
print(attributes_importance.to_latex(index=False))

# Top-10 Language Constructs information gain

In [None]:
# Ten language constructs with the highest average information gain (the
# 'Overall' row is excluded), rounded to two decimals and given display names.
constructs_ig = pd.read_csv('../../data/results/constructs_ig.csv')
is_real_construct = constructs_ig['construct'] != 'Overall'

constructs_top10 = (
    constructs_ig[is_real_construct]
    .sort_values('avg_information_gain', ascending=False)
    .head(10)
    .round(2)
    .rename(columns={'construct': 'Language construct',
                     'avg_information_gain': 'Avg. Information gain',
                     'avg_rank': 'Avg. Rank'})
)
constructs_top10

In [None]:
print(constructs_top10.to_latex(index=False))

# ============ Debug ===========

In [None]:
df = pd.read_csv('../../data/selected_dataset_2.csv')
df

In [None]:
# Total number of missing cells (summed over all columns) for each project.
df.groupby('project').apply(\
        lambda df_x: (df_x.isna().sum().sum()))

In [None]:
# Same count after dropping every row that contains a missing value; each
# project that still has rows should therefore report 0.
df.dropna()\
  .groupby('project').apply(\
        lambda df_x: (df_x.isna().sum().sum()))

In [None]:
df_na = df.isna().sum()
df_na

In [None]:
df_na[df_na != 0]

In [None]:
df.dropna()

In [None]:
df_clean_count = df.dropna().groupby('project').count()
df_clean_count

In [None]:
# Keep the projects that still have at least 10 complete rows (counted via
# `chunk_id`) after dropping rows with missing values.
# NOTE(review): the threshold is 10, but the list built from this frame is
# named `projects_20` — confirm which number is intended.
df_clean_count_big = df_clean_count[df_clean_count.chunk_id >= 10]
df_clean_count_big

In [None]:
projects_20 = list(df_clean_count_big.index.values)
projects_20

In [None]:
df.groupby('project').count()

In [None]:
df.dropna().groupby('project').count()

In [None]:
all_constructors = ['Class declaration', 'Return statement', 'Array access', 'Cast expression', 
                            'Attribute', 'Array initializer', 'Do statement', 'Case statement', 'Other', 'Method signature', 'Break statement',
                            'TypeDeclarationStatement', 'Comment', 'Method invocation', 'Package declaration', 'While statement', 
                            'Interface signature', 'Variable', 'Enum value', 'Class signature', 'Annotation', 'Method interface',
                            'Interface declaration', 'Synchronized statement', 'Throw statement', 'Switch statement', 'Catch clause',
                            'Try statement', 'Annotation declaration', 'For statement', 'Enum declaration', 'Enum signature', 'Assert statement',
                            'Static initializer', 'If statement', 'Method declaration', 'Continue statement', 'Import', 'Blank']

In [None]:
proj_constr = ['project', 'developerdecision'] + all_constructors

df_constructs = df[df.project.isin(projects_20)]\
                  .dropna()\
                  .filter(items=proj_constr)
df_constructs

In [None]:
df_constructs[all_constructors]

In [None]:
# Column-wise apply that returns every column unchanged unless the column is
# named 'x' or 'y'.
# NOTE(review): `classifi` is not defined in the visible cells; it is only
# evaluated for a column named 'x' or 'y', so this looks like an unfinished
# draft — confirm whether the cell is still needed.
df_constructs.apply(lambda x: classifi if x.name in ['x', 'y'] else x)

In [None]:
# FIXME: unfinished draft, disabled so the notebook can run top to bottom.
# The original statement could not be parsed (the `.apply(` call was never
# closed) and it referenced names that are not defined in the visible cells
# (`entropy`, `attrib`, `y_attrib`). Define those and re-enable the statement
# before relying on per-project entropy / information gain.
#
# df[df.project.isin(projects_20)]\
#   .dropna()\
#   .groupby('project').apply(\
#         lambda df_x: pd.Series({'information gain': df_x,
#                                 'entropy': entropy(df_x, attrib, y_attrib)}))

### Frequency of language constructs on all projects.

In [None]:
selected_dataset_2

In [None]:
selected_dataset_2[['Import', 'developerdecision']]

In [None]:
selected_dataset_2[['Import', 'developerdecision']].groupby('Import').count()

Distribution of 'Import' and 'developer decision'.

In [None]:
selected_dataset_2[['Import', 'developerdecision']].apply(lambda x: x.value_counts(normalize=True) * 100)
# .count()

Distribution of classes, grouped by values of Import (0 and 1).

The value 1 means that the chunk has an Import construct.

In [None]:
# For each value of `Import`, the percentage of that group's rows falling in
# each `developerdecision` class: per-class counts * 100 divided by the
# number of rows in the group.
import_dist = selected_dataset_2[['Import', 'developerdecision']].groupby(['Import']).apply(\
        lambda df_x: (df_x.groupby(['developerdecision']).count() * 100) / df_x.shape[0])
import_dist

In [None]:
import_dist.unstack()

In [None]:
import_dist.unstack().plot(kind='bar', figsize=(10,8))

Distribution of classes on the whole dataset.

In [None]:
# Percentage of each `developerdecision` class over the whole dataset,
# computed from the per-class count of non-null `Import` values.
class_counts = selected_dataset_2[['Import', 'developerdecision']].groupby(['developerdecision']).count()
dataset_dist = class_counts.apply(lambda counts: counts * 100 / counts.sum())
dataset_dist

In [None]:
dataset_dist.plot(kind='bar', figsize=(10,8))

In [None]:
selected_dataset_2.columns

In [None]:
# Names of the language-construct columns of the dataset.
# NOTE(review): this repeats, entry for entry, the `all_constructors` list
# already defined in the Debug section; keep the two in sync or define it once.
all_constructors = ['Class declaration', 'Return statement', 'Array access', 'Cast expression', 
                            'Attribute', 'Array initializer', 'Do statement', 'Case statement', 'Other', 'Method signature', 'Break statement',
                            'TypeDeclarationStatement', 'Comment', 'Method invocation', 'Package declaration', 'While statement', 
                            'Interface signature', 'Variable', 'Enum value', 'Class signature', 'Annotation', 'Method interface',
                            'Interface declaration', 'Synchronized statement', 'Throw statement', 'Switch statement', 'Catch clause',
                            'Try statement', 'Annotation declaration', 'For statement', 'Enum declaration', 'Enum signature', 'Assert statement',
                            'Static initializer', 'If statement', 'Method declaration', 'Continue statement', 'Import', 'Blank']

In [None]:
selected_dataset_2[all_constructors]

In [None]:
# Relative frequency (in %) of every value of each construct column; after the
# transpose there is one row per construct and one column per value. Rows are
# ordered by the share of value 1, highest first.
constructs_freq = selected_dataset_2[all_constructors].apply(lambda s: s.value_counts(normalize=True)*100).T\
  .sort_values(by=[1], ascending=False)
constructs_freq

In [None]:
# Add to the top-10 constructs the percentage of rows in which each construct
# has value 1 (column 1 of constructs_freq); the column for value 0 is dropped.
constructs_top10.set_index('Language construct')\
  .join(constructs_freq)\
  .drop(columns=[0])\
  .rename(columns={1:'Frequency %'})

# Language construct top-1 per project

In [None]:
constructs_per_project = pd.read_csv('../../data/results/constructs_importance_projects.csv')
def fix_project_name(project_name):
    """Return ``project_name`` with every '__' replaced by '/'."""
    return '/'.join(project_name.split('__'))
# Normalise the project names ('__' -> '/', then the renamed owner/name where
# api_data has one) so they match the names used in table_with_stats.
constructs_per_project['project'] = (
    constructs_per_project['project']
    .apply(fix_project_name)
    .apply(get_new_project_name)
)
constructs_per_project = reorder_projects(table_with_stats, constructs_per_project).reset_index(drop=True)
constructs_per_project['information_gain'] = constructs_per_project['information_gain'].round(3)

# reorder_projects puts the 'Overall' row last: keep it at the bottom and sort
# the per-project rows by information gain, highest first.
overall_row = constructs_per_project.iloc[-1:, :]
data_rows = constructs_per_project.iloc[:-1, :].sort_values(by=['information_gain'], ascending=False)
constructs_per_project = pd.concat([data_rows, overall_row])
constructs_per_project

In [None]:
# def fix_big_construct_name(construct_name):
#     if construct_name == 'Method invocation':
#         part1 = construct_name.split()[0]
#         part2 = construct_name.split()[1]
#         return "\\begin{tabular}[l]{@{}c@{}}" + part1 + "\\\\" + part2 + "\end{tabular}"
#     return construct_name
# constructs_per_project['construct'] = constructs_per_project['construct'].apply(fix_big_construct_name)
# constructs_per_project

In [None]:
# Display names for the LaTeX table; Rank is shown as an integer.
construct_labels = {'project': 'Project',
                    'construct': 'Construct',
                    'information_gain': 'Information gain',
                    'rank': 'Rank'}

constructs = constructs_per_project.rename(columns=construct_labels).astype({'Rank': int})

constructs_latex = constructs.to_latex(index=False,
                                       float_format="{:,.3f}".format,
                                       column_format="p{5cm}p{3cm}p{2cm}p{1cm}")
print(constructs_latex)

# Developer importance per project

In [None]:
def is_authorship_feature(feature_name):
    """Return True when ``feature_name`` contains a '.' or an '@', else False.

    The negative case used to fall off the end of the function and return
    None; it now returns an explicit False, so the result is always a bool.
    """
    return '.' in feature_name or '@' in feature_name

In [None]:
developers_per_project = pd.read_csv('../../data/results/developers_importance.csv')

# Normalise the project names ('__' -> '/', then the renamed owner/name where
# api_data has one) so they match the names used in table_with_stats.
developers_per_project['project'] = (
    developers_per_project['project']
    .apply(fix_project_name)
    .apply(get_new_project_name)
)
developers_per_project = reorder_projects(table_with_stats, developers_per_project).reset_index(drop=True)
developers_per_project['information_gain'] = developers_per_project['information_gain'].round(3)
# The author column is dropped from the table.
developers_per_project = developers_per_project.drop(columns=['author'])
developers_per_project['rank'] = developers_per_project['rank'].astype(int)

# reorder_projects puts the 'Overall' row last: keep it at the bottom and sort
# the per-project rows by information gain, highest first.
overall_row = developers_per_project.iloc[-1:, :]
data_rows = developers_per_project.iloc[:-1, :].sort_values(by=['information_gain'], ascending=False)
developers_per_project = pd.concat([data_rows, overall_row])
developers_per_project

In [None]:
print(developers_per_project.to_latex(index=False))

In [None]:
cci_training = pd.read_csv("../../data/projects/CCI-MIT__XCoLab-training.csv")
cci_training

In [None]:
cci_test = pd.read_csv("../../data/projects/CCI-MIT__XCoLab-test.csv")
cci_test

In [None]:
cci = pd.concat([cci_training, cci_test])
cci

In [None]:
# Keep the class column plus every column whose name contains an '@' followed
# later by a '.' (the pattern of an e-mail address). Raw string: `\.` is a
# regex escape, not a valid Python string escape.
cci_devs = cci.filter(regex=r"developerdecision|.*@.*\..*")
cci_devs

In [None]:
cci_devs.apply(lambda x: x.value_counts(normalize=True) * 100)

In [None]:
cci_devs[['developerdecision', 'steverab93@gmail.com']].groupby('steverab93@gmail.com').count()

In [None]:
# For each value of the 'steverab93@gmail.com' column, the percentage of that
# group's rows falling in each `developerdecision` class.
steverab93_dist = cci_devs[['steverab93@gmail.com', 'developerdecision']].groupby(['steverab93@gmail.com']).apply(\
        lambda df_x: (df_x.groupby(['developerdecision']).count() * 100) / df_x.shape[0])
steverab93_dist

In [None]:
def dev_dist(project, dev_email):
    """Show how the rows of one developer column are distributed over the classes.

    Reads ``../../data/projects/{project}-training.csv`` and
    ``../../data/projects/{project}-test.csv``, concatenates them, and keeps
    `developerdecision` plus the e-mail-like columns.

    Parameters
    ----------
    project : str
        File-name prefix of the project, e.g. 'CCI-MIT__XCoLab'.
    dev_email : str
        Name of the developer column to analyse.
        NOTE(review): assumed to be a 0/1 indicator — confirm against the data.

    Returns
    -------
    pandas.DataFrame
        One row per value of the developer column; the columns are a
        MultiIndex ``(dev_email, developerdecision)`` holding the percentage
        of that row's chunks in each class.

    Side effects
    ------------
    Prints how many rows have value 1 in the developer column, out of the rows
    with value 0 or 1, and the corresponding percentage.
    """
    training = pd.read_csv(f"../../data/projects/{project}-training.csv")
    test = pd.read_csv(f"../../data/projects/{project}-test.csv")
    proj = pd.concat([training, test])
    # Raw string: `\.` is a regex escape, not a valid Python string escape.
    devs = proj.filter(regex=r"developerdecision|.*@.*\..*")

    df_count = devs[['developerdecision', dev_email]].groupby(dev_email).count()
    # .get() instead of [0] / [1]: a column holding only one of the two values
    # used to raise KeyError here.
    count0 = df_count['developerdecision'].get(0, 0)
    count1 = df_count['developerdecision'].get(1, 0)
    total = count0 + count1
    percentage = count1 / total if total else 0.0
    print(f"Chunks of {dev_email}: {count1} from {count0 + count1}.\n{round(percentage*100,2)}% of the chunks")

    # Per-class share inside each group of the developer column. Computed from
    # group sizes rather than a nested groupby-apply, so the result does not
    # depend on whether pandas passes the grouping column to the applied function.
    pair = devs[[dev_email, 'developerdecision']]
    class_counts = pair.groupby([dev_email, 'developerdecision']).size()
    group_sizes = pair.groupby(dev_email).size()
    shares = (class_counts * 100).div(group_sizes, level=0)
    return shares.to_frame(dev_email).unstack()

In [None]:
dev_dist('CCI-MIT__XCoLab', 'steverab93@gmail.com')

In [None]:
dev_dist('apache__directory-server', 'akarasulu@apache.org')

In [None]:
dev_dist('Ramblurr__Anki-Android', 'martin.andre@gmail.com')