The goal of this notebook is to select a sample of chunks from the dataset whose resolutions we will analyze manually.
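To determine the sample size, we used Cochran’s sample size formula (Israel 1992), $n_0 = z^2 \, p \, (1 - p) / e^2$, with maximum variability ($p = 0.5$), a confidence level of 99% ($z \approx 2.576$), and a margin of error of 10% ($e = 0.1$). This gives $n_0 \approx 165.9$, so the calculated sample size is 166 chunks.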

The sample size was computed with this calculator: https://www.socscistatistics.com/tests/samplesize/default.aspx

Israel GD (1992) Determining sample size. Tech. rep., University of Florida

In [77]:
import json
import os
from math import sqrt

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML

import analysis_util

In [154]:
# Folder that holds every input file read by this notebook.
# Set the DATA_FOLDER environment variable to point at your own copy of the
# data; when it is unset, the original local path is used so existing runs
# behave exactly as before.
data_folder = os.environ.get("DATA_FOLDER", "/Users/heleno/Documents/data")
# analysis_util exposes a module-level data_folder setting; presumably its
# helpers read their inputs from it (TODO confirm in analysis_util).
analysis_util.data_folder = data_folder
# Number of chunks to sample for manual analysis (Cochran's formula with
# p = 0.5, 99% confidence and a 10% margin of error).
SAMPLE_SIZE = 166

In [155]:
# Load dataset_attributes.csv from the data folder.
# NOTE(review): no cell visible in this notebook reads all_chunks_attributes.
attributes_csv = f'{data_folder}/dataset_attributes.csv'
all_chunks_attributes = pd.read_csv(attributes_csv)

In [156]:
# Load the resolution composition table, report how many rows it has,
# and preview the first rows.
composition_csv = f'{data_folder}/resolution_composition.csv'
df = pd.read_csv(composition_csv)
print(df.shape[0])
df.head()

13573


Unnamed: 0,chunk_id,v1_percentage,v2_percentage,normalized_v1_percentage,normalized_v2_percentage,intersection_percentage,chunk_composition,missing_v1_lines,missing_v2_lines,missing_v1_lines_perc,missing_v2_lines_perc,v1_size,v2_size,chunk_size_delta
0,776662,0.42,0.92,0.25,0.75,0.33,v1 v2 (v1_2) v1 (v1_2) v1 v2,1,1,16.67,8.33,6,12,6.0
1,776782,0.57,0.75,0.41,0.59,0.32,v2 v1 (v1_2) v1 (v1_2) v1 (v1_2) v2 (v1_2) v2...,2,4,6.06,7.55,33,53,20.0
2,776793,0.75,0.25,0.75,0.25,0.0,v2 v1,1,0,25.0,0.0,4,1,-3.0
3,776828,0.33,0.67,0.33,0.67,0.0,v2 v1 v2,2,0,66.67,0.0,3,2,-1.0
4,776863,0.25,0.75,0.25,0.75,0.0,v2 v1,0,1,0.0,25.0,1,4,3.0


In [157]:
# Attach the partial-order results to each chunk. The inner join keeps only
# chunk_ids present in both tables.
partial_order_csv = f'{data_folder}/partial_order_result.csv'
df_po = pd.read_csv(partial_order_csv)
# suffixes=('', '') appends nothing to column names, so the two tables are
# expected to share no column other than chunk_id.
df = df.merge(df_po, how='inner', on='chunk_id', suffixes=('', ''))
df.head()

Unnamed: 0,chunk_id,v1_percentage,v2_percentage,normalized_v1_percentage,normalized_v2_percentage,intersection_percentage,chunk_composition,missing_v1_lines,missing_v2_lines,missing_v1_lines_perc,missing_v2_lines_perc,v1_size,v2_size,chunk_size_delta,partial_order,chunk_size,resolution_size
0,776662,0.42,0.92,0.25,0.75,0.33,v1 v2 (v1_2) v1 (v1_2) v1 v2,1,1,16.67,8.33,6,12,6.0,True,18,12
1,776782,0.57,0.75,0.41,0.59,0.32,v2 v1 (v1_2) v1 (v1_2) v1 (v1_2) v2 (v1_2) v2...,2,4,6.06,7.55,33,53,20.0,True,113,101
2,776793,0.75,0.25,0.75,0.25,0.0,v2 v1,1,0,25.0,0.0,4,1,-3.0,True,6,5
3,776828,0.33,0.67,0.33,0.67,0.0,v2 v1 v2,2,0,66.67,0.0,3,2,-1.0,True,5,3
4,776863,0.25,0.75,0.25,0.75,0.0,v2 v1,0,1,0.0,25.0,1,4,3.0,True,6,5


In [158]:
# Drop every chunk whose id is listed in malformed_chunks.csv.
malformed_chunks = pd.read_csv(f'{data_folder}/malformed_chunks.csv')['chunk_id'].unique()
print(f'Removing {len(malformed_chunks)} malformed chunks')
is_malformed = df['chunk_id'].isin(malformed_chunks)
df = df.loc[~is_malformed]
print(f'Total number of chunks: {len(df)}')

Removing 264 malformed chunks
Total number of chunks: 13309


In [159]:
# Remove imprecise resolutions: rows whose v1_percentage equals -1 are
# dropped (presumably -1 is the marker for "could not be computed").
has_precise_resolution = df['v1_percentage'].ne(-1)
df = df.loc[has_precise_resolution]
print(df.shape[0])

10726


In [160]:
# Remove chunks from projects flagged as implicit forks; the filtering
# itself is delegated to analysis_util.
df = analysis_util.filter_implicit_forks(df)
remaining_chunks = len(df)
print(f'New total number of chunks: {remaining_chunks}')

Total projects:  1082
Filtered 6 of 1082 projects for being implicit forks: {'IceColdSandwich/android_frameworks_base', 'AOKP/frameworks_base_disabled', 'adetaylor/android-frameworks-base-with-remote-control-service', 'RealVNC/android-frameworks-base-with-screenshot-tweaks', 'allwinner-ics/platform_frameworks_base', 'cgjones/android-frameworks-base'}
Total valid projects:  1076
New total number of chunks: 10177


In [161]:
# Descriptive statistics of the filtered dataset. Their 'mean' and 'std'
# rows are the reference values the sample is compared against below.
df_descriptive = df.describe()
df_descriptive

Unnamed: 0,chunk_id,v1_percentage,v2_percentage,normalized_v1_percentage,normalized_v2_percentage,intersection_percentage,missing_v1_lines,missing_v2_lines,missing_v1_lines_perc,missing_v2_lines_perc,v1_size,v2_size,chunk_size_delta,chunk_size,resolution_size
count,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0,10177.0
mean,1097355.0,0.55345,0.533306,0.510045,0.48993,0.086609,1.823622,2.175789,32.502489,36.404496,6.79493,6.965216,0.170286,15.298418,8.516164
std,211934.3,0.31901,0.320762,0.304105,0.304072,0.198059,6.835704,10.463504,29.994432,30.863874,30.5106,33.251309,20.674248,68.754593,36.407157
min,776662.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-388.0,2.0,1.0
25%,939823.0,0.33,0.33,0.33,0.3,0.0,0.0,0.0,0.0,0.0,2.0,2.0,-2.0,4.0,2.0
50%,1064856.0,0.5,0.5,0.5,0.5,0.0,1.0,1.0,33.33,40.0,2.0,3.0,0.0,6.0,3.0
75%,1258354.0,0.81,0.8,0.7,0.67,0.0,1.0,2.0,50.0,50.0,5.0,5.0,2.0,12.0,7.0
max,1658666.0,1.0,1.0,1.0,1.0,1.0,353.0,689.0,100.0,100.0,1352.0,1402.0,1401.0,2922.0,1557.0


In [162]:
# Draw SAMPLE_SIZE chunks at random. The fixed random_state makes the draw
# repeatable, so the manually analyzed sample can be regenerated.
sample = df.sample(n=SAMPLE_SIZE, random_state=33)
# Same descriptive statistics as for the full dataset, for comparison.
sample_descriptive = sample.describe()
sample_descriptive

Unnamed: 0,chunk_id,v1_percentage,v2_percentage,normalized_v1_percentage,normalized_v2_percentage,intersection_percentage,missing_v1_lines,missing_v2_lines,missing_v1_lines_perc,missing_v2_lines_perc,v1_size,v2_size,chunk_size_delta,chunk_size,resolution_size
count,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0
mean,1115454.0,0.567952,0.535723,0.515843,0.484157,0.103373,1.433735,1.76506,32.150663,37.517108,4.933735,5.506024,0.572289,11.620482,7.096386
std,211940.2,0.302075,0.315036,0.290969,0.290969,0.207153,2.107377,2.607492,29.18113,30.395041,6.041408,8.513623,9.131574,13.336005,9.716109
min,776948.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-38.0,2.0,1.0
25%,961459.2,0.5,0.33,0.33,0.33,0.0,0.0,1.0,0.0,3.0125,2.0,2.0,-1.0,4.0,2.0
50%,1079062.0,0.5,0.5,0.5,0.5,0.0,1.0,1.0,33.33,50.0,3.0,3.0,0.0,7.0,3.0
75%,1299988.0,0.8,0.83,0.67,0.67,0.08,1.0,2.0,50.0,50.0,6.0,6.0,2.0,13.75,8.0
max,1610560.0,1.0,1.0,1.0,1.0,1.0,15.0,20.0,100.0,100.0,38.0,76.0,76.0,90.0,70.0


In [163]:
def compare_sample_to_dataset(dataset_stats, sample_stats, sample_size):
    """Check whether each sample mean lies within one standard error of the dataset mean.

    The standard error of the sample mean is the standard deviation of the
    dataset divided by the square root of the sample size.

    Args:
        dataset_stats: ``describe()`` table of the full dataset; its 'mean'
            and 'std' rows are read.
        sample_stats: ``describe()`` table of the sample; its 'mean' row is
            read. It must contain the same metric columns as dataset_stats.
        sample_size: number of rows in the sample.

    Returns:
        A DataFrame with one row per metric and the columns 'metric',
        'dataset_mean', 'dataset_std', 'standard error',
        'sample lower range', 'sample upper range', 'sample_mean' and
        'is within?'. The last column is True when the sample mean falls
        inside [dataset_mean - SE, dataset_mean + SE], bounds included.
    """
    # chunk_id is an identifier, not a metric, so its statistics are skipped.
    metrics = [column for column in dataset_stats.columns if 'chunk_id' not in column]

    dataset_mean = dataset_stats.loc['mean', metrics]
    dataset_std = dataset_stats.loc['std', metrics]
    sample_mean = sample_stats.loc['mean', metrics]

    standard_error = dataset_std / sqrt(sample_size)
    lower = dataset_mean - standard_error
    upper = dataset_mean + standard_error
    is_within = (sample_mean >= lower) & (sample_mean <= upper)

    return pd.DataFrame({
        'metric': metrics,
        'dataset_mean': dataset_mean.to_numpy(),
        'dataset_std': dataset_std.to_numpy(),
        'standard error': standard_error.to_numpy(),
        'sample lower range': lower.to_numpy(),
        'sample upper range': upper.to_numpy(),
        'sample_mean': sample_mean.to_numpy(),
        'is within?': is_within.to_numpy(),
    })


compare_sample_to_dataset(df_descriptive, sample_descriptive, SAMPLE_SIZE)

Unnamed: 0,metric,dataset_mean,dataset_std,standard error,sample lower range,sample upper range,sample_mean,is within?
0,v1_percentage,0.55345,0.31901,0.02476,0.52869,0.57821,0.567952,True
1,v2_percentage,0.533306,0.320762,0.024896,0.508411,0.558202,0.535723,True
2,normalized_v1_percentage,0.510045,0.304105,0.023603,0.486442,0.533648,0.515843,True
3,normalized_v2_percentage,0.48993,0.304072,0.023601,0.46633,0.513531,0.484157,True
4,intersection_percentage,0.086609,0.198059,0.015372,0.071237,0.101981,0.103373,False
5,missing_v1_lines,1.823622,6.835704,0.530554,1.293068,2.354175,1.433735,True
6,missing_v2_lines,2.175789,10.463504,0.812125,1.363663,2.987914,1.76506,True
7,missing_v1_lines_perc,32.502489,29.994432,2.328019,30.174469,34.830508,32.150663,True
8,missing_v2_lines_perc,36.404496,30.863874,2.395501,34.008995,38.799998,37.517108,True
9,v1_size,6.79493,30.5106,2.368082,4.426848,9.163012,4.933735,True


In [164]:
# Preview the first sampled chunks (the row labels are positions in df).
sample.head()

Unnamed: 0,chunk_id,v1_percentage,v2_percentage,normalized_v1_percentage,normalized_v2_percentage,intersection_percentage,chunk_composition,missing_v1_lines,missing_v2_lines,missing_v1_lines_perc,missing_v2_lines_perc,v1_size,v2_size,chunk_size_delta,partial_order,chunk_size,resolution_size
10515,1310670,0.5,0.5,0.5,0.5,0.0,v2 v1,0,1,0.0,50.0,1,2,1.0,True,3,2
11270,1374858,0.5,0.83,0.33,0.67,0.33,(v1_2) v1 v2,3,0,50.0,0.0,6,5,-1.0,True,13,7
8957,1159289,0.0,1.0,0.0,1.0,0.0,v2,1,2,100.0,66.67,1,3,2.0,True,4,1
7636,1111021,0.5,0.5,0.5,0.5,0.0,v1 v2,0,1,0.0,50.0,1,2,1.0,True,3,2
7404,1106228,0.5,0.5,0.5,0.5,0.0,v1 v2,1,2,50.0,66.67,2,3,1.0,True,7,2


In [165]:
# Load the complete dataset (one record per chunk, including the chunk's
# v1/v2/base/solution text) so the sampled chunks can be inspected.
file = f'{data_folder}/dataset.json'
# The encoding is set explicitly because JSON is UTF-8 by specification and
# the platform default may differ, which would corrupt or reject non-ASCII
# characters in the stored source code.
with open(file, encoding='utf-8') as f:
    data_listofdict = json.load(f)
dataset_complete = pd.DataFrame.from_dict(data_listofdict)
dataset_complete.head()

Unnamed: 0,chunk_id,v1,v2,base,solution,before_context,after_context
0,776662,if (response.getResponseCode() == 200 ...,if (response.getResponseCode() == ...,package net.threescale.api.v2;\n\nimport net.t...,"\n log.info(""response code was: "" +...","\n log.info(""response code was: "" +...",}\n }\n\n\n
1,776782,this.outgoingOrdinals = createOrdinals( so...,this.sourceProcessID = sourceProcess.getID...,"/*\n * Copyright (c) 2007-2015 Concurrent, Inc...",public ProcessEdge( ElementGraph sourceEleme...,\n public ProcessEdge( ElementGraph sourceEle...,}\n\n private Set<Integer> createOrdinals...
2,776793,this.ordinals = processEdge.getIncomingO...,this.ordinals = processEdge.getSourcePro...,"/*\n * Copyright (c) 2007-2015 Concurrent, Inc...",{\n this.config = config;\n th...,{\n this.config = config;\n th...,}\n\n public FlowElement getFlowEleme...
3,776828,flowStepStats.markFailed( throwable ...,flowStepStats.markFailed( this.throw...,"/*\n * Copyright (c) 2007-2015 Concurrent, Inc...",\n if( this.throwable != null )\n ...,\n if( this.throwable != null )\n ...,}\n else\n {\n\n
4,776863,"public Map<Object, Object> getDefaultPropert...",{\n Update.registerPlanner( getClass() );\n...,"/*\n * Copyright (c) 2007-2014 Concurrent, Inc...",return DebugLevel.valueOf( debugLevel );\n...,return DebugLevel.valueOf( debugLevel );\n...,{\n return defaultProperties;\n }\n\n


In [166]:
# Keep only the dataset records whose chunk_id was drawn into the sample.
sampled_ids = sample['chunk_id']
in_sample = dataset_complete['chunk_id'].isin(sampled_ids)
sample_data = dataset_complete.loc[in_sample]
sample_data.head()

Unnamed: 0,chunk_id,v1,v2,base,solution,before_context,after_context
10,776948,public static <R> R returnInstanceFieldIfExi...,public static Object invokeConstructor( Stri...,"/*\n * Copyright (c) 2007-2014 Concurrent, Inc...",}\n }\n\n public static <R> R return...,}\n }\n\n,{\n try\n {\n\n
81,777948,"assert(b2.toFEN().equals(""k7/8/P7/8/8/...","assert(b2.toFEN().equals(""k7/8/P7/8/8/...","// Oliver Kullmann, 6.12.2010 (Swansea)\n\ncla...","b2.do_normal_white_move('h','2','h','4...","b2.do_normal_white_move('h','2','h','4...",}\n // testing setting and resettin...
116,778595,mDeck = deck;\n sSeriesList = g...,sDeck = deck;\n if (type == TYP...,/*********************************************...,\n public static boolean refreshDeckStatist...,\n public static boolean refreshDeckStatist...,}\n }\n\n\n
187,779647,"mSpaceUntil.substring(0, mSpaceUnt...","spaceUntil.substring(0, spaceUntil...",/*********************************************...,for (Field f : getFields()) {\n ...,for (Field f : getFields()) {\n ...,for (Card card : getUpdatedRelated...
188,779688,"cursor = mDatabase.rawQuery(query,...","cursor = database.rawQuery(query, ...",/*********************************************...,Cursor cursor = null;\n\n try {...,Cursor cursor = null;\n\n try {\n,String methodName = getCursorMetho...


In [167]:
# Disabled: optional export of the sampled chunk ids and their solutions to
# test.csv in the current working directory. Uncomment to write the file.
#sample_data[['chunk_id', 'solution']].to_csv('test.csv')

In [177]:
# Show 16 of the sampled chunks. The draw is seeded (same seed as the main
# sample) so that re-running the notebook displays the same 16 chunks.
sample_data.sample(n=16, random_state=33)

Unnamed: 0,chunk_id,v1,v2,base,solution,before_context,after_context
7733,1113875,"\t\tv = T.setNodeV(new BSTNode(T, K = x));\n\t...","\t\tv = T.v = new BSTNode(T, K = x);\n\t\tv.bg...",package algvis.bst;\n\nimport algvis.core.Algo...,"\tpublic BSTInsert(BST T, int x) {\n\t\tsuper(...","\tpublic BSTInsert(BST T, int x) {\n\t\tsuper(...","\t\tsetHeader(""insertion"");\n\t}\n\n\n"
11507,1393203,public abstract NavigationStore getNavigat...,public abstract SiteService getSiteService...,/*\n * Copyright (C) 2012 eXo Platform SAS.\n ...,\n public abstract LayoutService getLayoutS...,\n public abstract LayoutService getLayoutS...,\n public abstract DescriptionStore getDesc...
12207,1418870,import io.realm.entities.Dog;\nimport io.realm...,import io.realm.entities.AnnotationTypes;\n,package io.realm;\n\nimport android.test.Andro...,import java.io.IOException;\n\nimport io.realm...,import java.io.IOException;\n\nimport io.realm...,import io.realm.exceptions.RealmMigrationNeede...
7742,1113895,"\t\tv = T.setNodeV(new AANode(T, K = x));\n\t\...","\t\tT.v = v = new AANode(T, K = x);\n\t\tv.get...",package algvis.aatree;\n\nimport algvis.bst.BS...,"\tpublic AAInsert(AA T, int x) {\n\t\tsuper(T)...","\tpublic AAInsert(AA T, int x) {\n\t\tsuper(T)...","\t\tsetHeader(""insertion"");\n\t}\n\n\n"
10515,1310670,public class PrimitiveType extends Annotatable...,"@SuppressWarnings({""rawtypes"", ""unchecked""})\n...",/*********************************************...,* @since 2.0\n * @noinstantiate This class is...,* @since 2.0\n * @noinstantiate This class is...,\n\t/**\n \t * Primitive type codes (typesafe ...
5783,1039380,final String epoch = sourcePro...,final LCMetadata lcMetadata = ...,package org.esa.cci.lc.conversion;\n\nimport o...,@Override\n public Obje...,@Override\n public Obje...,s...
11219,1371887,\tpublic static final Set<String> identifiers ...,\tpublic static final Set<String> identifiers ...,package de.uni_leipzig.simba.saim.core.metric;...,public class Measure extends Node\n{\n\tpublic...,public class Measure extends Node\n{\n\tpublic...,\n\t@Override public Set<String> identifiers()...
2867,930236,"public List<String> lrange(String key, int...",public List<String> lrange(final String ke...,package redis.clients.jedis;\n\nimport java.io...,"* @return Multi bulk reply, specifically ...","* @return Multi bulk reply, specifically ...","client.lrange(key, start, end);\n ..."
9009,1159572,import com.google.common.base.Predicate;\nimpo...,import java.lang.Thread.UncaughtExceptionHandl...,"/*\n * Copyright 2010-2011 Ning, Inc.\n *\n * ...",\npackage com.ning.billing.entitlement.engine....,\npackage com.ning.billing.entitlement.engine....,\npublic abstract class ApiEventProcessorBase ...
10967,1361290,.machine(machine.get())\n,.machine((SshMachineLocati...,package brooklyn.entity.database.postgresql;\n...,\n feed = SshFeed.builder()\n ...,\n feed = SshFeed.builder()\n ...,.poll(new SshPollConfig<Bo...


In [181]:
# Inspect a single sampled chunk by its id.
sample_data.loc[sample_data['chunk_id'].eq(782705)]

Unnamed: 0,chunk_id,v1,v2,base,solution,before_context,after_context
330,782705,,ModuleVersionArtifactMetaData artifact(Str...,/*\n * Copyright 2012 the original author or a...,\n ModuleVersionArtifactMetaData artifact(A...,\n ModuleVersionArtifactMetaData artifact(A...,boolean isMetaDataOnly();\n}\n\n
