# Sample analyses

This notebook extracts variables from the observations we put in the samples spreadsheet.

In [1]:
from collections import Counter
import pandas as pd

import analysis_helpers, importlib
importlib.reload(analysis_helpers)

from analysis_helpers import var, relative_var

def splitcolumn(x):
    """Split a comma-separated cell into a set of whitespace-stripped labels.

    Duplicate labels collapse into one entry, and empty pieces (e.g. from a
    trailing comma) are kept as the empty string.
    """
    return {piece.strip() for piece in x.split(",")}

In [2]:
# Load the samples spreadsheet; the cells below all read from `df`.
samples_path = "samples.xls"
df = pd.read_excel(samples_path)

## Check code

In [3]:
# Tally every label of the "code" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["code"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'python': 66, 'no': 2, 'bash': 5, 'html': 1, 'sql': 2})

In [4]:
# Print one "a9_code_<label>" variable per label, most frequent first,
# with its count passed to relative_var against the number of rows in df.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_")
    variable = "a9_code_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_code_python 66 (95.65%)
a9_code_bash 5 (7.25%)
a9_code_no 2 (2.90%)
a9_code_sql 2 (2.90%)
a9_code_html 1 (1.45%)


## Check type

In [5]:
# Tally every label of the "type" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["type"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'practice': 9,
         'course': 31,
         'analysis': 10,
         'academic': 10,
         'tool': 5,
         'book': 3,
         'presentation': 1})

In [6]:
# Print one "a9_type_<label>" variable per label, most frequent first,
# with its count passed to relative_var against the number of rows in df.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_")
    variable = "a9_type_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_type_course 31 (44.93%)
a9_type_analysis 10 (14.49%)
a9_type_academic 10 (14.49%)
a9_type_practice 9 (13.04%)
a9_type_tool 5 (7.25%)
a9_type_book 3 (4.35%)
a9_type_presentation 1 (1.45%)


## Check area

In [7]:
# Tally every label of the "mainarea" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["mainarea"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'Data mining': 6,
         'Math': 3,
         'Data cleaning': 5,
         'CV': 2,
         'Problem definition': 2,
         'Data exploration': 28,
         'Algorithms': 3,
         'ML': 10,
         'Programming': 4,
         'Databases': 3,
         'Game': 1,
         'Computer Graphics': 1,
         'Physics': 1})

In [8]:
# Print one "a9_area_<label>" variable per label, most frequent first.
# Labels are lower-cased and spaces become underscores to form the name.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_")
    variable = "a9_area_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_area_data_exploration 28 (40.58%)
a9_area_ml 10 (14.49%)
a9_area_data_mining 6 (8.70%)
a9_area_data_cleaning 5 (7.25%)
a9_area_programming 4 (5.80%)
a9_area_math 3 (4.35%)
a9_area_algorithms 3 (4.35%)
a9_area_databases 3 (4.35%)
a9_area_cv 2 (2.90%)
a9_area_problem_definition 2 (2.90%)
a9_area_game 1 (1.45%)
a9_area_computer_graphics 1 (1.45%)
a9_area_physics 1 (1.45%)


## Check Markdown

In [9]:
# Tally every label of the "markdown" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["markdown"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'no': 22,
         'code': 38,
         'tasks': 16,
         'title': 36,
         'problem': 29,
         'conclusion': 13})

In [10]:
# Print one "a9_markdown_<label>" variable per label, most frequent first,
# with its count passed to relative_var against the number of rows in df.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_")
    variable = "a9_markdown_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_markdown_code 38 (55.07%)
a9_markdown_title 36 (52.17%)
a9_markdown_problem 29 (42.03%)
a9_markdown_no 22 (31.88%)
a9_markdown_tasks 16 (23.19%)
a9_markdown_conclusion 13 (18.84%)


In [11]:
# Keep only the rows whose "markdown" labels do not include "no".
lacks_no_label = df["markdown"].apply(splitcolumn).apply(
    lambda labels: "no" not in labels
)
with_markdown = df[lacks_no_label]

In [12]:
# Size of the with_markdown subset relative to all rows in df.
variable = "a9_markdown_yes"
markdown_count, sample_count = len(with_markdown), len(df)
print(variable, relative_var(variable, markdown_count, sample_count))

a9_markdown_yes 47 (68.12%)


In [13]:
# Tally the "language" labels, restricted to the with_markdown subset.
counter = Counter(
    label
    for labels in with_markdown["language"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'English': 43,
         'Chinese': 1,
         'Japanese': 1,
         'Spanish': 1,
         'Portuguese': 1})

In [14]:
# Add up the tallies of every language label other than "English" and
# relate that sum to the size of the with_markdown subset.
variable = "a9_markdown_non_english"
non_english_total = sum(
    count for language, count in counter.items() if language != "English"
)
print(variable, relative_var(variable, non_english_total, len(with_markdown)))

a9_markdown_non_english 4 (8.51%)


## Check title

In [15]:
# Tally every label of the "title" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["title"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'meaningful': 58,
         'numbered': 20,
         'space': 12,
         'character': 1,
         'meaningless': 11,
         'copy': 2})

In [16]:
# A title is treated as invalid when it carries at least one of these labels.
invalid_labels = {"space", "character", "copy"}
has_invalid_label = df["title"].apply(splitcolumn).apply(
    lambda labels: not labels.isdisjoint(invalid_labels)
)
invalid_title = df[has_invalid_label]
print("Notebooks with invalid title: ", relative_var("a9_invalid_title", len(invalid_title), len(df)))

Notebooks with invalid title:  15 (21.74%)


In [17]:
# Print one "a9_title_<label>" variable per label, most frequent first,
# with its count passed to relative_var against the number of rows in df.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_")
    variable = "a9_title_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_title_meaningful 58 (84.06%)
a9_title_numbered 20 (28.99%)
a9_title_space 12 (17.39%)
a9_title_meaningless 11 (15.94%)
a9_title_copy 2 (2.90%)
a9_title_character 1 (1.45%)


## Check imports

In [18]:
# Tally every label of the "imports" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["imports"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'both': 29,
         'first': 26,
         'second': 7,
         'no': 4,
         'middle': 5,
         'third': 1})

In [19]:
# Print one "a9_imports_<label>" variable per label, most frequent first,
# with its count passed to relative_var against the number of rows in df.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_")
    variable = "a9_imports_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_imports_both 29 (42.03%)
a9_imports_first 26 (37.68%)
a9_imports_second 7 (10.14%)
a9_imports_middle 5 (7.25%)
a9_imports_no 4 (5.80%)
a9_imports_third 1 (1.45%)


In [20]:
# Count the rows tagged with at least one of 'first', 'second' or 'third'.
# The 'both' tag is not part of this set; a9_imports_beginning_any adds it.
variable = "a9_imports_beginning_exclusive"
beginning_labels = {'first', 'second', 'third'}
tagged_beginning = df["imports"].apply(splitcolumn).apply(
    lambda labels: not beginning_labels.isdisjoint(labels)
)
print(variable, relative_var(variable, len(df[tagged_beginning]), len(df)))

a9_imports_beginning_exclusive 31 (44.93%)


In [21]:
# Count the rows tagged with at least one of 'first', 'second', 'third'
# or 'both'.
variable = "a9_imports_beginning_any"
beginning_or_both_labels = {'first', 'second', 'third', 'both'}
tagged_beginning_or_both = df["imports"].apply(splitcolumn).apply(
    lambda labels: not beginning_or_both_labels.isdisjoint(labels)
)
print(variable, relative_var(variable, len(df[tagged_beginning_or_both]), len(df)))

a9_imports_beginning_any 60 (86.96%)


## Check organization

In [22]:
# Tally every label of the "organization" column; splitcolumn yields a set,
# so a label is counted at most once per row.
counter = Counter(
    label
    for labels in df["organization"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'orchestration': 55, 'heavy': 12, 'no': 2})

In [23]:
# Print one "a9_organization_<label>" variable per label, most frequent
# first, with its count passed to relative_var against the rows in df.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_")
    variable = "a9_organization_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_organization_orchestration 55 (79.71%)
a9_organization_heavy 12 (17.39%)
a9_organization_no 2 (2.90%)


In [24]:
# Tally every label of the "functions" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["functions"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'spread': 32, 'complex': 21, 'no': 34, 'simple': 14, 'beginning': 3})

In [25]:
# Count the rows whose "functions" labels do not include "no".
variable = "a9_organization_functions"
lacks_no_label = df["functions"].apply(splitcolumn).apply(
    lambda labels: "no" not in labels
)
print(variable, relative_var(variable, len(df[lacks_no_label]), len(df)))

a9_organization_functions 35 (50.72%)


## Check tests

In [26]:
# Tally every label of the "tests" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["tests"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'no': 61,
         'unrelated': 3,
         'tool': 2,
         'other': 1,
         'related': 1,
         'execution': 1})

In [27]:
# Select the rows whose "tests" labels do not include "no" and report how
# many there are relative to all rows in df.
lacks_no_label = df["tests"].apply(splitcolumn).apply(
    lambda labels: "no" not in labels
)
total_tests = df[lacks_no_label]
print("Repositories with tests:", relative_var("a9_tests", len(total_tests), len(df)))

Repositories with tests: 8 (11.59%)


In [28]:
# Print one "a9_tests_<label>" variable per label, most frequent first,
# with its count passed to relative_var against the number of rows in df.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_")
    variable = "a9_tests_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_tests_no 61 (88.41%)
a9_tests_unrelated 3 (4.35%)
a9_tests_tool 2 (2.90%)
a9_tests_other 1 (1.45%)
a9_tests_related 1 (1.45%)
a9_tests_execution 1 (1.45%)


## Check outputs (retrospective)

In [29]:
# Tally every label of the "outputs" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["outputs"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'print': 50,
         'exception': 10,
         'out': 37,
         'table': 30,
         'file': 19,
         'image': 32,
         'no': 5,
         'out*': 4,
         'widget': 5})

In [30]:
# Print one "a9_outputs_<label>" variable per label, most frequent first.
# "*" is spelled out as "_star" so that "out*" gets its own variable name
# ("a9_outputs_out_star") instead of an asterisk in the name.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_").replace("*", "_star")
    variable = "a9_outputs_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_outputs_print 50 (72.46%)
a9_outputs_out 37 (53.62%)
a9_outputs_image 32 (46.38%)
a9_outputs_table 30 (43.48%)
a9_outputs_file 19 (27.54%)
a9_outputs_exception 10 (14.49%)
a9_outputs_no 5 (7.25%)
a9_outputs_widget 5 (7.25%)
a9_outputs_out_star 4 (5.80%)


## Check unordered

In [31]:
# Tally every label of the "unordered" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["unordered"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'ambiguous': 11,
         'no': 38,
         'variable': 19,
         'exploratory': 28,
         'posterior': 6,
         'import': 9})

In [32]:
# Print one "a9_unordered_<label>" variable per label, most frequent first,
# with its count passed to relative_var against the number of rows in df.
sample_count = len(df)
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_").replace("*", "_star")
    variable = "a9_unordered_" + slug
    print(variable, relative_var(variable, occurrences, sample_count))


a9_unordered_no 38 (55.07%)
a9_unordered_exploratory 28 (40.58%)
a9_unordered_variable 19 (27.54%)
a9_unordered_ambiguous 11 (15.94%)
a9_unordered_import 9 (13.04%)
a9_unordered_posterior 6 (8.70%)


In [33]:
# Display the rows whose "unordered" labels include neither "no" nor
# "exploratory".
neither_label = df["unordered"].apply(splitcolumn).apply(
    lambda labels: not ("no" in labels or "exploratory" in labels)
)
df[neither_label]

Unnamed: 0.1,Unnamed: 0,Sample ID,repository_id,notebook_id,repository,commit,notebooks,notebooks_count,name,language,...,imports,organization,loop,functions,outputs,unordered,skips,empty,non-executed,tests
0,143044,1,241773,1356065,r3dmaohong/pyspark-practice,e64acc0dc508286770ad383f6adec75935447bde,python_spark_practice_10_sql_dataframe.ipynb;p...,5,python_spark_practice_9_disisiontree.ipynb,English,...,both,orchestration,simple,"complex, spread",print,ambiguous,middle,end,no,no
25,371404,26,175907,1023393,AdrianHsu/charades-parser,1d75b1a66c0f5fbb67e8c3824c02fafdb5f3397d,plot.ipynb;train_actions_csv.ipynb;test_action...,5,train_actions_csv.ipynb,Chinese,...,both,orchestration,simple,"complex, spread","out, table, image, file","ambiguous, posterior","beginning, middle",end,no,no
59,844009,60,97148,628171,Yogessvaren/Python_Stuff,196022de7087d0c6be00cc8aaaccbfd66bad63df,IPython/Learning IPython for Interactive Compu...,19,Python Language/Learn Python 3 The Hard Way/Ex...,English,...,first,orchestration,no,no,print,import,no,end,no,no


## Check empty

In [34]:
# Tally every label of the "empty" column; splitcolumn yields a set, so a
# label is counted at most once per row.
counter = Counter(
    label
    for labels in df["empty"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'end': 40, 'no': 28, 'middle': 5, 'markdown': 3, 'beginning': 1})

## Check non executed

In [35]:
# Tally every label of the "non-executed" column; splitcolumn yields a set,
# so a label is counted at most once per row.
counter = Counter(
    label
    for labels in df["non-executed"].apply(splitcolumn)
    for label in labels
)
counter

Counter({'no': 44,
         'exception': 1,
         'all': 5,
         'end': 8,
         'middle': 10,
         'incomplete': 1,
         'commented': 4,
         'beginning': 1})

In [36]:
# Number of rows whose "non-executed" labels do not include "no", reported
# relative to all rows in df.
lacks_no_label = df["non-executed"].apply(splitcolumn).apply(
    lambda labels: "no" not in labels
)
total_nonexecuted = len(df[lacks_no_label])
variable = "a9_nonexecuted_any"
print(variable, relative_var(variable, total_nonexecuted, len(df)))

a9_nonexecuted_any 25 (36.23%)


In [37]:
# Print one "a9_nonexecuted_<label>" variable per label, most frequent first.
# The "no" label is related to every row in df; every other label is related
# to total_nonexecuted instead.
for label, occurrences in counter.most_common():
    slug = label.lower().replace(" ", "_").replace("*", "_star")
    variable = "a9_nonexecuted_" + slug
    if label == "no":
        total = len(df)
    else:
        total = total_nonexecuted
    print(variable, relative_var(variable, occurrences, total))


a9_nonexecuted_no 44 (63.77%)
a9_nonexecuted_middle 10 (40.00%)
a9_nonexecuted_end 8 (32.00%)
a9_nonexecuted_all 5 (20.00%)
a9_nonexecuted_commented 4 (16.00%)
a9_nonexecuted_exception 1 (4.00%)
a9_nonexecuted_incomplete 1 (4.00%)
a9_nonexecuted_beginning 1 (4.00%)


All variables have been created for the paper