In [1]:
from scipy import stats
from dataset_loader import load_dataset
import numpy
import pingouin

def run_mw_test(service_time, non_service_time, attribute, alpha=0.05):
    """Compare one attribute between two groups with the Mann-Whitney U test.

    A two-sided test is run first. When it rejects H0 at ``alpha``, the
    one-sided alternatives are tried in order ('greater', then 'less') and the
    first significant one is reported as the accepted hypothesis.

    Args:
        service_time: Object indexable by ``attribute`` (e.g. a DataFrame);
            ``service_time[attribute]`` is the first sample.
        non_service_time: Object indexable by ``attribute``;
            ``non_service_time[attribute]`` is the second sample.
        attribute: Key/column holding the values to compare.
        alpha: Significance threshold applied to every p-value.

    Returns:
        dict with the keys:
            "test"       -- the test name ("Mann-Whitney").
            "h0", "h1", "h2" -- text of the equality / greater / less hypotheses.
            "accepted"   -- "h0", "h1" or "h2".
            "h0stats", "h0p-value" -- statistic and p-value of the two-sided test.
            "stats", "p-value" -- statistic and p-value of the test that
                supports the accepted hypothesis.
            "effec-size" -- absolute value of pingouin's 'cohen' effect size.
                (The key spelling is kept because print_test_result reads it.)
    """
    first_sample = service_time[attribute]
    second_sample = non_service_time[attribute]

    result = {}
    result["test"] = "Mann-Whitney"
    # Bracket the attribute so the label reads "service_time[attr]" rather than
    # the two words being fused together ("service_timeattr").
    result["h0"] = "service_time[{0}] == non_service_time[{0}]".format(attribute)
    result["h1"] = "service_time[{0}] > non_service_time[{0}]".format(attribute)
    result["h2"] = "service_time[{0}] < non_service_time[{0}]".format(attribute)

    h0_stat, h0_p_value = stats.mannwhitneyu(first_sample, second_sample, alternative='two-sided')
    result["accepted"] = "h0"
    result["h0stats"] = h0_stat
    result["h0p-value"] = h0_p_value
    # Default to the two-sided numbers; they are only replaced when a
    # directional alternative is actually accepted, so "stats"/"p-value"
    # always belong to the hypothesis named in "accepted".
    result["stats"] = h0_stat
    result["p-value"] = h0_p_value

    if h0_p_value <= alpha:
        for hypothesis, alternative in (("h1", 'greater'), ("h2", 'less')):
            statistic, p_value = stats.mannwhitneyu(first_sample, second_sample, alternative=alternative)
            if p_value <= alpha:
                result["accepted"] = hypothesis
                result["stats"] = statistic
                result["p-value"] = p_value
                break

    result["effec-size"] = numpy.abs(pingouin.compute_effsize(first_sample, second_sample, eftype='cohen'))

    return result

    
def print_test_result(result):
    """Print a one-sentence, human-readable summary of a test result dict.

    Args:
        result: Mapping with the keys "test", "accepted", "stats", "p-value"
            and "effec-size", plus an entry under the key named by
            ``result["accepted"]`` holding the hypothesis text. When the
            accepted hypothesis is not "h0", "h0stats" and "h0p-value" are
            read as well.

    Returns:
        None. The sentence is written to standard output.
    """
    verdict = result["accepted"]

    if verdict == "h0":
        # H0 kept: only the single (two-sided) outcome is reported.
        template = "{} test result: the hypothesis {} is acepted with statistic {} and p-value {}, with Cohen’s d effect size = {} [0.2 <= d (small), 0.5 <= d (medium), 0.8 <= d (large)]."
        fields = (
            result["test"],
            result[verdict],
            result["stats"],
            result["p-value"],
            result["effec-size"],
        )
    else:
        # H0 rejected: report the rejection first, then the accepted alternative.
        template = "{} test result: the H0 was rejected with statistic {} and p-value {}. Then the alternative hypothesis {} was acepted with statistic {} and p-value {}, with Cohen’s d effect size = {} [0.2 <= d (small), 0.5 <= d (medium), 0.8 <= d (large)]."
        fields = (
            result["test"],
            result["h0stats"],
            result["h0p-value"],
            result[verdict],
            result["stats"],
            result["p-value"],
            result["effec-size"],
        )

    print(template.format(*fields))

## Comparando Serviços

In [7]:
def fetch_comparison_data(dataset, serviceName, exclude):
    """Select the rows whose labels do, or do not, mention a service.

    Args:
        dataset: DataFrame with a "labels" column of strings.
        serviceName: Text looked for inside each row's "labels" value. It is
            matched literally, not as a regular expression.
        exclude: When truthy, return the rows whose labels do NOT contain
            ``serviceName``; otherwise return the rows whose labels contain it.

    Returns:
        The selected rows of ``dataset``.
    """
    # regex=False: names such as 'media.ccc.de' contain regex metacharacters
    # and must be compared as plain text.
    # na=False: a missing label counts as "does not mention the service" and
    # keeps the mask strictly boolean so it can be negated and used in .loc.
    mentions_service = dataset["labels"].str.contains(serviceName, regex=False, na=False)
    return dataset.loc[~mentions_service if exclude else mentions_service]

def run_difference_between_issue_resolution(service_tags=None):
    """Run and print the resolution-time comparison once per service.

    Args:
        service_tags: Optional iterable of service names to compare. When
            omitted, the five services analysed by this notebook are used.
    """
    if service_tags is None:
        service_tags = ['bandcamp', 'youtube', 'media.ccc.de', 'soundcloud', 'peertube']

    # Load once and share the frame, instead of reloading it for every service.
    dataset = load_dataset()
    for service in service_tags:
        compare_service(service, dataset)


def compare_service(service, dataset=None):
    """Test 'time_to_complete' between the two groups defined by one service.

    The dataset is split with fetch_comparison_data: the group requested with
    ``exclude=False`` is passed to run_mw_test as ``service_time`` and the group
    requested with ``exclude=True`` as ``non_service_time``. The outcome is
    printed with print_test_result.

    Args:
        service: Service name used to split the dataset.
        dataset: Optional pre-loaded dataset. When omitted, load_dataset() is
            called, so existing single-argument calls keep working.
    """
    if dataset is None:
        dataset = load_dataset()

    yes_dataset = fetch_comparison_data(dataset, service, False)
    no_dataset = fetch_comparison_data(dataset, service, True)

    print_test_result(run_mw_test(yes_dataset, no_dataset, 'time_to_complete'))

In [8]:
# Run the comparison for every service in the list; one result line is printed per service.
run_difference_between_issue_resolution()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["label_crunched"] = df_issues_no_outliers["labels"].apply(convert_label_to_number)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["closed_at"] = df_issues["closed_at"].apply(datetime.timestamp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_o

Mann-Whitney test result: the hypothesis service_timetime_to_complete == NON_service_timetime_to_complete is acepted with statistic 2748.0 and p-value 0.2298316464762843, with Cohen’s d effect size = 0.2416596096784851 [0.2 <= d (small), 0.5 <= d (medium), 0.8 <= d (large)].


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["label_crunched"] = df_issues_no_outliers["labels"].apply(convert_label_to_number)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["closed_at"] = df_issues["closed_at"].apply(datetime.timestamp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_o

Mann-Whitney test result: the H0 was rejected with statistic 287846.0 and p-value 1.8410604117042094e-16. Then the alternative hypothesis service_timetime_to_complete < NON_service_timetime_to_complete was acepted with statistic 287846.0 and p-value 9.205302058521047e-17, with Cohen’s d effect size = 0.10260614740525463 [0.2 <= d (small), 0.5 <= d (medium), 0.8 <= d (large)].


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["label_crunched"] = df_issues_no_outliers["labels"].apply(convert_label_to_number)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["closed_at"] = df_issues["closed_at"].apply(datetime.timestamp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_o

Mann-Whitney test result: the H0 was rejected with statistic 11449.0 and p-value 0.02179061101940417. Then the alternative hypothesis service_timetime_to_complete < NON_service_timetime_to_complete was acepted with statistic 11449.0 and p-value 0.010895305509702085, with Cohen’s d effect size = 0.023634869396139502 [0.2 <= d (small), 0.5 <= d (medium), 0.8 <= d (large)].


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["label_crunched"] = df_issues_no_outliers["labels"].apply(convert_label_to_number)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["closed_at"] = df_issues["closed_at"].apply(datetime.timestamp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_o

Mann-Whitney test result: the H0 was rejected with statistic 42431.0 and p-value 0.0014819647410689043. Then the alternative hypothesis service_timetime_to_complete < NON_service_timetime_to_complete was acepted with statistic 42431.0 and p-value 0.0007409823705344521, with Cohen’s d effect size = 0.5855288012477932 [0.2 <= d (small), 0.5 <= d (medium), 0.8 <= d (large)].
Mann-Whitney test result: the H0 was rejected with statistic 19054.0 and p-value 0.040774335347854795. Then the alternative hypothesis service_timetime_to_complete < NON_service_timetime_to_complete was acepted with statistic 19054.0 and p-value 0.020387167673927398, with Cohen’s d effect size = 0.05904668645382262 [0.2 <= d (small), 0.5 <= d (medium), 0.8 <= d (large)].


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["label_crunched"] = df_issues_no_outliers["labels"].apply(convert_label_to_number)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_outliers["closed_at"] = df_issues["closed_at"].apply(datetime.timestamp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_issues_no_o