In [1]:
import pandas as pd
from scipy.stats import mannwhitneyu
import math

In [2]:
def read_file(file, sample_size=None, random_state=None):
    """Load a ratio file into a DataFrame with the columns 'Project' and 'Ratio'.

    Every line is split on ``[`` and on ``]`` followed by one whitespace
    character and ``=``. The three resulting fields are named 'Variable',
    'Project' and 'Ratio'; only the last two are kept.
    (Presumably lines look like ``name[project] = value`` -- verify against
    the data files.)

    Parameters
    ----------
    file
        Path or buffer handed to ``pd.read_csv``.
    sample_size : int, optional
        When given, this many rows are drawn with ``DataFrame.sample``
        instead of returning every row.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a draw can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns 'Project' and 'Ratio', with 'Ratio' converted to numeric.

    Raises
    ------
    ValueError
        From ``pd.to_numeric`` if a ratio cannot be parsed, or from
        ``DataFrame.sample`` if ``sample_size`` exceeds the number of rows.
    """
    df = pd.read_csv(
        file,
        sep=r"\[|\]\s=",
        index_col=False,
        skipinitialspace=True,
        names=['Variable', 'Project', 'Ratio'],
        engine='python',
        usecols=['Project', 'Ratio'],
    )
    # The value field keeps the blank that follows '=', so missing ratios show
    # up as the literal string " NaN"; they are counted as a ratio of 0.
    df['Ratio'] = df['Ratio'].replace(" NaN", 0)
    df['Ratio'] = pd.to_numeric(df['Ratio'])
    if sample_size is not None:
        df = df.sample(sample_size, random_state=random_state)
    return df


# Aufgabe 2 a)

In [3]:
# Settings for Aufgabe 2 a): rows drawn per language and the significance
# level the Mann-Whitney U p-value is compared against.
sample_size = 100_000
significance_niveau = 0.01

python_try_statement_ratio_df = read_file('./python_ratio.txt', sample_size)
java_try_statement_ratio_df = read_file('./java_ratio.txt', sample_size)

python_try_statement_ratio_mean = python_try_statement_ratio_df['Ratio'].mean()
python_try_statement_ratio_variance = python_try_statement_ratio_df['Ratio'].var()

java_try_statement_ratio_mean = java_try_statement_ratio_df['Ratio'].mean()
java_try_statement_ratio_variance = java_try_statement_ratio_df['Ratio'].var()

# Cohen's d: absolute mean difference divided by the square root of the
# average of both variances. Both groups hold sample_size rows here, so the
# unweighted average treats them equally by construction.
cohens_d = abs(python_try_statement_ratio_mean - java_try_statement_ratio_mean) / \
            math.sqrt((python_try_statement_ratio_variance+java_try_statement_ratio_variance)/2)


print("Aufgabe 2 a)")
print('Python Try Statment Ratio Mean:' + str(python_try_statement_ratio_mean))
print('Python Try Statment Ratio Variance:'+str(python_try_statement_ratio_variance))
print('Java Try Statment Ratio Mean:' + str(java_try_statement_ratio_mean))
print('Java Try Statment Ratio Variance:' + str(java_try_statement_ratio_variance))

results = mannwhitneyu(java_try_statement_ratio_df['Ratio'], python_try_statement_ratio_df['Ratio'])

print("Mann-Whitney U Test p-value: " + str(results.pvalue))
# Compare against the configured level instead of repeating the literal, so
# changing significance_niveau actually changes the decision.
print("Are the results significant? " + str(results.pvalue < significance_niveau))

print(f"Cohend's d (Effektstärke): {cohens_d}")
print()

Aufgabe 2 a)
Python Try Statment Ratio Mean:0.006888994687144517
Python Try Statment Ratio Variance:7.869056619312679e-05
Java Try Statment Ratio Mean:0.008939431849949343
Java Try Statment Ratio Variance:0.00019253887670649068
Mann-Whitney U Test p-value: 3.265529996073257e-264
Are the results significant? True
Cohend's d (Effektstärke): 0.1760731145067849



# Aufgabe 2 b)

In [4]:
# Working samples for Aufgabe 2 b); mannwhitneyu_p draws its sub-samples
# from these two frames.
python_start = read_file(file='./python_ratio.txt', sample_size=sample_size)
java_start = read_file(file='./java_ratio.txt', sample_size=sample_size)

def mannwhitneyu_p(n):
    """Return the Mann-Whitney U p-value for one random draw of n rows per language.

    Draws n rows from the module-level ``python_start`` frame and then n rows
    from ``java_start``, and compares their 'Ratio' columns. The Java ratios
    are passed to ``mannwhitneyu`` as the first sample.
    """
    python_ratios = python_start.sample(n)['Ratio']
    java_ratios = java_start.sample(n)['Ratio']
    test_result = mannwhitneyu(java_ratios, python_ratios)
    return test_result.pvalue

# Binary search over the sample size: find where the averaged Mann-Whitney U
# p-value crosses the significance level.
#
# mannwhitneyu_p sub-samples from python_start / java_start, so the largest
# size that can be drawn is the smaller of the two frames. Deriving the bound
# from them keeps it in step with sample_size instead of repeating the number.
uppper_bound = min(len(python_start), len(java_start))
lower_bound = 1
test_per_iteration = 50

counter = 0

print("Aufgabe 2 b)")
while True:
    current_sample_size = (uppper_bound + lower_bound) // 2

    # A single draw is noisy, so the p-value is averaged over several
    # independent draws of the same size.
    pvalues = [mannwhitneyu_p(current_sample_size) for _ in range(test_per_iteration)]
    p = sum(pvalues) / len(pvalues)
    is_significant = p < significance_niveau

    print(f"#{counter}, upper_bound: {uppper_bound}, lower_bound={lower_bound}, size: {current_sample_size}, is_significant: {is_significant}")
    counter+=1

    if abs(uppper_bound - lower_bound) <= 1:
        # The bounds have met: repeat the final state as the closing line.
        print(f"#{counter}, upper_bound: {uppper_bound}, lower_bound={lower_bound}, size: {current_sample_size}, is_significant: {is_significant}")
        break

    # Significant at this size -> try smaller samples; otherwise go larger.
    if is_significant:
        uppper_bound = current_sample_size
    else:
        lower_bound = current_sample_size
print()

Aufgabe 2 b)
#0, upper_bound: 100000, lower_bound=1, size: 50000, is_significant: True
#1, upper_bound: 50000, lower_bound=1, size: 25000, is_significant: True
#2, upper_bound: 25000, lower_bound=1, size: 12500, is_significant: True
#3, upper_bound: 12500, lower_bound=1, size: 6250, is_significant: True
#4, upper_bound: 6250, lower_bound=1, size: 3125, is_significant: True
#5, upper_bound: 3125, lower_bound=1, size: 1563, is_significant: True
#6, upper_bound: 1563, lower_bound=1, size: 782, is_significant: False
#7, upper_bound: 1563, lower_bound=782, size: 1172, is_significant: False
#8, upper_bound: 1563, lower_bound=1172, size: 1367, is_significant: True
#9, upper_bound: 1367, lower_bound=1172, size: 1269, is_significant: True
#10, upper_bound: 1269, lower_bound=1172, size: 1220, is_significant: True
#11, upper_bound: 1220, lower_bound=1172, size: 1196, is_significant: True
#12, upper_bound: 1196, lower_bound=1172, size: 1184, is_significant: True
#13, upper_bound: 1184, lower_bound

# Aufgabe 3

In [5]:
# Aufgabe 3: effect size on everything read_file returns when no sample_size
# is passed (no sub-sampling).
python_try_statement_ratio_df = read_file('./python_ratio.txt')
java_try_statement_ratio_df = read_file('./java_ratio.txt')

python_ratio_column = python_try_statement_ratio_df['Ratio']
java_ratio_column = java_try_statement_ratio_df['Ratio']

python_try_statement_ratio_mean = python_ratio_column.mean()
python_try_statement_ratio_variance = python_ratio_column.var()

java_try_statement_ratio_mean = java_ratio_column.mean()
java_try_statement_ratio_variance = java_ratio_column.var()

# Cohen's d = |mean difference| / sqrt(average of both variances).
# NOTE(review): the plain average weights both groups equally; if the two
# files differ in row count a size-weighted pooled variance may be intended
# -- confirm.
mean_gap = abs(python_try_statement_ratio_mean - java_try_statement_ratio_mean)
averaged_variance = (python_try_statement_ratio_variance+java_try_statement_ratio_variance)/2
cohens_d = mean_gap / math.sqrt(averaged_variance)

print("Aufgabe 3")
print(f"Cohend's d (Effektstärke): {cohens_d}")

Aufgabe 3
Cohend's d (Effektstärke): 0.17501356403969534
