In [6]:
import os
from collections import defaultdict
import re

In [7]:
def load_txt_files(directory):
    # load all txt files in a directory
    data = defaultdict(list)
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r') as file:
                data[filename] = file.read()
    return data

data = load_txt_files('.')

In [8]:
data

defaultdict(list,
            {'results_deezer_0.txt': 'K 20 T 10 alpha_word 0.2 alpha_ent 0.30: \n51.4062\n\n\nK 20 T 10 alpha_word 0.3 alpha_ent 0.30: \n51.4062\n\n\nK 20 T 10 alpha_word 0.4 alpha_ent 0.30: \n50.824000000000005\n\n\nK 20 T 10 alpha_word 0.5 alpha_ent 0.30: \n50.1422\n\n\nK 20 T 10 alpha_word 0.2 alpha_ent 0.40: \n59.6488\n\n\nK 20 T 10 alpha_word 0.3 alpha_ent 0.40: \n59.6488\n\n\nK 20 T 10 alpha_word 0.4 alpha_ent 0.40: \n59.42040000000001\n\n\nK 20 T 10 alpha_word 0.5 alpha_ent 0.40: \n59.0009\n\n\nK 50 T 10 alpha_word 0.2 alpha_ent 0.30: \n51.9509\n\n\nK 50 T 10 alpha_word 0.3 alpha_ent 0.30: \n51.5633\n\n\nK 50 T 10 alpha_word 0.4 alpha_ent 0.30: \n51.477300000000014\n\n\nK 50 T 10 alpha_word 0.5 alpha_ent 0.30: \n48.678399999999996\n\n\nK 50 T 10 alpha_word 0.2 alpha_ent 0.40: \n57.787600000000005\n\n\nK 50 T 10 alpha_word 0.3 alpha_ent 0.40: \n57.6685\n\n\nK 50 T 10 alpha_word 0.4 alpha_ent 0.40: \n56.3269\n\n\nK 50 T 10 alpha_word 0.5 alpha_ent 0.40: \n58.888\

In [9]:
def preprocess_data(data_dict):
    # Regular expression pattern to extract data
    pattern = r'K (\d+) T (\d+) alpha_word (\d\.\d) alpha_ent (\d\.\d+): \n(\d+\.\d+)'

    # Dictionary to store the processed data
    processed_data = defaultdict(list)

    # Looping through each file's data in the dictionary
    for file_name, file_data in data_dict.items():
        # Finding all matches of the pattern in the current file's data
        matches = re.findall(pattern, file_data)

        # Adding the extracted data to the processed_data dictionary
        for match in matches:
            K, T, alpha_word, alpha_ent, value = match
            processed_data[file_name].append({
                'K': int(K),
                'T': int(T),
                'alpha_word': float(alpha_word),
                'alpha_ent': float(alpha_ent),
                'value': float(value)
            })

    return processed_data

processed = preprocess_data(data)

In [10]:
processed

defaultdict(list,
            {'results_deezer_0.txt': [{'K': 20,
               'T': 10,
               'alpha_word': 0.2,
               'alpha_ent': 0.3,
               'value': 51.4062},
              {'K': 20,
               'T': 10,
               'alpha_word': 0.3,
               'alpha_ent': 0.3,
               'value': 51.4062},
              {'K': 20,
               'T': 10,
               'alpha_word': 0.4,
               'alpha_ent': 0.3,
               'value': 50.824000000000005},
              {'K': 20,
               'T': 10,
               'alpha_word': 0.5,
               'alpha_ent': 0.3,
               'value': 50.1422},
              {'K': 20,
               'T': 10,
               'alpha_word': 0.2,
               'alpha_ent': 0.4,
               'value': 59.6488},
              {'K': 20,
               'T': 10,
               'alpha_word': 0.3,
               'alpha_ent': 0.4,
               'value': 59.6488},
              {'K': 20,
               'T': 10,
     

In [12]:
# Now calculate mean, emdian, standard deviation, variance and range for the value for each setting of K, T, alpha_word and alpha_ent
from statistics import mean, median, stdev, variance

def calculate_statistics(processed_data):
    # Dictionary to store the calculated statistics
    statistics = defaultdict(dict)

    # Looping through each file's data in the dictionary
    for file_name, file_data in processed_data.items():
        # Looping through each setting in the file's data
        for setting in file_data:
            K = setting['K']
            T = setting['T']
            alpha_word = setting['alpha_word']
            alpha_ent = setting['alpha_ent']
            value = setting['value']

            # If the setting is not in the dictionary, add it
            if (K, T, alpha_word, alpha_ent) not in statistics:
                statistics[(K, T, alpha_word, alpha_ent)] = []

            # Add the value to the list of values for the setting
            statistics[(K, T, alpha_word, alpha_ent)].append(value)

    # Looping through each setting in the dictionary
    for setting, values in statistics.items():
        # Calculate the statistics for the values
        statistics[setting] = {
            'mean': mean(values),
            'median': median(values),
            'stdev': stdev(values),
            'variance': variance(values),
            'range': max(values) - min(values)
        }

    return statistics

statistics = calculate_statistics(processed)
statistics

defaultdict(dict,
            {(20, 10, 0.2, 0.3): {'mean': 51.4062,
              'median': 51.4062,
              'stdev': 0.0,
              'variance': 0.0,
              'range': 0.0},
             (20, 10, 0.3, 0.3): {'mean': 51.4062,
              'median': 51.4062,
              'stdev': 0.0,
              'variance': 0.0,
              'range': 0.0},
             (20, 10, 0.4, 0.3): {'mean': 50.824000000000005,
              'median': 50.824000000000005,
              'stdev': 0.0,
              'variance': 0.0,
              'range': 0.0},
             (20, 10, 0.5, 0.3): {'mean': 50.1422,
              'median': 50.1422,
              'stdev': 0.0,
              'variance': 0.0,
              'range': 0.0},
             (20, 10, 0.2, 0.4): {'mean': 55.9246,
              'median': 55.37639999999999,
              'stdev': 1.7165107398440598,
              'variance': 2.946409120000002,
              'range': 5.436800000000005},
             (20, 10, 0.3, 0.4): {'mean': 55.92

In [13]:
# alright now create a table of these sttatistics
import pandas as pd

def create_table(statistics):
    # Create a DataFrame from the statistics dictionary
    df = pd.DataFrame(statistics).T

    # Rename the columns
    df.columns = ['mean', 'median', 'stdev', 'variance', 'range']

    return df

table = create_table(statistics)
table

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,mean,median,stdev,variance,range
20,10,0.2,0.3,51.4062,51.4062,0.0,0.0,0.0
20,10,0.3,0.3,51.4062,51.4062,0.0,0.0,0.0
20,10,0.4,0.3,50.824,50.824,0.0,0.0,0.0
20,10,0.5,0.3,50.1422,50.1422,0.0,0.0,0.0
20,10,0.2,0.4,55.9246,55.3764,1.716511,2.946409,5.4368
20,10,0.3,0.4,55.9246,55.3764,1.716511,2.946409,5.4368
20,10,0.4,0.4,55.6962,55.148,1.716511,2.946409,5.4368
20,10,0.5,0.4,55.2767,54.7285,1.716511,2.946409,5.4368
50,10,0.2,0.3,51.9509,51.9509,0.0,0.0,0.0
50,10,0.3,0.3,51.5633,51.5633,0.0,0.0,0.0


In [21]:
# sort table by the values in the index
table = table.sort_index()
table

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,mean,median,stdev,variance,range
20,10,0.2,0.3,51.4062,51.4062,0.0,0.0,0.0
20,10,0.2,0.4,55.9246,55.3764,1.716511,2.946409,5.4368
20,10,0.3,0.3,51.4062,51.4062,0.0,0.0,0.0
20,10,0.3,0.4,55.9246,55.3764,1.716511,2.946409,5.4368
20,10,0.4,0.3,50.824,50.824,0.0,0.0,0.0
20,10,0.4,0.4,55.6962,55.148,1.716511,2.946409,5.4368
20,10,0.5,0.3,50.1422,50.1422,0.0,0.0,0.0
20,10,0.5,0.4,55.2767,54.7285,1.716511,2.946409,5.4368
50,10,0.2,0.3,51.9509,51.9509,0.0,0.0,0.0
50,10,0.2,0.4,55.62355,55.9729,2.114889,4.472754,5.3937


In [19]:
# also apply cv to the value for each setting of K, T, alpha_word and alpha_ent
def calculate_cv(processed_data):
    # Dictionary to store the calculated CVs
    cvs = defaultdict(dict)

    # Looping through each file's data in the dictionary
    for file_name, file_data in processed_data.items():
        # Looping through each setting in the file's data
        for setting in file_data:
            K = setting['K']
            T = setting['T']
            alpha_word = setting['alpha_word']
            alpha_ent = setting['alpha_ent']
            value = setting['value']

            # If the setting is not in the dictionary, add it
            if (K, T, alpha_word, alpha_ent) not in cvs:
                cvs[(K, T, alpha_word, alpha_ent)] = []

            # Add the value to the list of values for the setting
            values = cvs[(K, T, alpha_word, alpha_ent)]
            values.append(value)

            # If there are at least two values, calculate the CV
            if len(values) >= 2:
                values.append(stdev(values) / mean(values))

    # Looping through each setting in the dictionary
    for setting, values in cvs.items():
        # Calculate the CV for the values
        cvs[setting] = mean(values)

    return cvs

cvs = calculate_cv(processed)
cvs

defaultdict(dict,
            {(20, 10, 0.2, 0.3): 27.187532200001243,
             (20, 10, 0.3, 0.3): 27.187532200001243,
             (20, 10, 0.4, 0.3): 26.883694931958544,
             (20, 10, 0.5, 0.3): 26.527876308193857,
             (20, 10, 0.2, 0.4): 29.548509904886895,
             (20, 10, 0.3, 0.4): 29.548509904886895,
             (20, 10, 0.4, 0.4): 29.429331599452095,
             (20, 10, 0.5, 0.4): 29.21043775598385,
             (50, 10, 0.2, 0.3): 27.471797450236917,
             (50, 10, 0.3, 0.3): 27.26951890099313,
             (50, 10, 0.4, 0.3): 27.224637591030945,
             (50, 10, 0.5, 0.3): 25.763937626658947,
             (50, 10, 0.2, 0.4): 29.389405351076526,
             (50, 10, 0.3, 0.4): 29.327255739972923,
             (50, 10, 0.4, 0.4): 29.37920144025429,
             (50, 10, 0.5, 0.4): 29.963622178511176,
             (100, 10, 0.2, 0.3): 26.509331327456646,
             (100, 10, 0.3, 0.3): 27.449826571561204,
             (100, 10, 0.4, 0

In [20]:
# make cv table
def create_cv_table(cvs):
    # Create a DataFrame from the CVs dictionary
    df = pd.DataFrame(cvs, index=['cv']).T

    return df

cv_table = create_cv_table(cvs)
cv_table

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,cv
20,10,0.2,0.3,27.187532
20,10,0.3,0.3,27.187532
20,10,0.4,0.3,26.883695
20,10,0.5,0.3,26.527876
20,10,0.2,0.4,29.54851
20,10,0.3,0.4,29.54851
20,10,0.4,0.4,29.429332
20,10,0.5,0.4,29.210438
50,10,0.2,0.3,27.471797
50,10,0.3,0.3,27.269519


In [34]:
# do the shapiro test for normality on the aggregated data
from scipy.stats import shapiro

p_values = table['mean']
shapiro(p_values)


ShapiroResult(statistic=0.9014052748680115, pvalue=0.0067262486554682255)

In [39]:
# alright now do the homogeneity of variances test of levene for the aggregated data
from scipy.stats import levene

b_values = [
    50.2, 53.1, 48.5, 53.3, 53.2, 56.4, 52.5, 56.3,
    48.9, 49.2, 52.1, 50.9, 51.5, 52.6, 56.3, 60.6, 
    51.4, 50.8, 51.5, 55.3, 52.2, 48.1, 50.8, 54.9, 
    48.4, 50.6, 49.8, 51.6, 50, 49, 55.4, 53.3
    ]

p_values = table['mean']
levene(p_values, b_values)


LeveneResult(statistic=0.13202522718070456, pvalue=0.7175773733918681)

In [38]:
# alright now let's do the paired t test for the aggregated data
from scipy.stats import ttest_rel

p_values = table['mean']

b_values = [
    50.2, 53.1, 48.5, 53.3, 53.2, 56.4, 52.5, 56.3,
    48.9, 49.2, 52.1, 50.9, 51.5, 52.6, 56.3, 60.6, 
    51.4, 50.8, 51.5, 55.3, 52.2, 48.1, 50.8, 54.9, 
    48.4, 50.6, 49.8, 51.6, 50, 49, 55.4, 53.3
    ]
ttest_rel(p_values, b_values)

TtestResult(statistic=0.7256256632667231, pvalue=0.47351109185532136, df=31)