In [171]:
def load_arff(file_paths):
    list_of_lists = []

    for file_path in file_paths:
        with open(file_path, 'r') as f:
            lines = f.readlines()

        data_start = lines.index('@data\n') + 1
        attributes = [line.split()[1] for line in lines if line.startswith('@attribute')]

        data_list = []
        for line in lines[data_start:]:
            values = line.strip().split(',')
            data_dict = {attr: float(val) if val.replace('.', '', 1).isdigit() else val for attr, val in zip(attributes, values)}
            data_list.append(data_dict)

        # Convert the list of dictionaries to a list of lists
        list_of_lists.extend([[row[attr] for attr in attributes] for row in data_list])

    return list_of_lists, attributes



# Remove missing values and perform standardization to achieve O(n) runtime
def remove_missing_values(data_list):
    cleaned_data = []

    # Calculate column means and standard deviations
    column_means = [sum(row[j] for row in data_list if isinstance(row[j], (int, float))) / len(data_list) for j in range(2, len(data_list[0]))]
    column_std_devs = [((sum((row[j] - column_means[j - 2]) ** 2 for row in data_list if isinstance(row[j], (int, float)))) / len(data_list)) ** 0.5 for j in range(2, len(data_list[0]))]

    for row in data_list:
        cleaned_row = row[:2]  # Keep the first two columns unchanged

        for j in range(2, len(row)):
            if isinstance(row[j], (int, float)):
                # Standardization
                cleaned_row.append((row[j] - column_means[j - 2]) / column_std_devs[j - 2])
            else:
                cleaned_row.append(0)

        cleaned_data.append(cleaned_row)

    return cleaned_data



def main():
    V4_data_2017 = 'V4_data/2017.arff'
    V4_data_2018 = 'V4_data/2018.arff'

    file_paths = [V4_data_2017, V4_data_2018]

    data_list, attributes = load_arff(file_paths)

    # Print the header
    print(attributes)

    # Remove missing values and calculate for standardization
    data_list = remove_missing_values(data_list)

    # Print the result
    for row in data_list:
        print(row)

if __name__ == "__main__":
    main()


['Num', 'Country', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'S']
[10.0, 'Hungary', 0.013989980615209018, -0.03337145579034831, -0.03287108476624076, -0.0566172698324833, -0.03388397796468453, 0.00863279512375896, -0.06894116315784725, -0.03335207266503263, -0.03335300982374853, 0.019219750789737144, -0.03230419676225334, -0.04350257734192088, 0.11296566715325072, -0.03290602596924057, -0.0665071332909982, -0.01392184788147024, -0.03898230923775427, 0.08515155898393659, -0

NameError: name 'eig' is not defined

In [163]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def load_arff(file_paths):
    dfs = []

    for file_path in file_paths:
        with open(file_path, 'r') as f:
            lines = f.readlines()

        data_start = lines.index('@data\n') + 1
        attributes = [line.split()[1] for line in lines if line.startswith('@attribute')]

        data_list = []
        for line in lines[data_start:]:
            values = line.strip().split(',')
            data_dict = {attr: float(val) if val.replace('.', '', 1).isdigit() else val for attr, val in zip(attributes, values)}
            data_list.append(data_dict)

        df = pd.DataFrame(data_list)
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

def interpolate_missing_values(df):
    numeric_columns = df.iloc[:, 2:].select_dtypes(include=['float64']).columns
    df[numeric_columns] = df[numeric_columns].interpolate()
    return df

def standardize_numeric_columns(df):
    numeric_columns = df.iloc[:, 2:].select_dtypes(include=['float64']).columns
    scaler = StandardScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    return df

def main():
    V4_data_2017 = 'V4_data/2017.arff'
    file_paths = [V4_data_2017]
    
    df_combined = load_arff(file_paths)
    df_combined = interpolate_missing_values(df_combined)
    df_combined = standardize_numeric_columns(df_combined)

    print(df_combined)

if __name__ == "__main__":
    main()


       Num   Country     X1    X2    X3    X4     X5     X6     X7    X8  ...  \
0     10.0   Hungary   0.14  0.53  0.19  1.41   0.33   0.14   0.89  1.08  ...   
1     22.0    Poland   0.01   0.5  0.07   1.4   0.06   0.03   1.01  0.65  ...   
2     27.0   Hungary   0.03  0.74  0.01  1.02    0.0   0.03   0.35  0.93  ...   
3     73.0    Poland    0.0  0.58  0.15  1.29   0.22   0.01   0.72  0.85  ...   
4     74.0    Poland    0.0   0.0   0.0   0.0    0.0    0.0    0.0   0.0  ...   
..     ...       ...    ...   ...   ...   ...    ...    ...    ...   ...  ...   
445  404.0  Slovakia   0.03  0.22  0.03  1.58    0.0   0.03   3.48  0.18  ...   
446  423.0    Poland    0.0   0.0   0.0   0.0    0.0    0.0    0.0   0.0  ...   
447  427.0  Slovakia  -0.07  0.08  0.03   2.5  -0.68  -0.08  11.39  0.13  ...   
448  432.0    Poland    0.0   0.0   0.0   0.0    0.0    0.0    0.0   0.0  ...   
449  438.0  Slovakia   0.07  0.01  0.93  66.5   0.26   0.07  69.76  0.01  ...   

     X74  X75  X76    X77  