In [231]:
def load_arff(file_paths):
    list_of_lists = []

    for file_path in file_paths:
        with open(file_path, 'r') as f:
            lines = f.readlines()

        data_start = lines.index('@data\n') + 1
        attributes = [line.split()[1] for line in lines if line.startswith('@attribute')]

        data_list = []
        for line in lines[data_start:]:
            values = line.strip().split(',')
            data_dict = {attr: float(val) if val.replace('.', '', 1).isdigit() else val for attr, val in zip(attributes, values)}
            data_list.append(data_dict)

        # Convert the list of dictionaries to a list of lists
        list_of_lists.extend([[row[attr] for attr in attributes] for row in data_list])

    return list_of_lists, attributes



# Remove missing values and perform standardization to achieve O(n) runtime
def remove_missing_values(data_list):
    cleaned_data = []

    # Calculate column means and standard deviations
    column_means = [sum(row[j] for row in data_list if isinstance(row[j], (int, float))) / len(data_list) for j in range(2, len(data_list[0]))]
    column_std_devs = [((sum((row[j] - column_means[j - 2]) ** 2 for row in data_list if isinstance(row[j], (int, float)))) / len(data_list)) ** 0.5 for j in range(2, len(data_list[0]))]

    for row in data_list:
        cleaned_row = row[:2]  # Keep the first two columns unchanged

        for j in range(2, len(row)):
            if isinstance(row[j], (int, float)):
                # Standardization
                cleaned_row.append((row[j] - column_means[j - 2]) / column_std_devs[j - 2])
            else:
                cleaned_row.append(0)

        cleaned_data.append(cleaned_row)

    return cleaned_data



#Calculating the covariance matrix for each data from the dataset
def is_numeric(value):
    return isinstance(value, (int, float))

def covariance_matrix(data_list):
    num_rows = len(data_list)
    num_columns = len(data_list[0])

    cov_matrix = [[0] * (num_columns - 2) for _ in range(num_columns - 2)]

    for x in range(2, num_columns):
        for y in range(2, num_columns):
            if is_numeric(data_list[0][x]) and is_numeric(data_list[0][y]):
                sum_value = sum(data_list[i][x] * data_list[i][y] for i in range(num_rows))
                cov_matrix[x-2][y-2] = sum_value / num_rows
            
    return cov_matrix



def get_pca(data_list2, n_components=3):
    # Calculate eigenvalues and eigenvectors of the covariance matrix
    eigenvalues, eigenvectors = calculate_eigen(data_list2)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = sorted(range(len(eigenvalues)), key=lambda k: eigenvalues[k], reverse=True)
    eigenvalues = [eigenvalues[i] for i in sorted_indices]
    eigenvectors = [eigenvectors[i] for i in sorted_indices]

    # Select the top n_components eigenvectors
    selected_eigenvectors = eigenvectors[:n_components]

    # Perform PCA transformation
    pca_result = matrix_multiply(data_list2, transpose(selected_eigenvectors))

    return pca_result


def calculate_eigen(data_list2):
    # Calculate eigenvalues and eigenvectors using the power iteration method
    num_rows = len(data_list2)
    num_columns = len(data_list2[0])

    # Initialize a random vector for power iteration
    vector = [1.0] * (num_columns - 2)

    # Power iteration
    for _ in range(1000):  # Adjust the number of iterations as needed
        vector = matrix_multiply(data_list2, [vector])
        vector = normalize(vector[0])

    # Calculate eigenvalue and eigenvector
    eigenvalue = dot_product(vector, matrix_multiply(data_list2, [vector])[0])
    eigenvector = normalize(vector)

    return [eigenvalue], [eigenvector]


def matrix_multiply(matrix, vector):
    return [[sum(a * b for a, b in zip(row, col)) for col in zip(*vector)] for row in matrix]


def transpose(matrix):
    return [[matrix[j][i] for j in range(len(matrix))] for i in range(len(matrix[0]))]


def dot_product(vector1, vector2):
    return sum(a * b for a, b in zip(vector1, vector2))


def normalize(vector):
    magnitude = (sum(x ** 2 for x in vector)) ** 0.5
    return [x / magnitude for x in vector]

def calculate_svd(data_list2):
    # Calculate SVD using power iteration
    num_rows = len(data_list2)
    num_columns = len(data_list2[0])

    # Initialize random vectors for power iteration
    u_vector = [1.0] * (num_rows - 2)
    v_vector = [1.0] * (num_columns - 2)

    # Power iteration for U
    for _ in range(1000):  # Adjust the number of iterations as needed
        u_vector = matrix_multiply(transpose(data_list2), [v_vector])
        u_vector = normalize(u_vector[0])

    # Power iteration for V
    for _ in range(1000):  # Adjust the number of iterations as needed
        v_vector = matrix_multiply(data_list2, [u_vector])
        v_vector = normalize(v_vector[0])

    # Calculate singular values
    singular_values = [dot_product(u_vector, matrix_multiply(data_list2, [v_vector])[0])]

    return u_vector, singular_values, v_vector


def main():
    V4_data_2017 = 'V4_data/2017.arff'
    V4_data_2018 = 'V4_data/2018.arff'

    file_paths = [V4_data_2017, V4_data_2018]

    data_list, attributes = load_arff(file_paths)

    # Print the header
    print(attributes)

    # Remove missing values and calculate for standardization
    data_list = remove_missing_values(data_list)
    data_list2 = covariance_matrix(data_list)
    
    # Perform PCA on data_list2 with 3 components
    pca_result = get_pca(data_list2, n_components=3)

    # Calculate SVD
    u_vector, singular_values, v_vector = calculate_svd(data_list2)
    
    # Print the result
    for row in data_list2:
        print(row)

    # Print the PCA result
    print("PCA Result:")
    for row in pca_result:
        print(row)

    # Print the SVD result
    print("\nSVD Result:")
    print("U Vector:", u_vector)
    print("Singular Values:", singular_values)
    print("V Vector:", v_vector)

if __name__ == "__main__":
    main()


['Num', 'Country', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'S']
[1.000000000000001, -0.00046598840774238936, -7.386592524141464e-05, 0.001871873088390433, 0.0005817625937536598, 0.9998363613989701, -0.0006743600412310786, -0.00046569074964108575, -0.0004688184955923462, 0.9998611017781036, 0.9962122760522042, 0.957829855256485, 0.01249107234253652, 0.9962956321137905, -0.0005462334846589197, 0.9989153159060624, 0.9919284168567686, 0.9982472717897616, 0.9929955259185308, 