In [261]:
def _parse_arff_value(text):
    """Return *text* as a float when it is a finite number, else unchanged."""
    try:
        number = float(text)
    except ValueError:
        return text
    # Keep 'nan' / 'inf' spellings as strings so they are not mistaken for
    # real measurements by later numeric processing.
    if number != number or number in (float('inf'), float('-inf')):
        return text
    return number


def load_arff(file_paths):
    """Read ARFF files and return their data rows as one combined list.

    Args:
        file_paths: Iterable of paths to ARFF files.

    Returns:
        A tuple ``(rows, attributes)``. ``rows`` holds one list per data
        line of every file, in file order; each cell is a float when the
        text is a finite number (signs, decimals and exponents accepted)
        and the stripped original string otherwise. ``attributes`` holds
        the attribute names of the last file read, or ``[]`` when
        ``file_paths`` is empty.

    Raises:
        ValueError: If a file has no ``@data`` line, or a data line has
            fewer values than there are declared attributes.
    """
    list_of_lists = []
    attributes = []

    for file_path in file_paths:
        with open(file_path, 'r') as f:
            lines = [line.strip() for line in f]

        # Header keywords are matched case-insensitively.
        attributes = []
        data_start = None
        for index, line in enumerate(lines):
            keyword = line.lower()
            if keyword.startswith('@attribute'):
                attributes.append(line.split()[1])
            elif keyword == '@data':
                data_start = index + 1
                break
        if data_start is None:
            raise ValueError("no @data section found in {}".format(file_path))

        for line in lines[data_start:]:
            # Skip blank lines and '%' comment lines inside the data section.
            if not line or line.startswith('%'):
                continue
            values = [value.strip() for value in line.split(',')]
            if len(values) < len(attributes):
                raise ValueError(
                    "data line in {} has {} values, expected {}".format(
                        file_path, len(values), len(attributes)))
            # Values beyond the declared attributes are ignored.
            list_of_lists.append(
                [_parse_arff_value(value) for value in values[:len(attributes)]])

    return list_of_lists, attributes



# Replace missing (non-numeric) values and standardize the numeric columns
def remove_missing_values(data_list):
    """Standardize every column after the first two, filling gaps with 0.

    The first two cells of each row are copied unchanged. For each later
    column the mean and population standard deviation are computed from
    the numeric (``int``/``float``) cells only, so missing entries do not
    bias the statistics. Numeric cells become ``(value - mean) / std``.
    Non-numeric cells become 0, which is the column mean after
    standardization. A column with zero spread (or no numeric cells) is
    set to 0 instead of dividing by zero.

    Args:
        data_list: List of rows; the number of columns is taken from the
            first row.

    Returns:
        A new list of rows; ``[]`` when ``data_list`` is empty.
    """
    if not data_list:
        return []

    # Per-column statistics over the observed numeric values only.
    column_means = []
    column_std_devs = []
    for j in range(2, len(data_list[0])):
        observed = [row[j] for row in data_list if isinstance(row[j], (int, float))]
        if observed:
            mean = sum(observed) / len(observed)
            variance = sum((value - mean) ** 2 for value in observed) / len(observed)
        else:
            mean = 0.0
            variance = 0.0
        column_means.append(mean)
        column_std_devs.append(variance ** 0.5)

    cleaned_data = []
    for row in data_list:
        cleaned_row = row[:2]  # Keep the first two columns unchanged

        for j in range(2, len(row)):
            value = row[j]
            std_dev = column_std_devs[j - 2]
            if isinstance(value, (int, float)) and std_dev > 0:
                cleaned_row.append((value - column_means[j - 2]) / std_dev)
            else:
                # Missing value or constant column: use the standardized mean.
                cleaned_row.append(0.0)

        cleaned_data.append(cleaned_row)

    return cleaned_data



# Type test used when building the covariance matrix from the data_list
def is_numeric(value):
    """Return True when *value* is an ``int`` or ``float`` instance."""
    for numeric_type in (int, float):
        if isinstance(value, numeric_type):
            return True
    return False

def covariance_matrix(data_list):
    """Return the population covariance matrix of columns 2 onward.

    Each column's mean is subtracted before the products are averaged, so
    the result is a true covariance even when the input has not been
    centered beforehand. For already-centered data this equals the plain
    average of products.

    Args:
        data_list: List of rows; the first two cells of every row are
            skipped. The number of columns is taken from the first row.

    Returns:
        A square list-of-lists with ``len(data_list[0]) - 2`` rows and
        columns, dividing by the number of rows. Entries involving a
        column that contains any non-``int``/``float`` cell are left at 0.
        Returns ``[]`` when ``data_list`` is empty.
    """
    if not data_list:
        return []

    num_rows = len(data_list)
    num_columns = len(data_list[0])

    cov_matrix = [[0] * (num_columns - 2) for _ in range(num_columns - 2)]

    # Center each fully numeric column once; other columns are skipped.
    centered = {}
    for col in range(2, num_columns):
        values = [row[col] for row in data_list]
        if all(isinstance(value, (int, float)) for value in values):
            mean = sum(values) / num_rows
            centered[col] = [value - mean for value in values]

    for x in range(2, num_columns):
        if x not in centered:
            continue
        # The matrix is symmetric, so compute the upper triangle and mirror it.
        for y in range(x, num_columns):
            if y not in centered:
                continue
            total = sum(a * b for a, b in zip(centered[x], centered[y]))
            covariance = total / num_rows
            cov_matrix[x - 2][y - 2] = covariance
            cov_matrix[y - 2][x - 2] = covariance

    return cov_matrix



def get_pca(data_list2, n_components=3):
    """Project ``data_list2`` onto its leading eigenvectors.

    The eigenpairs come from ``calculate_eigen(data_list2)``. They are
    ranked by descending eigenvalue, the first ``n_components``
    eigenvectors are kept, and ``data_list2`` itself is multiplied by the
    matrix whose columns are those eigenvectors.

    Args:
        data_list2: Matrix (list of rows) that is both decomposed and
            projected.
        n_components: Maximum number of eigenvectors to keep.

    Returns:
        The product as a list of rows, one output column per kept
        eigenvector.
    """
    values, vectors = calculate_eigen(data_list2)

    # Rank eigenpairs from the largest eigenvalue down.
    order = sorted(range(len(values)), key=lambda idx: values[idx], reverse=True)
    ranked_vectors = [vectors[idx] for idx in order]

    # Eigenvectors are stored as rows; transpose so each becomes a column.
    basis = transpose(ranked_vectors[:n_components])
    return matrix_multiply(data_list2, basis)


# Calculate the eigenvalues and eigenvectors used for PCA
def calculate_eigen(data_list2, n_components=None, max_iterations=1000, tolerance=1e-10):
    """Compute eigenpairs of a symmetric matrix by power iteration.

    Each eigenpair is found by repeatedly multiplying a unit vector by the
    working matrix and renormalizing. Once found, its contribution
    ``eigenvalue * v * v^T`` is subtracted from the working matrix
    (deflation) so the next pass finds the next pair. Deflation is only
    valid for symmetric input such as a covariance matrix.

    Args:
        data_list2: Square matrix as a list of rows; it is not modified.
        n_components: Maximum number of eigenpairs to compute; ``None``
            means up to the matrix size.
        max_iterations: Cap on power-iteration steps per eigenpair.
        tolerance: Stop iterating when the vector changes by less than
            this; also the relative size below which the remaining matrix
            is treated as zero.

    Returns:
        ``(eigenvalues, eigenvectors)`` as two parallel lists, in the order
        the pairs were found (normally descending magnitude). Each
        eigenvector is a unit-length list. Fewer pairs than requested are
        returned when the remaining matrix is numerically zero. Returns
        ``([], [])`` for an empty matrix.

    Raises:
        ValueError: If the matrix is not square.
    """
    size = len(data_list2)
    if size == 0:
        return [], []
    if any(len(row) != size for row in data_list2):
        raise ValueError("calculate_eigen expects a square matrix")

    limit = size if n_components is None else max(0, min(n_components, size))
    work = [[float(x) for x in row] for row in data_list2]
    scale = max(abs(x) for row in work for x in row)

    eigenvalues = []
    eigenvectors = []

    for _component in range(limit):
        # Start from the largest row of the working matrix: it lies in the
        # matrix's range, so it cannot be annihilated by the multiplication.
        start = max(work, key=lambda row: sum(x * x for x in row))
        start_norm = sum(x * x for x in start) ** 0.5
        if start_norm <= tolerance * scale:
            break  # nothing but rounding noise is left
        vector = [x / start_norm for x in start]

        for _step in range(max_iterations):
            image = [sum(a * x for a, x in zip(row, vector)) for row in work]
            image_norm = sum(x * x for x in image) ** 0.5
            if image_norm == 0:
                break
            candidate = [x / image_norm for x in image]
            # A negative eigenvalue flips the sign on every step, so accept
            # convergence to either +vector or -vector.
            same = max(abs(a - b) for a, b in zip(candidate, vector))
            flipped = max(abs(a + b) for a, b in zip(candidate, vector))
            vector = candidate
            if min(same, flipped) < tolerance:
                break

        # Rayleigh quotient v . (A v) for the unit vector v.
        image = [sum(a * x for a, x in zip(row, vector)) for row in work]
        eigenvalue = sum(a * b for a, b in zip(vector, image))
        eigenvalues.append(eigenvalue)
        eigenvectors.append(vector)

        # Deflate: remove the component just found from the working matrix.
        for i in range(size):
            factor = eigenvalue * vector[i]
            work_row = work[i]
            for j in range(size):
                work_row[j] -= factor * vector[j]

    return eigenvalues, eigenvectors


def matrix_multiply(matrix, vector):
    """Return the matrix product ``matrix @ vector``.

    Despite its name, ``vector`` is treated as a matrix given as a list of
    rows: its columns are taken with ``zip(*vector)`` and each is dotted
    with every row of ``matrix``. To multiply by a single column vector
    ``v``, pass ``[[x] for x in v]``; passing ``[v]`` supplies a 1-row
    matrix instead.

    Dimension mismatches are not reported: ``zip`` silently stops at the
    shorter operand.

    Args:
        matrix: Left operand, a list of rows.
        vector: Right operand, a list of rows.

    Returns:
        A list of rows with one entry per column of ``vector``.
    """
    # The columns of the right operand are the same for every row, so build
    # them once instead of once per row.
    columns = list(zip(*vector))
    return [[sum(a * b for a, b in zip(row, col)) for col in columns] for row in matrix]

def transpose(matrix):
    """Return the transpose of ``matrix`` (a list of rows).

    The number of columns is taken from the first row. An empty matrix
    transposes to ``[]``.

    Raises:
        IndexError: If a later row is shorter than the first row.
    """
    if not matrix:
        return []
    width = len(matrix[0])
    return [[row[i] for row in matrix] for i in range(width)]


def dot_product(vector1, vector2):
    """Return the sum of pairwise products of the two vectors.

    Pairing stops at the end of the shorter vector.
    """
    total = 0
    for left, right in zip(vector1, vector2):
        total += left * right
    return total


def normalize(vector):
    """Return ``vector`` scaled to unit Euclidean length.

    A vector whose length is zero has no direction, so a zero vector of
    the same size is returned instead of dividing by zero.
    """
    magnitude = (sum(x ** 2 for x in vector)) ** 0.5
    if magnitude == 0:
        return [0.0 for _ in vector]
    return [x / magnitude for x in vector]


# This is used for calculating the dominant singular triplet of the SVD
def calculate_svd(data_list2, max_iterations=1000, tolerance=1e-10):
    """Compute the largest singular value and its singular vectors.

    Power iteration alternates ``u = normalize(A v)`` and
    ``v = normalize(A^T u)`` until ``v`` stops changing, where ``A`` is
    ``data_list2``. The input is expected to be rectangular (every row as
    long as the first).

    Args:
        data_list2: Matrix as a list of rows.
        max_iterations: Cap on the number of alternating steps.
        tolerance: Stop when no component of ``v`` changes by more than
            this between steps.

    Returns:
        ``(u_vector, singular_values, v_vector)``: ``u_vector`` has one
        entry per row of the matrix, ``v_vector`` one entry per column, and
        ``singular_values`` is a one-element list holding the largest
        singular value. An all-zero matrix yields zero vectors and
        ``[0.0]``. An empty matrix yields ``([], [], [])``.
    """
    num_rows = len(data_list2)
    num_columns = len(data_list2[0]) if num_rows else 0
    if num_rows == 0 or num_columns == 0:
        return [], [], []

    def apply(vec):
        # A @ vec
        return [sum(a * x for a, x in zip(row, vec)) for row in data_list2]

    def apply_transposed(vec):
        # A^T @ vec, without building the transposed matrix
        return [sum(row[j] * w for row, w in zip(data_list2, vec))
                for j in range(num_columns)]

    def length(vec):
        return sum(x * x for x in vec) ** 0.5

    # Start from the largest row: it lies in the row space of the matrix, so
    # multiplying by the matrix cannot send it to zero.
    start = max(data_list2, key=length)
    start_norm = length(start)
    if start_norm == 0:
        return [0.0] * num_rows, [0.0], [0.0] * num_columns
    v_vector = [x / start_norm for x in start]

    for _ in range(max_iterations):
        left = apply(v_vector)
        sigma = length(left)
        if sigma == 0:
            break
        u_vector = [x / sigma for x in left]

        right = apply_transposed(u_vector)
        right_norm = length(right)
        if right_norm == 0:
            break
        candidate = [x / right_norm for x in right]
        change = max(abs(a - b) for a, b in zip(candidate, v_vector))
        v_vector = candidate
        if change < tolerance:
            break

    # Recompute u and sigma from the final v so the triplet is consistent.
    left = apply(v_vector)
    sigma = length(left)
    u_vector = [x / sigma for x in left] if sigma else [0.0] * num_rows
    singular_values = [sigma]

    return u_vector, singular_values, v_vector



# Prints the rows of data_list2 as a formatted table, labelled by data_list
def print_table(data_list, data_list2, attribute_names):
    """Print a fixed-width text table to standard output.

    The header shows ``attribute_names``. Each body line combines the first
    two cells of a ``data_list`` row with every value of the ``data_list2``
    row at the same position (floats rounded to 5 decimals). Rows are
    paired with ``zip``, so printing stops at the shorter of the two lists.

    Column widths are fixed: the longest attribute name for the first
    column, 16 for the second and 8 for all others. The separator lines are
    built from those same widths so they line up with the ``|`` column
    borders. Cells longer than their column width are not truncated.

    Args:
        data_list: Rows whose first two cells label each table line.
        data_list2: Rows of values to print.
        attribute_names: Column headings; nothing is printed when empty.
    """
    if not attribute_names:
        return

    max_attr_len = max(len(name) for name in attribute_names)
    widths = [max_attr_len] + [16 if i == 0 else 8 for i in range(len(attribute_names) - 1)]
    rounded_data = [[round(val, 5) if isinstance(val, float) else val for val in row] for row in data_list2]

    # Print the table header
    header = ["{:<{}}".format(name, width) for name, width in zip(attribute_names, widths)]
    print("|", " | ".join(header), "|")

    # Each cell is padded with one space on both sides, hence width + 2.
    separator = "+" + "+".join("-" * (width + 2) for width in widths) + "+"
    print(separator)

    # Print each row of the table
    for label_row, value_row in zip(data_list, rounded_data):
        cells = [
            "{:<{}}".format(str(label_row[0]), max_attr_len),
            "{:<16}".format(str(label_row[1])),
        ]
        cells.extend("{:<8}".format(str(val)) for val in value_row)

        print("|", " | ".join(cells), "|")

    print(separator)


def main():
    """Run the whole pipeline on the five yearly ARFF files and print results.

    Steps, in order: load the files, standardize the rows, build the
    covariance matrix, run ``get_pca`` and ``calculate_svd`` on that
    matrix, then print the table, the PCA rows and the SVD vectors.
    """
    file_paths = [
        'V4_data/2017.arff',
        'V4_data/2018.arff',
        'V4_data/2019.arff',
        'V4_data/2020.arff',
        'V4_data/2021.arff',
    ]

    raw_rows, attributes = load_arff(file_paths)

    # Fill missing values / standardize, then derive the covariance matrix.
    standardized_rows = remove_missing_values(raw_rows)
    cov = covariance_matrix(standardized_rows)

    # Both decompositions are computed on the covariance matrix.
    projected = get_pca(cov, n_components=3)
    left_vector, singular_values, right_vector = calculate_svd(cov)

    print("Standardization Table:")
    print_table(standardized_rows, cov, attributes)

    print("PCA Result:")
    for projected_row in projected:
        print(projected_row)

    print("\nSVD Result:")
    print("U Vector:", left_vector)
    print("Singular Values:", singular_values)
    print("V Vector:", right_vector)


# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Standardization Table:
| Num     | Country          | X1       | X2       | X3       | X4       | X5       | X6       | X7       | X8       | X9       | X10      | X11      | X12      | X13      | X14      | X15      | X16      | X17      | X18      | X19      | X20      | X21      | X22      | X23      | X24      | X25      | X26      | X27      | X28      | X29      | X30      | X31      | X32      | X33      | X34      | X35      | X36      | X37      | X38      | X39      | X40      | X41      | X42      | X43      | X44      | X45      | X46      | X47      | X48      | X49      | X50      | X51      | X52      | X53      | X54      | X55      | X56      | X57      | X58      | X59      | X60      | X61      | X62      | X63      | X64      | X65      | X66      | X67      | X68      | X69      | X70      | X71      | X72      | X73      | X74      | X75      | X76      | X77      | X78      | X79      | X80      | X81      | X82      | S        |
---------+---------+---------+---

## Performance Differences Between Vanilla Python and scikit-learn for PCA and SVD

### Optimizations and Parallelization:

- Scikit-learn is optimized for performance and implements parallelized versions of many algorithms, making use of multicore processors effectively. This can result in faster computation, especially for large datasets.
- Vanilla Python, written without imports or libraries, does not offer the same level of parallelization for these operations, leading to potential differences in execution time.

### Implementation Details:

- Scikit-learn may use optimized algorithms and low-level implementations that take advantage of the underlying hardware and software, providing faster execution.
- Without libraries, vanilla Python makes measurement harder: scikit-learn and other Python libraries make it possible to produce plots that compare the accuracy of PCA and SVD.

### Algorithmic Differences:

- Scikit-learn may implement different algorithms or parameterizations for PCA and SVD than the hand-written vanilla Python versions; importing a library makes heavily mathematical routines available as ready-made functions.

### Size of the Dataset:

- The performance difference might become more noticeable with larger datasets where the efficiency of the underlying algorithms and optimizations can have a significant impact.
- Scikit-learn and other Python libraries make parsing the dataset easier than it is in vanilla Python.

### Conclusion:

- The provided Python code offers a clear demonstration of Principal Component Analysis (PCA) and Singular Value Decomposition (SVD) without relying on external libraries. It begins by parsing a large dataset and preparing it for machine learning tasks through a data cleaning process that involves removing unreadable values. The calculation of PCA involves representing the covariance matrix in a readable table format to aid accuracy assessment.

- Eigenvalues and eigenvectors are then computed to derive the PCA, involving matrix multiplication, transposition, dot product, and normalization. The code concludes with the calculation of SVD, presenting the U_vector, V_vector, and singular values. While this approach allows for a deeper understanding of the underlying mathematical concepts, it's acknowledged that employing established libraries such as scikit-learn is more efficient in practical applications.

- The use of scikit-learn or similar libraries streamlines complex operations, takes advantage of optimized algorithms, and often leverages parallelization for improved performance. This trade-off between a manually implemented approach and library usage showcases the importance of balancing conceptual understanding with practical efficiency in real-world data science and machine learning workflows.





