# Import from Matrix Market

Download a matrix in Matrix Market format and convert it to our dense format.
Note: the conversion builds the full dense matrix in memory (rows × columns entries), so very large matrices may not fit.

In [15]:
import os
import subprocess

import numpy as np
import sh

In [25]:
DATASET_NAME = "Dataset003_name_mycielskian19_sparsity_0_58"

link_of_matrix = (
    "https://suitesparse-collection-website.herokuapp.com/MM/Mycielski/mycielskian19.tar.gz"
)
# Archive file name, i.e. the last path component of the download link.
matrix_name = link_of_matrix.split("/")[-1]
matrix_market_folder = (
    "/scratch/eschreib/matrix_market_matrices"
)
own_matrices_folder = (
    "/scratch/eschreib/matrices/" + DATASET_NAME
)
# Create the dataset folder; exist_ok keeps the cell safe to re-run.
os.makedirs(own_matrices_folder, exist_ok=True)

# The archive is expected to unpack into <stem>/<stem>.mtx, where <stem> is
# the archive name up to its first dot.
matrix_path = os.path.join(
    matrix_market_folder, matrix_name.split(".")[0], matrix_name.split(".")[0] + ".mtx"
)
# Check if the matrix is already downloaded
if not os.path.exists(matrix_path):
    # tar -C requires the target folder to exist.
    os.makedirs(matrix_market_folder, exist_ok=True)

    # Download the matrix. Arguments are passed as a list (no shell string
    # building) and check=True raises CalledProcessError if wget fails, so a
    # broken download is reported here instead of in a later cell.
    subprocess.run(["wget", "-P", matrix_market_folder, link_of_matrix], check=True)

    print(
        f"Size of the file: {os.path.getsize(os.path.join(matrix_market_folder, matrix_name))} bytes"
    )

    # Unpack the archive into the same folder; a failing tar also raises.
    subprocess.run(
        [
            "tar",
            "-xf",
            os.path.join(matrix_market_folder, matrix_name),
            "-C",
            matrix_market_folder,
        ],
        check=True,
    )
else:
    print(f"Matrix already downloaded, check {matrix_path}")

--2023-12-07 09:58:59--  https://suitesparse-collection-website.herokuapp.com/MM/Mycielski/mycielskian19.tar.gz
Resolving suitesparse-collection-website.herokuapp.com (suitesparse-collection-website.herokuapp.com)... 54.224.34.30, 34.201.81.34, 54.208.186.182, ...
Connecting to suitesparse-collection-website.herokuapp.com (suitesparse-collection-website.herokuapp.com)|54.224.34.30|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://sparse-files.engr.tamu.edu/MM/Mycielski/mycielskian19.tar.gz [following]
--2023-12-07 09:58:59--  http://sparse-files.engr.tamu.edu/MM/Mycielski/mycielskian19.tar.gz
Resolving sparse-files.engr.tamu.edu (sparse-files.engr.tamu.edu)... 52.219.104.11, 52.219.107.56, 52.219.176.192, ...
Connecting to sparse-files.engr.tamu.edu (sparse-files.engr.tamu.edu)|52.219.104.11|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1087184328 (1.0G) [application/x-tar]
Saving to: ‘/scratch/eschreib/matrix_m

Size of the file: 1087184328 bytes


In [3]:
def read_file_to_list_of_lists(file_path):
    """Read a Matrix Market coordinate file into a dense NumPy array.

    Despite the historical name, the result is a 2-D ``numpy.ndarray``, not a
    list of lists.

    The format of the file is as follows:
    1. Lines starting with % are comments. A banner comment of the form
       ``%%MatrixMarket matrix coordinate <field> <symmetry>`` is honoured
       when present; without it the file is treated as ``general``.
    2. The first non-comment line contains the dimensions of the matrix
       (num_rows, num_columns, total_num_entries).
    3. The following lines contain the entries of the matrix in the format
       ``row_index column_index [value]`` with 1-based indices, separated by
       any whitespace. Entries without a value (``pattern`` files) are stored
       as 1.0. Only the first value token is read.

    For ``symmetric``/``hermitian`` files each off-diagonal entry is mirrored
    across the diagonal, and for ``skew-symmetric`` files it is mirrored with
    the sign flipped, because such files store only one triangle.

    The file is streamed line by line, but the returned matrix is dense and
    needs num_rows * num_columns float64 values of memory.

    Args:
        file_path: Path of the Matrix Market file to read.

    Returns:
        tuple: (matrix, num_rows, num_columns, total_num_entries), where
        total_num_entries is the count stated in the size line of the file.

    Raises:
        ValueError: If the banner declares a non-coordinate format, the size
            line is malformed, or the file contains no size line at all.
    """
    symmetry = "general"
    matrix = None
    num_rows = num_columns = total_num_entries = 0

    with open(file_path, "r") as file:
        for raw_line in file:
            line = raw_line.strip()
            # Skip empty lines
            if not line:
                continue

            if line.startswith("%"):
                banner = line.lower().split()
                if banner[0] == "%%matrixmarket" and len(banner) >= 5:
                    if banner[2] != "coordinate":
                        raise ValueError(
                            f"Unsupported Matrix Market format '{banner[2]}', "
                            "only 'coordinate' files can be read"
                        )
                    symmetry = banner[4]
                continue

            # split() without arguments handles tabs and repeated spaces
            tokens = line.split()

            if matrix is None:
                # The first data line holds the matrix dimensions
                num_rows, num_columns, total_num_entries = (int(t) for t in tokens)
                matrix = np.zeros((num_rows, num_columns))
                continue

            # Indices in the file are 1-based
            row, column = int(tokens[0]) - 1, int(tokens[1]) - 1
            # Pattern files carry no value column; store 1.0 for those entries
            value = float(tokens[2]) if len(tokens) > 2 else 1.0
            matrix[row, column] = value

            if row != column:
                if symmetry in ("symmetric", "hermitian"):
                    matrix[column, row] = value
                elif symmetry == "skew-symmetric":
                    matrix[column, row] = -value

    if matrix is None:
        raise ValueError(f"No size line found in Matrix Market file {file_path}")

    return matrix, num_rows, num_columns, total_num_entries

In [4]:
def write_matrix_to_dense_format(matrix, file_path):
    """Save a matrix as text in the dense format.

    First line: num_rows num_columns data_type (f for float)
    Following lines: one line per matrix row, every entry followed by a
    single space (so each row line ends with a trailing space).
    """
    with open(file_path, "w") as out:
        out.write(f"{matrix.shape[0]} {matrix.shape[1]} f\n")
        # Build each row line in one go instead of writing entry by entry.
        out.writelines(
            "".join(f"{entry} " for entry in row) + "\n" for row in matrix
        )

In [5]:
# Parse the downloaded Matrix Market file and report the header values next to
# the shape of the dense array that was actually built.
(matrix, num_rows, num_columns, total_num_entries) = read_file_to_list_of_lists(matrix_path)
print(
    f"Number of rows: {num_rows}, number of columns: {num_columns}, total number of entries: {total_num_entries}",
    f"Matrix shape: {matrix.shape}",
    sep="\n",
)

Number of rows: 1138, number of columns: 1138, total number of entries: 2596
Matrix shape: (1138, 1138)


In [10]:
# Store the dense matrix in the dataset folder, named after the archive name
# up to its first dot (no file extension is appended).
save_path = os.path.join(own_matrices_folder, matrix_name.partition(".")[0])
write_matrix_to_dense_format(matrix=matrix, file_path=save_path)

# Generate Matrices to compute with the Sampled Matrix

In [21]:
def check_validity(Matrix_A_Shape, Matrix_B_Shape, Sampled_Shape):
    """Check that A * B is defined and has the shape of the sampled matrix.

    Args:
        Matrix_A_Shape: (num_rows, num_columns) of matrix A.
        Matrix_B_Shape: (num_rows, num_columns) of matrix B.
        Sampled_Shape: (num_rows, num_columns) of the sampled matrix.

    Returns:
        True when all shapes are compatible.

    Raises:
        AssertionError: If the inner dimensions of A and B differ, or the
            product shape does not match Sampled_Shape. The error is raised
            explicitly (not via ``assert``) so the check still runs under
            ``python -O``, and the message names the offending shapes.
    """
    if Matrix_A_Shape[1] != Matrix_B_Shape[0]:
        raise AssertionError(
            f"Cannot multiply A * B: A has {Matrix_A_Shape[1]} columns "
            f"but B has {Matrix_B_Shape[0]} rows"
        )
    if Sampled_Shape[0] != Matrix_A_Shape[0]:
        raise AssertionError(
            f"Sampled matrix has {Sampled_Shape[0]} rows "
            f"but A has {Matrix_A_Shape[0]} rows"
        )
    if Sampled_Shape[1] != Matrix_B_Shape[1]:
        raise AssertionError(
            f"Sampled matrix has {Sampled_Shape[1]} columns "
            f"but B has {Matrix_B_Shape[1]} columns"
        )
    return True

In [19]:
# Make dirs A and B inside the dataset folder. exist_ok=True makes this safe
# to re-run and avoids the separate exists()/makedirs() round trip.
for operand_folder in ("A", "B"):
    os.makedirs(os.path.join(own_matrices_folder, operand_folder), exist_ok=True)

# The generator script is looked up in the current working directory of the
# kernel (os.getcwd()), which is not necessarily the folder of this notebook.
this_dir_path = os.getcwd()
generate_matrix_script_path = os.path.join(this_dir_path, "generateMatrix.sh")

In [31]:
# Shapes of the two factor matrices, each given as (num_rows, num_columns).
Matrix_A_Shape, Matrix_B_Shape = (393215, 10000), (10000, 393215)

# A * B must be defined and match the shape of the sampled matrix loaded above.
assert check_validity(Matrix_A_Shape, Matrix_B_Shape, matrix.shape)

# Size estimate: number of entries times 4 bytes each, converted to GB.
print(
    f"Rough size estimate of matrix A: {Matrix_A_Shape[0] * Matrix_A_Shape[1] * 4 / 1024 / 1024 / 1024} GB",
    f"Rough size estimate of matrix B: {Matrix_B_Shape[0] * Matrix_B_Shape[1] * 4 / 1024 / 1024 / 1024} GB",
    sep="\n",
)

Rough size estimate of matrix A: 575.9970703162253 GB
Rough size estimate of matrix B: 14.648400247097015 GB


In [24]:
# Run generateMatrix.sh through bash once per factor matrix. Each call passes
# three arguments: the output folder, the literal "dense", and the shape as
# "<rows>x<columns>".
# NOTE(review): the meaning of these arguments is defined by generateMatrix.sh,
# which is not part of this notebook - confirm against the script.
# Generate Matrix A
sh.bash(generate_matrix_script_path, os.path.join(own_matrices_folder, "A"), "dense", f"{Matrix_A_Shape[0]}x{Matrix_A_Shape[1]}")
# Generate Matrix B
# This call is the last expression of the cell, so its result is what the
# notebook displays as the cell output.
sh.bash(generate_matrix_script_path, os.path.join(own_matrices_folder, "B"), "dense", f"{Matrix_B_Shape[0]}x{Matrix_B_Shape[1]}")

'Generated matrix /scratch/eschreib/matrices/Dataset002_testing_S_name_1138_bus/B/n_100_m_1138_sparsity_1\n'