# 01 hdfs vs MySQL Write-Performance-Test

In [1]:
import pandas as pd

from hdfs import InsecureClient
from datetime import datetime

In [9]:
hdfs_host = "192.168.64.102"
hdfs_port = 9870
hdfs_user = "enricogoerlitz"
hdfs_client = InsecureClient(f"http://{hdfs_host}:{hdfs_port}", user=hdfs_user)

iterator_db_employee_pay = pd.read_csv("./database/bigdatafiles/datev.dbo.employee_pay.csv", chunksize=10_000, sep="|")

start_time = datetime.now()

chunk: pd.DataFrame
for i, chunk in enumerate(iterator_db_employee_pay):
    parquet_content = chunk.to_parquet(index=False)

    hdfs_path = f"/hive/test/datev/employee_pay/file{i}.parquet"
    with hdfs_client.write(hdfs_path, overwrite=True) as hdfs_file:
        hdfs_file.write(parquet_content)

    print(f"PARQUET file was successfully stored in HDFS at {hdfs_path}.")

end_time = datetime.now()
duration_time = (end_time - start_time).total_seconds()
print(f"TOTAL TIME: {duration_time}s")  # 15_142_128 Records => 809.731399s | 13m 29.7s


PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file0.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file1.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file2.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file3.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file4.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file5.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file6.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file7.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file8.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/employee_pay/file9.parquet.
PARQUET file was successfully stored in HDFS at /hive/test/datev/emplo

In [4]:
from sqlalchemy import create_engine
import pandas as pd

from hdfs import InsecureClient
from datetime import datetime

MYSQL_USERNAME = "root"
MYSQL_PASSWORD = "root"
MYSQL_HOST = "localhost:3306"
MYSQL_DB = "mysqldb"

MYSQL_TABLENAME = "bigdatatable2"

iterator_db_employee_pay = pd.read_csv("./database/bigdatafiles/datev.dbo.employee_pay.csv", chunksize=10_000, sep="|")

engine = create_engine(f"mysql://{MYSQL_USERNAME}:{MYSQL_PASSWORD}@{MYSQL_HOST}/{MYSQL_DB}")

start_time = datetime.now()

chunk: pd.DataFrame
for i, chunk in enumerate(iterator_db_employee_pay):
    chunk.to_sql(
        name="bigdatatable2",
        con=engine,
        if_exists="append",
        index=False
    )
    print(f"CHUNK was successfully stored in MYSQL at {MYSQL_TABLENAME} Loop: {i}.")

end_time = datetime.now()
duration_time = (end_time - start_time).total_seconds()
print(f"TOTAL TIME: {duration_time}s")  # 15_142_128 Records => 

CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 0.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 1.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 2.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 3.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 4.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 5.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 6.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 7.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 8.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 9.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 10.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 11.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 12.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 13.
CHUNK was successfully stored in MYSQL at bigdatatable2 Loop: 14.
CHUNK was successful

In [7]:
import pyarrow.fs as fs
# hdfs = fs.HadoopFileSystem("hdfs://192.168.64.102:9870")

# pd.read_parquet("hdfs://192.168.64.102:9870/hive/test/datev/employee_pay/file1.parquet", filesystem=hdfs)
pd.read_parquet("hdfs://192.168.64.102:9870/hive/test/datev/employee_pay/file1.parquet", engine="pyarrow")


OSError: Prior attempt to load libhdfs failed