In [26]:
import os
import yaml

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, IntegerType, TimestampType

import random
from datetime import datetime, timedelta

In [21]:
# Open the YAML file and load its contents into a dictionary
with open('../../../references/config_notebook.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Access the variables in the dictionary
my_vars = config

In [22]:
# Data location
source_folder_path = my_vars['TEST']

In [3]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("create_dataframe_test") \
    .getOrCreate()

24/05/01 20:03:49 WARN Utils: Your hostname, skynet resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp2s0)
24/05/01 20:03:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/01 20:03:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/01 20:03:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [49]:
# Define schema for the DataFrame
schema = StructType([
    StructField("id", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("qty", DoubleType(), True),
    StructField("quoteQty", DoubleType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("makerBuy", BooleanType(), True),
    StructField("bestPrice", BooleanType(), True),
    StructField("zipname", IntegerType(), True),
        ])

# Define data
data = [ 
    ('3061645923',27261.06,0.00156,  42.5272536,None,    True, True,20230329),
    ('3061645924',27261.06, 8.2E-4,  22.3540692,None,    True, True,20230329),
    ('3061645925',27261.07, 0.0034,   92.687638,None,   False, True,20230329),
    ('3061645926',27261.06,0.00436, 118.8582216,None,    True, True,20230329),
    ('3061645927',27261.06,0.00151,  41.1642006,None,    True, True,20230329),
    ('3061645928',27261.07,0.00467, 127.3091969,None,   False, True,20230329),
    ('3061645929',27261.07,0.00197,  53.7043079,None,   False, True,20230329),
    ('3061645930',27261.06,0.00277,  75.5131362,None,    True, True,20230329),
    ('3061645931',27261.07,0.00207,  56.4304149,None,   False, True,20230329),
    ('3061645932',27261.06,0.00952, 259.5252912,None,    True, True,20230329),
    ('3061645933',27261.06,0.00222,  60.5195532,None,    True, True,20230329),
    ('3061645934',27261.07,0.00299,  81.5105993,None,   False, True,20230330),
    ('3061645935',27261.07,0.15165,4134.1412655,None,   False, True,20230330),
    ('3061645936',27261.07,0.11999,3271.0557893,None,   False, True,20230330),
    ('3061645937',27261.07,  0.007,   190.82749,None,   False, True,20230330),
    ('3061645938',27261.07,0.05673,1546.5205011,None,   False, True,20230330),
    ('3061645939',27261.07,0.00627, 170.9269089,None,   False, True,20230330),
    ('3061645940',27261.07,0.07153,1949.9843371,None,   False, True,20230330),
    ('3061645941',27261.07, 3.7E-4,  10.0865959,None,   False, True,20230330),
    ('3061645942',27261.07,0.00239,  65.1539573,None,   False, True,20230330)
    ]

In [53]:
# Sort the data by zipname
data.sort(key=lambda x: x[-1])

# Define start and end times
start_time = datetime.strptime('00:00:00', '%H:%M:%S')
end_time = datetime.strptime('23:59:59', '%H:%M:%S')

# Generate random times within the range, adjusting hours, minutes, and seconds based on zipname
modified_data = []
previous_date = None
for row in data:
    zipname = row[-1]
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    random_time = datetime(year=int(str(zipname)[:4]), month=int(str(zipname)[4:6]), day=int(str(zipname)[6:8]),
                            hour=random_hour, minute=random_minute, second=random_second)
    # Ensure the time is greater than the previous date
    if previous_date is not None and random_time < previous_date:
        random_time = previous_date + timedelta(seconds=random.randint(1, 60))
    row_with_time = row[:4] + (random_time,) + row[5:]
    modified_data.append(row_with_time)
    previous_date = random_time


In [54]:
# Create DataFrame
df = spark.createDataFrame(modified_data, schema)

In [23]:
# Define output path for the Parquet file
output_path = '/home/giujorge/Downloads/test/BTCUSDT_dollars_test.parquet'

In [56]:

# Write DataFrame to Parquet file
df.repartition(1).write \
            .partitionBy("zipname") \
            .mode("append") \
            .option("compression", "gzip") \
            .option("blockSize", "256m") \
            .parquet(output_path)


In [55]:
# Show DataFrame
df.show()

+----------+--------+-------+------------+-------------------+--------+---------+--------+
|        id|   price|    qty|    quoteQty|          timestamp|makerBuy|bestPrice| zipname|
+----------+--------+-------+------------+-------------------+--------+---------+--------+
|3061645923|27261.06|0.00156|  42.5272536|2023-03-29 09:00:41|    true|     true|20230329|
|3061645924|27261.06| 8.2E-4|  22.3540692|2023-03-29 20:01:06|    true|     true|20230329|
|3061645925|27261.07| 0.0034|   92.687638|2023-03-29 20:01:17|   false|     true|20230329|
|3061645926|27261.06|0.00436| 118.8582216|2023-03-29 20:02:12|    true|     true|20230329|
|3061645927|27261.06|0.00151|  41.1642006|2023-03-29 20:03:01|    true|     true|20230329|
|3061645928|27261.07|0.00467| 127.3091969|2023-03-29 20:03:26|   false|     true|20230329|
|3061645929|27261.07|0.00197|  53.7043079|2023-03-29 20:03:36|   false|     true|20230329|
|3061645930|27261.06|0.00277|  75.5131362|2023-03-29 20:03:42|    true|     true|20230329|

In [None]:
# Stop Spark session
spark.stop()