In [1]:
import findspark
findspark.init('/opt/spark')
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Data-Importer") \
    .enableHiveSupport() \
    .getOrCreate()

spark.sql("set hive.exec.dynamici.partition=true;")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict;")

# 源数据 => ODS
print("ODS init spark content successful")

ODS init spark content successful


In [2]:
# create database time_series
spark.sql("DROP TABLE IF EXISTS ods_time_series;")
sql = """
CREATE TABLE IF NOT EXISTS ods_time_series
(id bigint,
time TIMESTAMP,
lon float,
lat float)
PARTITIONED BY(yearMonth string)
STORED AS PARQUET;
"""
spark.sql(sql)

# create database job
spark.sql("DROP TABLE IF EXISTS ods_job;")
sql = """
CREATE TABLE IF NOT EXISTS ods_job
(id bigint,start_time TIMESTAMP,end_time TIMESTAMP)
PARTITIONED BY(yearMonth string)
STORED AS PARQUET;
"""
spark.sql(sql)

# 创建输出表
#CREATE TABLE output(id SERIAL PRIMARY KEY,job_id int4);

print("Init ods ods_time_series ,ods_job  Success")

Init ods ods_time_series ,ods_job  Success


In [3]:
# 读入临时表 ods_time_series_external

spark.sql("DROP TABLE IF EXISTS ods_time_series_external;")
sql = """
CREATE TABLE IF NOT EXISTS ods_time_series_external(
 id string,
 time string,
 lon string,
 lat string)
 ROW FORMAT DELIMITED
 FIELDS TERMINATED BY ',';
"""
spark.sql(sql)
    
sql = f"load data local inpath './data/timeseries.csv' into table ods_time_series_external ;"
print(sql)
spark.sql(sql)

# 临时表写入ODS表


    

load data local inpath './data/timeseries.csv' into table ods_time_series_external ;


DataFrame[]

In [4]:
# 临时表写入ODS表

# see https://stackoverflow.com/questions/62607279/how-to-load-a-csv-file-containing-time-string-value-to-timestamp-in-hive

sql = '''
INSERT INTO ods_time_series partition(yearMonth)
SELECT
 cast(id as bigint) as id,
 cast(cast(time  as int) as timestamp) as time,
 cast(lon as float) as lon,
 cast(lat as float) as lat,
 date_format(cast(cast(time  as int) as timestamp),'yyyyMM') as yearMonth
 FROM ods_time_series_external;
'''
print(sql)
result = spark.sql(sql)
result.show()

# 查看一条数据
result = spark.sql("SELECT * FROM ods_time_series LIMIT 1")
result.show()


INSERT INTO ods_time_series partition(yearMonth)
SELECT
 cast(id as bigint) as id,
 cast(cast(time  as int) as timestamp) as time,
 cast(lon as float) as lon,
 cast(lat as float) as lat,
 date_format(cast(cast(time  as int) as timestamp),'yyyyMM') as yearMonth
 FROM ods_time_series_external;

++
||
++
++

+--------+-------------------+--------+--------+---------+
|      id|               time|     lon|     lat|yearMonth|
+--------+-------------------+--------+--------+---------+
|14711039|2026-05-01 00:00:00|45.23426|46.23426|   202605|
+--------+-------------------+--------+--------+---------+



In [5]:
# 查看数据规模
result = spark.sql("SELECT count(*) FROM ods_time_series ")
result.show()

+--------+
|count(1)|
+--------+
|31535907|
+--------+



In [6]:
# 读入临时表 ods_time_series_external

spark.sql("DROP TABLE IF EXISTS ods_job_external;")
sql = """
CREATE TABLE IF NOT EXISTS ods_job_external(
 id string,
 start_time string,
 end_time string)
 ROW FORMAT DELIMITED
 FIELDS TERMINATED BY ',';
"""
spark.sql(sql)
    
sql = f"load data local inpath './data/jobs.csv' into table ods_job_external ;"
print(sql)
spark.sql(sql)


# 查看一条数据
result = spark.sql("SELECT * FROM ods_job_external LIMIT 1")
result.show()

load data local inpath './data/jobs.csv' into table ods_job_external ;
+---+----------+----------+
| id|start_time|  end_time|
+---+----------+----------+
|  0|1704038400|1704042000|
+---+----------+----------+



In [7]:
# 临时表写入ODS表 ods_job

# see https://stackoverflow.com/questions/62607279/how-to-load-a-csv-file-containing-time-string-value-to-timestamp-in-hive

sql = '''
INSERT INTO ods_job partition(yearMonth)
SELECT
 cast(id as bigint) as id,
 cast(cast(start_time as int) as timestamp) as start_time,
 cast(cast(end_time as int) as timestamp) as end_time,
 date_format(cast(cast(start_time  as int) as timestamp),'yyyyMM') as yearMonth
 FROM ods_job_external;
'''
print(sql)
result = spark.sql(sql)

# 查看一条数据
result = spark.sql("SELECT * FROM ods_job LIMIT 1")
result.show()


INSERT INTO ods_job partition(yearMonth)
SELECT
 cast(id as bigint) as id,
 cast(cast(start_time as int) as timestamp) as start_time,
 cast(cast(end_time as int) as timestamp) as end_time,
 date_format(cast(cast(start_time  as int) as timestamp),'yyyyMM') as yearMonth
 FROM ods_job_external;

+-----+-------------------+-------------------+---------+
|   id|         start_time|           end_time|yearMonth|
+-----+-------------------+-------------------+---------+
|30656|2027-07-01 00:00:00|2027-07-01 01:00:00|   202707|
+-----+-------------------+-------------------+---------+



In [8]:
# 查看分区数目
result = spark.sql("select date_format(time,'yyyyMM') from ods_time_series ;")
result.show()

+-------------------------+
|date_format(time, yyyyMM)|
+-------------------------+
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
|                   202503|
+-------------------------+
only showing top 20 rows

