In [1]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [3]:
# spark = SparkSession.builder.master('local').appName('myAppName').getOrCreate()

In [4]:
sc = SparkContext()
sqlContext = SQLContext(sc)

In [5]:
df = sqlContext.read.csv(SparkFiles.get('/home/cloudray/Downloads/TPCH_12M_8Field.csv'), header=False, inferSchema=True)

In [6]:
subset_rows = df.filter(df._c2 < 2).collect()

In [9]:
len(subset_rows)

571

In [6]:
subset_rdd = sc.parallelize(subset_rows)

In [7]:
subset_df = subset_rdd.toDF()

In [8]:
df.count()
subset_df.count()

571

In [None]:
# write to parquet (HDFS)

In [12]:
df.write.parquet('hdfs://localhost:9000/user/cloudray/NORA/partition_0.parquet')

In [11]:
subset_df.write.parquet('hdfs://localhost:9000/user/cloudray/NORA/partition_1.parquet')

In [None]:
# read from parquet (HDFS)

In [15]:
df2 = sqlContext.read.parquet('hdfs://localhost:9000/user/cloudray/NORA/partition_1.parquet')

In [16]:
print(df2)

DataFrame[_c0: double, _c1: double, _c2: double, _c3: double, _c4: double, _c5: double, _c6: double, _c7: double]


In [18]:
df2.head()

Row(_c0=21378.0, _c1=400000.0, _c2=1.0, _c3=3.0, _c4=47.0, _c5=51699.53, _c6=0.01, _c7=0.06)

In [20]:
df2.count()

571

In [24]:
# df2.filter(df2._c2 < 2).collect()

In [None]:
# testing locading multiple parquet files at once

In [25]:
def get_parquet_file_paths(partition_ids):
    
    hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'
    result_paths = []
    
    for pid in partition_ids:
        partition_name = 'partition_' + str(pid)+'.parquet'
        path = hdfs_path + partition_name
        result_paths.append(path)
        
    return result_paths

In [26]:
paths = get_parquet_file_paths([0,1])

In [27]:
print(paths)

['hdfs://localhost:9000/user/cloudray/NORA/partition_0.parquet', 'hdfs://localhost:9000/user/cloudray/NORA/partition_1.parquet']


In [28]:
dfs = sqlContext.read.parquet(*paths)

In [29]:
print(dfs)

DataFrame[_c0: double, _c1: double, _c2: double, _c3: double, _c4: double, _c5: double, _c6: double, _c7: double]


In [30]:
dfs.count()

11998567

In [39]:
def try_pass_sqlcontext(context, df):
    df = context.read.parquet('hdfs://localhost:9000/user/cloudray/NORA/partition_1.parquet')
    return df

In [42]:
try_pass_sqlcontext(sqlContext, dfs)

In [43]:
dfs.count()

11998567

In [44]:
# Test Multi Thread Parquet File Writing
import threading
class myThread(threading.Thread):
    def __init__(self, thread_id, name, df, lock_dict):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.name = name
        self.df = df
        self.lock_dict = lock_dict
        
    def run(self):
        print('start thread: ',self.thread_id, self.name)
        pid = 0
        self.lock_dict[pid].acquire()
        self.df.write.mode('append').parquet('hdfs://localhost:9000/user/cloudray/NORA/partition_TEST.parquet')
        self.lock_dict[pid].release()

max_threads = 8

In [45]:
import numpy as np
import pandas as pd

pdf1 = pd.DataFrame(np.array([[1,2],[3,4]]))
df1 = sqlContext.createDataFrame(pdf1)

pdf2 = pd.DataFrame(np.array([[11,12],[13,14]]))
df2 = sqlContext.createDataFrame(pdf2)

pdf3 = pd.DataFrame(np.array([[21,22],[23,24]]))
df3 = sqlContext.createDataFrame(pdf3)

pdf4 = pd.DataFrame(np.array([[31,32],[33,34]]))
df4 = sqlContext.createDataFrame(pdf4)

In [46]:
# try to use lock dict
lock_dict = {0:threading.Lock()}

In [48]:
thread1 = myThread(1, 'thread_'+str(1), df1, lock_dict)
thread2 = myThread(2, 'thread_'+str(2), df2, lock_dict)
thread3 = myThread(3, 'thread_'+str(3), df3, lock_dict)
thread4 = myThread(4, 'thread_'+str(4), df4, lock_dict)

thread1.start()
thread2.start()
thread3.start()
thread4.start()

start thread:  1 thread_1
start thread:  2 thread_2
start thread:  3 thread_3
start thread:  4 thread_4


In [49]:
# check
loaded_df = sqlContext.read.parquet('hdfs://localhost:9000/user/cloudray/NORA/partition_TEST.parquet')

In [50]:
print(loaded_df.head(8))

[Row(0=11, 1=12), Row(0=1, 1=2), Row(0=31, 1=32), Row(0=21, 1=22), Row(0=13, 1=14), Row(0=3, 1=4), Row(0=33, 1=34), Row(0=23, 1=24)]


In [35]:
print(df1.head(2))
print(df2.head(2))
print(df3.head(2))
print(df4.head(2))

[Row(0=1, 1=2), Row(0=3, 1=4)]
[Row(0=11, 1=12), Row(0=13, 1=14)]
[Row(0=21, 1=22), Row(0=23, 1=24)]
[Row(0=31, 1=32), Row(0=33, 1=34)]


In [34]:
print(lock_dict)

{0: <unlocked _thread.lock object at 0x7f7b892bed80>}
