# DataFrame Builder:

In [29]:
import os
import sys
import numpy as np
import pandas as pd

pd.options.display.float_format = '{:8,.2f}'.format

In [2]:
# mylib:
my_library = os.path.expanduser('~/.myconfigs')
my_spark = os.path.expanduser('~/spark2_dfanalysis')
sys.path.append(my_library)
sys.path.append(my_spark)

In [3]:
import pyspark as spark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# spark = SparkSession.builder.appName('myappname').getOrCreate()
# print(spark)

In [4]:
# print(dir(pyspark))
# print(dir(pyspark.sql))
# print(dir(pyspark.rdd))

In [5]:
from shared.app_context import *

ctx = ApplicationContext("Dev-Job")
print(ctx.spark)

<pyspark.sql.session.SparkSession object at 0x00000258EC00FEF0>


In [6]:
# print(sys.path)
from builder.DataFrameBuild import *

## DataFrameBuild Class:
### x = DataFrameBuild(ctx.spark)

In [7]:
x = DataFrameBuild(ctx.spark)

### build_array function

In [8]:
mystr = x.build_array("string",num=12,width=8) 
myint = x.build_array("integer",num=12,nrange=(0,4))    # inclusive on range
mydoub = x.build_array("double",num=12,nrange=(10,10.1))

print(len(mystr),mystr)
print(len(myint),myint)
print(len(mydoub),mydoub)

12 ['opeizyzr', 'attpzusd', 'quvokdqv', 'wxmouazi', 'dmkeherw', 'ehlkthcc', 'wnvrknzf', 'fldkrdji', 'nmplkbti', 'bnitphox', 'cxyaeaow', 'phhovscg']
12 [4 4 4 1 3 0 4 1 4 4 1 0]
12 [10.05958403 10.04038789 10.02415255 10.03016893 10.09653555 10.06439566
 10.05368564 10.09777727 10.03880275 10.03040578 10.01831231 10.09674956]


### Ex1. Combine arrays, lists into DataFrame.

In [10]:
x = DataFrameBuild(ctx.spark)
num = 500
df4 = x.arrays_to_dataframe([x.build_array("string",num=num,width=8),
                        x.build_array("integer",num=num,nrange=(1,4)),
                        x.build_array("integer",num=num,nrange=(1,12)),
                        x.build_array("double",num=num,nrange=(0.0,10000))],
                      ['passwords','quarter','month','price'])

In [11]:
df4.limit(10).toPandas()

Unnamed: 0,passwords,quarter,month,price
0,xdjmhvdv,3,7,512.07
1,hdttuclf,1,2,9135.76
2,fqxfoius,3,2,2469.62
3,srsvkoaj,4,11,2648.54
4,jtqpjbuh,3,11,3399.43
5,sfgvcmph,2,4,8059.82
6,ktyhaltn,1,4,6793.42
7,oqvsrbjf,4,2,6056.37
8,yomocvlg,1,12,8122.58
9,mmeiyrcn,1,2,7984.3


### Ex2. Combine arrays, lists into DataFrame.

In [12]:
x = DataFrameBuild(ctx.spark)
num = 500
df4 = x.arrays_to_dataframe([[int(x) for x in np.linspace(1,num,num)],
                             x.build_array("string",num=num,width=8),
                             x.build_array("integer",num=num,nrange=(1,4)),
                             x.build_array("integer",num=num,nrange=(1,12)),
                             x.build_array("double",num=num,nrange=(0.0,10000))],
                             ['index','passwords','quarter','month','price'])

In [13]:
df4.show()

+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| cyztyrko|      4|    5|314.34591154064105|
|    2| dnjjnmog|      3|   12|2539.7816194662882|
|    3| zoxkrvlh|      2|   12| 9801.939184016917|
|    4| cvijsyye|      4|    9| 3677.520232671988|
|    5| okdhgred|      4|   12|276.09900447780444|
|    6| hmlpmbee|      2|   10| 9536.927951560743|
|    7| tyegbebb|      3|    9| 9286.739083559703|
|    8| mgifqqyk|      2|    6|1187.9236726452248|
|    9| ifapfcib|      1|    1| 7746.014392663392|
|   10| vgsirpgx|      3|    4|2773.1656526600714|
|   11| xwgwkcds|      3|   11|1219.0549284092378|
|   12| vipkcbag|      4|   10| 9677.401001532802|
|   13| cssmcddg|      1|   12|3563.4784574583246|
|   14| cujfjyra|      2|    1| 9453.952312881494|
|   15| rjgzczvx|      1|    3| 2465.080085638733|
|   16| qwxtfcwm|      1|   11| 4954.020688559832|
|   17| cdfsadig|      3|    7|

In [14]:
df4.orderBy("index").show()

+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| cyztyrko|      4|    5|314.34591154064105|
|    2| dnjjnmog|      3|   12|2539.7816194662882|
|    3| zoxkrvlh|      2|   12| 9801.939184016917|
|    4| cvijsyye|      4|    9| 3677.520232671988|
|    5| okdhgred|      4|   12|276.09900447780444|
|    6| hmlpmbee|      2|   10| 9536.927951560743|
|    7| tyegbebb|      3|    9| 9286.739083559703|
|    8| mgifqqyk|      2|    6|1187.9236726452248|
|    9| ifapfcib|      1|    1| 7746.014392663392|
|   10| vgsirpgx|      3|    4|2773.1656526600714|
|   11| xwgwkcds|      3|   11|1219.0549284092378|
|   12| vipkcbag|      4|   10| 9677.401001532802|
|   13| cssmcddg|      1|   12|3563.4784574583246|
|   14| cujfjyra|      2|    1| 9453.952312881494|
|   15| rjgzczvx|      1|    3| 2465.080085638733|
|   16| qwxtfcwm|      1|   11| 4954.020688559832|
|   17| cdfsadig|      3|    7|

In [84]:
df4.select("index").distinct().count()

500

### Ex3. Combine arrays, lists into DataFrame.

In [30]:
x = DataFrameBuild(ctx.spark)
num = 5000
lst_cars = [random.choice(['Honda','Toyota','Chevy','Ford','Tesla','Volkswagon','Hyundai','Jeep']) for x in range(num)]
df4 = x.arrays_to_dataframe([[int(x) for x in np.linspace(1,num,num)],
                             x.build_array("string",num=num,width=8),
                             x.build_array("integer",num=num,nrange=(1,4)),
                             x.build_array("integer",num=num,nrange=(1,12)),
                             x.build_array("double",num=num,nrange=(0.0,50000)),
                             lst_cars],
                             ['index','passwords','quarter','month','price','cars'])

In [31]:
df4.show(20)

+-----+---------+-------+-----+------------------+----------+
|index|passwords|quarter|month|             price|      cars|
+-----+---------+-------+-----+------------------+----------+
|    1| upaiogwd|      2|    1|31834.913179518964|Volkswagon|
|    2| aofdlaoj|      3|    4| 46222.26741716476|     Chevy|
|    3| rqbugoax|      4|    9| 5729.822129404549|      Ford|
|    4| vctpzwfm|      2|    9| 540.3028094369378|     Chevy|
|    5| pxumxlyj|      2|    1|23723.352432642987|    Toyota|
|    6| mrdrttae|      2|    4|15590.222872908716|     Honda|
|    7| ykyretny|      2|    5| 49120.26087935749|      Ford|
|    8| fetqaacg|      1|    9| 3148.676126666838|Volkswagon|
|    9| buqokjjj|      1|    2| 7208.513561178865|     Chevy|
|   10| awgyxjrm|      2|   11|49425.871056957985|     Tesla|
|   11| esvezafb|      1|    4| 4487.474654253665|    Toyota|
|   12| gglnloij|      4|    2| 41794.59162045686|     Chevy|
|   13| idtpxsgc|      3|   12| 8166.171991164645|Volkswagon|
|   14| 

In [32]:
df4.count() 

5000

### Create DataFrames

#### with a list of list of strings, and list of string of length 1

In [77]:
dfx = x.arrays_to_dataframe([mystr],['random_letters'])
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 1
['random_letters']
12
+--------------+
|random_letters|
+--------------+
|      qvfdlpks|
|      vpyviyux|
|      ypqyinsl|
|      rigvlcog|
|      qngoiobg|
|      zurnglnj|
|      hsuemdkh|
|      vlfalaxs|
|      hsmjolqj|
|      uhvkberg|
|      ladwcapa|
|      qrbkjsql|
+--------------+



#### with a list of list of strings, and a string

In [87]:
dfx = x.arrays_to_dataframe([mystr],'random_letters')
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
list list
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']]
check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 1
['random_letters']
12
+--------------+
|random_letters|
+--------------+
|      qvfdlpks|
|      vpyviyux|
|      ypqyinsl|
|      rigvlcog|
|      qngoiobg|
|      zurnglnj|
|      hsuemdkh|
|      vlfalaxs|
|      hsmjolqj|
|      uhvkberg|
|      ladwcapa|
|      qrbkjsql|
+--------------+



#### with lists of length 2

In [89]:
print(myint)
print(mystr)
dfx = x.arrays_to_dataframe([myint,mystr],['random_ints','random_letters'])
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

[3 1 3 1 0 3 4 1 2 1 0 3]
['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']
Levels:
list ndarray
[array([3, 1, 3, 1, 0, 3, 4, 1, 2, 1, 0, 3]), ['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']]
check:
[array([3, 1, 3, 1, 0, 3, 4, 1, 2, 1, 0, 3]), ['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 2
['random_ints', 'random_letters']
12
+-----------+--------------+
|random_ints|random_letters|
+-----------+--------------+
|          3|      qvfdlpks|
|          1|      vpyviyux|
|          3|      ypqyinsl|
|          1|      rigvlcog|
|          0|      qngoiobg|
|          3|      zurnglnj|
|          4|      hsuemdkh|
|          1|      vlfalaxs|
|          2|      hsmjolqj|
|          1|      uhvkberg|
|

#### with a list and a string

In [90]:
dfx = x.arrays_to_dataframe(mystr,'crazy_strings')
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
list str
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']]
check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 1
['crazy_strings']
12
+-------------+
|crazy_strings|
+-------------+
|     qvfdlpks|
|     vpyviyux|
|     ypqyinsl|
|     rigvlcog|
|     qngoiobg|
|     zurnglnj|
|     hsuemdkh|
|     vlfalaxs|
|     hsmjolqj|
|     uhvkberg|
|     ladwcapa|
|     qrbkjsql|
+-------------+



#### with an array and a string

In [91]:
dfx = x.arrays_to_dataframe(np.linspace(0,30,31),'integers')
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
ndarray None
[array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30.])]
check:
[array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30.])] 1
['integers']
31
+--------+
|integers|
+--------+
|     0.0|
|     1.0|
|     2.0|
|     3.0|
|     4.0|
|     5.0|
|     6.0|
|     7.0|
|     8.0|
|     9.0|
|    10.0|
|    11.0|
|    12.0|
|    13.0|
|    14.0|
|    15.0|
|    16.0|
|    17.0|
|    18.0|
|    19.0|
+--------+
only showing top 20 rows



#### with 1 array, no names

In [92]:
dfx = x.arrays_to_dataframe(mystr)
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
list str
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']]
check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 1
['str']
12
+--------+
|     str|
+--------+
|qvfdlpks|
|vpyviyux|
|ypqyinsl|
|rigvlcog|
|qngoiobg|
|zurnglnj|
|hsuemdkh|
|vlfalaxs|
|hsmjolqj|
|uhvkberg|
|ladwcapa|
|qrbkjsql|
+--------+



#### with 3 arrays, no names

In [93]:
dfx = x.arrays_to_dataframe([mystr,myint,mydoub])
print(dfx.count)
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
list list
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql'], array([3, 1, 3, 1, 0, 3, 4, 1, 2, 1, 0, 3]), array([10.03311284, 10.03815753, 10.04005895, 10.09301454, 10.0645929 ,
       10.04592987, 10.02605554, 10.0741538 , 10.00008683, 10.04075202,
       10.03649303, 10.06943678])]
check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql'], array([3, 1, 3, 1, 0, 3, 4, 1, 2, 1, 0, 3]), array([10.03311284, 10.03815753, 10.04005895, 10.09301454, 10.0645929 ,
       10.04592987, 10.02605554, 10.0741538 , 10.00008683, 10.04075202,
       10.03649303, 10.06943678])] 3
['str', 'int32', 'float64']
<bound method DataFrame.count of DataFrame[str: string, int32: bigint, float64: double]>
+--------+-----+------------------+
|     str|int32|           float64|
+--------+-----+------------------+
|qvfdl