# DataFrame Builder:

In [29]:
import os
import random
import sys

import numpy as np
import pandas as pd

# Render floats in pandas output with thousands separators and two decimals,
# padded to a minimum width of 8 characters.
pd.set_option('display.float_format', '{:8,.2f}'.format)

In [2]:
# Local library directories under the user's home that must be importable.
# NOTE(review): presumably these hold the `shared` and `builder` packages
# imported further down in this notebook -- confirm.
my_library = os.path.expanduser('~/.myconfigs')
my_spark = os.path.expanduser('~/spark2_dfanalysis')

# Append each directory only if it is not already present, so that re-running
# this cell does not keep growing sys.path with duplicate entries.
for _local_dir in (my_library, my_spark):
    if _local_dir not in sys.path:
        sys.path.append(_local_dir)

In [3]:
# NOTE(review): this alias binds the pyspark *package* to the name `spark`,
# a name usually reserved for a SparkSession (the commented-out line at the
# end of this cell would rebind it to one).
import pyspark as spark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# NOTE(review): pyspark.sql.functions defines names such as `sum`, `min`,
# `max`, `abs` and `round`; this star import rebinds them over the Python
# builtins for every cell that runs afterwards.
from pyspark.sql.functions import *

# Alternative way to obtain a session directly. Left disabled: the cells below
# take the session from ApplicationContext (ctx.spark) instead.
# spark = SparkSession.builder.appName('myappname').getOrCreate()
# print(spark)

In [4]:
# print(dir(pyspark))
# print(dir(pyspark.sql))
# print(dir(pyspark.rdd))

In [5]:
# Star import; presumably the source of ApplicationContext. Whatever else the
# module exports also lands in the notebook namespace (not visible from here).
from shared.app_context import *

# Create the application context and print the SparkSession it exposes.
# NOTE(review): "Dev-Job" looks like an application/job name -- confirm against
# ApplicationContext's signature.
ctx = ApplicationContext("Dev-Job")
print(ctx.spark)

<pyspark.sql.session.SparkSession object at 0x00000258EC00FEF0>


In [6]:
# print(sys.path)
from builder.DataFrameBuild import *

## DataFrameBuild Class:
### x = DataFrameBuild(ctx.spark)

In [7]:
# Builder used by the examples below, created from the context's SparkSession.
x = DataFrameBuild(ctx.spark)

### build_array function

In [8]:
# One 12-element sample for each of the three type names used in this notebook.
mystr = x.build_array("string", num=12, width=8)
myint = x.build_array("integer", num=12, nrange=(0, 4))  # both ends of nrange can be drawn
mydoub = x.build_array("double", num=12, nrange=(10, 10.1))

# Print each sample's length followed by its contents.
for sample in (mystr, myint, mydoub):
    print(len(sample), sample)

12 ['opeizyzr', 'attpzusd', 'quvokdqv', 'wxmouazi', 'dmkeherw', 'ehlkthcc', 'wnvrknzf', 'fldkrdji', 'nmplkbti', 'bnitphox', 'cxyaeaow', 'phhovscg']
12 [4 4 4 1 3 0 4 1 4 4 1 0]
12 [10.05958403 10.04038789 10.02415255 10.03016893 10.09653555 10.06439566
 10.05368564 10.09777727 10.03880275 10.03040578 10.01831231 10.09674956]


### Ex1. Combine generated arrays into a DataFrame (500 rows, 4 columns).

In [10]:
x = DataFrameBuild(ctx.spark)
num = 500

# (column name, generated values) pairs; list order is the column order.
ex1_columns = [
    ('passwords', x.build_array("string", num=num, width=8)),
    ('quarter', x.build_array("integer", num=num, nrange=(1, 4))),
    ('month', x.build_array("integer", num=num, nrange=(1, 12))),
    ('price', x.build_array("double", num=num, nrange=(0.0, 10000))),
]

# arrays_to_dataframe takes the value arrays and the column names as two
# parallel lists.
df4 = x.arrays_to_dataframe([values for _, values in ex1_columns],
                            [name for name, _ in ex1_columns])

In [11]:
# Preview: take the first 10 rows and render them as a pandas DataFrame.
df4.limit(10).toPandas()

Unnamed: 0,passwords,quarter,month,price
0,xdjmhvdv,3,7,512.07
1,hdttuclf,1,2,9135.76
2,fqxfoius,3,2,2469.62
3,srsvkoaj,4,11,2648.54
4,jtqpjbuh,3,11,3399.43
5,sfgvcmph,2,4,8059.82
6,ktyhaltn,1,4,6793.42
7,oqvsrbjf,4,2,6056.37
8,yomocvlg,1,12,8122.58
9,mmeiyrcn,1,2,7984.3


### Ex2. Combine arrays and lists into a DataFrame, adding an explicit 1..num `index` column.

In [12]:
x = DataFrameBuild(ctx.spark)
num = 500
# The first column is an explicit 1..num index. range() yields exact Python
# ints directly, so no float conversion is needed and no loop variable has to
# share the name `x` with the builder.
df4 = x.arrays_to_dataframe([list(range(1, num + 1)),
                             x.build_array("string",num=num,width=8),
                             x.build_array("integer",num=num,nrange=(1,4)),
                             x.build_array("integer",num=num,nrange=(1,12)),
                             x.build_array("double",num=num,nrange=(0.0,10000))],
                             ['index','passwords','quarter','month','price'])

In [13]:
# Text preview of the frame built above; no ordering is requested here.
df4.show()

+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| cyztyrko|      4|    5|314.34591154064105|
|    2| dnjjnmog|      3|   12|2539.7816194662882|
|    3| zoxkrvlh|      2|   12| 9801.939184016917|
|    4| cvijsyye|      4|    9| 3677.520232671988|
|    5| okdhgred|      4|   12|276.09900447780444|
|    6| hmlpmbee|      2|   10| 9536.927951560743|
|    7| tyegbebb|      3|    9| 9286.739083559703|
|    8| mgifqqyk|      2|    6|1187.9236726452248|
|    9| ifapfcib|      1|    1| 7746.014392663392|
|   10| vgsirpgx|      3|    4|2773.1656526600714|
|   11| xwgwkcds|      3|   11|1219.0549284092378|
|   12| vipkcbag|      4|   10| 9677.401001532802|
|   13| cssmcddg|      1|   12|3563.4784574583246|
|   14| cujfjyra|      2|    1| 9453.952312881494|
|   15| rjgzczvx|      1|    3| 2465.080085638733|
|   16| qwxtfcwm|      1|   11| 4954.020688559832|
|   17| cdfsadig|      3|    7|

In [14]:
# Text preview with the rows explicitly sorted by the `index` column.
df4.orderBy("index").show()

+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| cyztyrko|      4|    5|314.34591154064105|
|    2| dnjjnmog|      3|   12|2539.7816194662882|
|    3| zoxkrvlh|      2|   12| 9801.939184016917|
|    4| cvijsyye|      4|    9| 3677.520232671988|
|    5| okdhgred|      4|   12|276.09900447780444|
|    6| hmlpmbee|      2|   10| 9536.927951560743|
|    7| tyegbebb|      3|    9| 9286.739083559703|
|    8| mgifqqyk|      2|    6|1187.9236726452248|
|    9| ifapfcib|      1|    1| 7746.014392663392|
|   10| vgsirpgx|      3|    4|2773.1656526600714|
|   11| xwgwkcds|      3|   11|1219.0549284092378|
|   12| vipkcbag|      4|   10| 9677.401001532802|
|   13| cssmcddg|      1|   12|3563.4784574583246|
|   14| cujfjyra|      2|    1| 9453.952312881494|
|   15| rjgzczvx|      1|    3| 2465.080085638733|
|   16| qwxtfcwm|      1|   11| 4954.020688559832|
|   17| cdfsadig|      3|    7|

In [84]:
# Sanity check: number of distinct `index` values; it should equal `num`,
# since the index column was built as 1..num.
# NOTE(review): the execution count on this cell (84) is out of sequence with
# its neighbours -- re-run the notebook top to bottom before trusting the result.
df4.select("index").distinct().count()

500

### Ex3. Combine arrays and lists into a DataFrame: 5,000 rows, with an `index` column and a `cars` column built from a plain Python list.

In [30]:
x = DataFrameBuild(ctx.spark)
num = 5000

# Values drawn at random (with repetition) for the `cars` column.
# NOTE(review): 'Volkswagon' is kept exactly as it appears in the recorded
# output; the usual spelling is 'Volkswagen'.
car_makes = ['Honda','Toyota','Chevy','Ford','Tesla','Volkswagon','Hyundai','Jeep']

# Requires `random` in scope (imported with the other stdlib modules in the
# first cell). `_` marks the loop variable as unused and keeps the name `x`
# for the builder only.
lst_cars = [random.choice(car_makes) for _ in range(num)]

# The first column is an explicit 1..num index; range() yields exact Python
# ints directly, so no float conversion is needed.
df4 = x.arrays_to_dataframe([list(range(1, num + 1)),
                             x.build_array("string",num=num,width=8),
                             x.build_array("integer",num=num,nrange=(1,4)),
                             x.build_array("integer",num=num,nrange=(1,12)),
                             x.build_array("double",num=num,nrange=(0.0,50000)),
                             lst_cars],
                             ['index','passwords','quarter','month','price','cars'])

In [31]:
# Text preview of the first 20 rows of the frame built above.
df4.show(20)

+-----+---------+-------+-----+------------------+----------+
|index|passwords|quarter|month|             price|      cars|
+-----+---------+-------+-----+------------------+----------+
|    1| upaiogwd|      2|    1|31834.913179518964|Volkswagon|
|    2| aofdlaoj|      3|    4| 46222.26741716476|     Chevy|
|    3| rqbugoax|      4|    9| 5729.822129404549|      Ford|
|    4| vctpzwfm|      2|    9| 540.3028094369378|     Chevy|
|    5| pxumxlyj|      2|    1|23723.352432642987|    Toyota|
|    6| mrdrttae|      2|    4|15590.222872908716|     Honda|
|    7| ykyretny|      2|    5| 49120.26087935749|      Ford|
|    8| fetqaacg|      1|    9| 3148.676126666838|Volkswagon|
|    9| buqokjjj|      1|    2| 7208.513561178865|     Chevy|
|   10| awgyxjrm|      2|   11|49425.871056957985|     Tesla|
|   11| esvezafb|      1|    4| 4487.474654253665|    Toyota|
|   12| gglnloij|      4|    2| 41794.59162045686|     Chevy|
|   13| idtpxsgc|      3|   12| 8166.171991164645|Volkswagon|
|   14| 

In [32]:
# Row count of the frame built above; expected to equal `num` (5000).
df4.count()

5000