In [32]:
# My Standard Spark Session!

# Python libraries:
import os
import sys
import re
from dateutil import parser
# import datetime
from datetime import datetime
from datetime import date
import builtins
import json
import functools
import operator
from itertools import product

# Numpy & Pandas!
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format
pd.options.display.max_columns = None

# Spark!
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *
from pyspark.sql import SparkSession, Row


spark = SparkSession.builder.appName("myapp").getOrCreate()

#     spark = SparkSession.builder.master("yarn")\
#     .config("spark.executor.instances", "32")\
#     .config("spark.executor.cores", "4")\
#     .config("spark.executor.memory", "4G")\
#     .config("spark.driver.memory", "4G")\
#     .config("spark.executor.memoryOverhead","4G")\
#     .config("spark.yarn.queue","Medium")\
#     .appName("myapp")\
#     .getOrCreate()

sc = spark.sparkContext
spark.conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false")
spark.conf.set("spark.debug.maxToStringFields","true")

## DataFrameBuild Class:
DFB = DataFrameBuild(ctx.spark)

In [12]:
# mylib:
my_library = os.path.expanduser('~/.myconfigs')
my_spark = os.path.expanduser('~/spark2_dfanalysis')
sys.path.append(my_library)
sys.path.append(my_spark)

from shared.app_context import *

ctx = ApplicationContext("Dev-Job")
print(ctx.spark)

from builder.DataFrameBuild import *

DFB = DataFrameBuild(ctx.spark)

<pyspark.sql.session.SparkSession object at 0x000001B66C6396D8>


### Construct some arrays. Put them into a dataframe.

In [13]:
mystr = DFB.build_array("string",num=12,width=8) 
myint = DFB.build_array("integer",num=12,nrange=(0,4))    # inclusive on range
mydoub = DFB.build_array("double",num=12,nrange=(10,10.1))

print(len(mystr),mystr)
print(len(myint),myint)
print(len(mydoub),mydoub)

12 ['kyfwpywr', 'kxawuujf', 'ktsjabfg', 'qmtilzdp', 'adfqsgim', 'ygcyuanc', 'zfdivbac', 'nowembld', 'hlainypj', 'bzwwupqv', 'gyrtndvw', 'zmxvfrny']
12 [0 4 4 4 3 1 1 3 2 2 4 2]
12 [10.03300854 10.0578753  10.05915789 10.05929665 10.0571665  10.08244016
 10.02967528 10.03900109 10.01081468 10.09808571 10.03417946 10.00277596]


In [22]:
df1 = DFB.arrays_to_dataframe([mystr,myint,mydoub,],['strings','integers','doubles'])

In [23]:
df1.show(5)

+--------+--------+------------------+
| strings|integers|           doubles|
+--------+--------+------------------+
|kyfwpywr|       0|10.033008538507271|
|kxawuujf|       4|10.057875298946321|
|ktsjabfg|       4| 10.05915788504964|
|qmtilzdp|       4|10.059296654468106|
|adfqsgim|       3|10.057166502376846|
+--------+--------+------------------+
only showing top 5 rows



### Do it again. Put arrays or lists into a dataframe.

In [15]:
num = 500
df4 = DFB.arrays_to_dataframe(
    [DFB.build_array("string",num=num,width=8),
     DFB.build_array("integer",num=num,nrange=(1,4)),
     DFB.build_array("integer",num=num,nrange=(1,12)),
     DFB.build_array("double",num=num,nrange=(0.0,10000))],
    ['passwords','quarter','month','price'])

In [16]:
df4.limit(10).toPandas()

Unnamed: 0,passwords,quarter,month,price
0,pcvtkbwz,4,7,3192.24
1,hhkjqeyj,3,9,9046.05
2,bajzarmu,3,7,1063.65
3,akmehpwx,3,12,6813.92
4,cefspgkv,3,9,7846.75
5,ywdtjqml,4,1,7503.86
6,wolgbdtm,1,9,3494.69
7,jguxnsco,2,12,9564.74
8,woenzdqh,1,7,3989.97
9,oerhdwhy,1,4,7230.76


### Manually use an index column in my dataframe.

In [24]:
num = 5000
df4 = DFB.arrays_to_dataframe(
    [[int(x) for x in np.linspace(1,num,num)],
     DFB.build_array("string",num=num,width=8),
     DFB.build_array("integer",num=num,nrange=(1,4)),
     DFB.build_array("integer",num=num,nrange=(1,12)),
     DFB.build_array("double",num=num,nrange=(0.0,10000))],
    ['index','passwords','quarter','month','price'])

In [25]:
df4.show()

+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| jlpiswsg|      3|    3| 8964.198245068565|
|    2| ejdrdeus|      3|    6| 4245.420764266281|
|    3| tsetpsdf|      1|    2| 5728.924778062631|
|    4| ycemmlsf|      3|   12|  9991.84568292378|
|    5| xjcnmgja|      3|   10| 3297.053065202732|
|    6| vxayjfka|      1|    1|  8711.36575782409|
|    7| vmpqjfmv|      1|    2|1950.3502282619122|
|    8| hkxhwyyn|      4|    8| 7298.712922583214|
|    9| cemveshd|      1|    4| 6771.822248503308|
|   10| nioqczha|      4|    5| 6649.509515929742|
|   11| ggvznopg|      2|    7| 6740.049227920286|
|   12| wgdmmvcs|      1|    6| 5995.662049202913|
|   13| bwianeoc|      3|    2| 6592.478707689565|
|   14| rgepvkmi|      4|    1| 7222.401984581723|
|   15| sfkqsblv|      2|    7| 8119.525369568308|
|   16| fmzcvckd|      1|    6| 8910.197525803118|
|   17| ctgdppne|      2|    8|

In [26]:
df4.orderBy("index").show()

+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| jlpiswsg|      3|    3| 8964.198245068565|
|    2| ejdrdeus|      3|    6| 4245.420764266281|
|    3| tsetpsdf|      1|    2| 5728.924778062631|
|    4| ycemmlsf|      3|   12|  9991.84568292378|
|    5| xjcnmgja|      3|   10| 3297.053065202732|
|    6| vxayjfka|      1|    1|  8711.36575782409|
|    7| vmpqjfmv|      1|    2|1950.3502282619122|
|    8| hkxhwyyn|      4|    8| 7298.712922583214|
|    9| cemveshd|      1|    4| 6771.822248503308|
|   10| nioqczha|      4|    5| 6649.509515929742|
|   11| ggvznopg|      2|    7| 6740.049227920286|
|   12| wgdmmvcs|      1|    6| 5995.662049202913|
|   13| bwianeoc|      3|    2| 6592.478707689565|
|   14| rgepvkmi|      4|    1| 7222.401984581723|
|   15| sfkqsblv|      2|    7| 8119.525369568308|
|   16| fmzcvckd|      1|    6| 8910.197525803118|
|   17| ctgdppne|      2|    8|

In [27]:
df4.select("index").distinct().count()

5000

### Let's do a random car example and pretend we have used car prices.

In [28]:
x = DataFrameBuild(ctx.spark)
num = 5000
lst_cars = [random.choice(['Honda','Toyota','Chevy','Ford','Tesla','Volkswagon','Hyundai','Jeep']) for x in range(num)]
df4 = x.arrays_to_dataframe([[int(x) for x in np.linspace(1,num,num)],
                             x.build_array("string",num=num,width=8),
                             x.build_array("integer",num=num,nrange=(1,4)),
                             x.build_array("integer",num=num,nrange=(1,12)),
                             x.build_array("double",num=num,nrange=(0.0,50000)),
                             lst_cars],
                             ['index','passwords','quarter','month','price','cars'])

In [29]:
df4.show(20)

+-----+---------+-------+-----+------------------+----------+
|index|passwords|quarter|month|             price|      cars|
+-----+---------+-------+-----+------------------+----------+
|    1| ndbiiihs|      2|    3|20838.777549931274|     Tesla|
|    2| vcisnjet|      1|   12| 45341.68113124216|   Hyundai|
|    3| wuavbpwe|      2|    1|26244.359751626762|    Toyota|
|    4| atahdxzr|      2|    3| 45047.03759353193|     Tesla|
|    5| lfuqndxv|      3|    9|  39836.4657254842|      Ford|
|    6| ncbgwbpl|      2|    7|22669.262209909146|     Honda|
|    7| olzvytpz|      4|   11| 172.6199441168641|    Toyota|
|    8| iabtdpwz|      3|   12|15924.463977921738|      Jeep|
|    9| apfhwmwd|      4|   11|24806.376448203722|Volkswagon|
|   10| hvlgxpyn|      3|   12| 42010.34197572011|     Honda|
|   11| xoyvmjuw|      2|    1|2535.9504418552447|     Chevy|
|   12| toyoggpa|      2|    8| 23823.19595397488|      Jeep|
|   13| dniyzhyt|      1|   10| 4711.464054547971|      Ford|
|   14| 

In [30]:
df4.count() 

5000

### Create DataFrames

#### with a list of list of strings, and list of string of length 1

In [31]:
dfx = x.arrays_to_dataframe([mystr],['random_letters'])
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

12
+--------------+
|random_letters|
+--------------+
|      kyfwpywr|
|      kxawuujf|
|      ktsjabfg|
|      qmtilzdp|
|      adfqsgim|
|      ygcyuanc|
|      zfdivbac|
|      nowembld|
|      hlainypj|
|      bzwwupqv|
|      gyrtndvw|
|      zmxvfrny|
+--------------+



#### with a list of list of strings, and a string

In [87]:
dfx = x.arrays_to_dataframe([mystr],'random_letters')
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
list list
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']]
check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 1
['random_letters']
12
+--------------+
|random_letters|
+--------------+
|      qvfdlpks|
|      vpyviyux|
|      ypqyinsl|
|      rigvlcog|
|      qngoiobg|
|      zurnglnj|
|      hsuemdkh|
|      vlfalaxs|
|      hsmjolqj|
|      uhvkberg|
|      ladwcapa|
|      qrbkjsql|
+--------------+



#### with lists of length 2

In [89]:
print(myint)
print(mystr)
dfx = x.arrays_to_dataframe([myint,mystr],['random_ints','random_letters'])
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

[3 1 3 1 0 3 4 1 2 1 0 3]
['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']
Levels:
list ndarray
[array([3, 1, 3, 1, 0, 3, 4, 1, 2, 1, 0, 3]), ['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']]
check:
[array([3, 1, 3, 1, 0, 3, 4, 1, 2, 1, 0, 3]), ['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 2
['random_ints', 'random_letters']
12
+-----------+--------------+
|random_ints|random_letters|
+-----------+--------------+
|          3|      qvfdlpks|
|          1|      vpyviyux|
|          3|      ypqyinsl|
|          1|      rigvlcog|
|          0|      qngoiobg|
|          3|      zurnglnj|
|          4|      hsuemdkh|
|          1|      vlfalaxs|
|          2|      hsmjolqj|
|          1|      uhvkberg|
|

#### with a list and a string

In [90]:
dfx = x.arrays_to_dataframe(mystr,'crazy_strings')
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
list str
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']]
check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 1
['crazy_strings']
12
+-------------+
|crazy_strings|
+-------------+
|     qvfdlpks|
|     vpyviyux|
|     ypqyinsl|
|     rigvlcog|
|     qngoiobg|
|     zurnglnj|
|     hsuemdkh|
|     vlfalaxs|
|     hsmjolqj|
|     uhvkberg|
|     ladwcapa|
|     qrbkjsql|
+-------------+



#### with an array and a string

In [91]:
dfx = x.arrays_to_dataframe(np.linspace(0,30,31),'integers')
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
ndarray None
[array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30.])]
check:
[array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30.])] 1
['integers']
31
+--------+
|integers|
+--------+
|     0.0|
|     1.0|
|     2.0|
|     3.0|
|     4.0|
|     5.0|
|     6.0|
|     7.0|
|     8.0|
|     9.0|
|    10.0|
|    11.0|
|    12.0|
|    13.0|
|    14.0|
|    15.0|
|    16.0|
|    17.0|
|    18.0|
|    19.0|
+--------+
only showing top 20 rows



#### with 1 array, no names

In [92]:
dfx = x.arrays_to_dataframe(mystr)
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
list str
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']]
check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql']] 1
['str']
12
+--------+
|     str|
+--------+
|qvfdlpks|
|vpyviyux|
|ypqyinsl|
|rigvlcog|
|qngoiobg|
|zurnglnj|
|hsuemdkh|
|vlfalaxs|
|hsmjolqj|
|uhvkberg|
|ladwcapa|
|qrbkjsql|
+--------+



#### with 3 arrays, no names

In [93]:
dfx = x.arrays_to_dataframe([mystr,myint,mydoub])
print(dfx.count)
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

Levels:
list list
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql'], array([3, 1, 3, 1, 0, 3, 4, 1, 2, 1, 0, 3]), array([10.03311284, 10.03815753, 10.04005895, 10.09301454, 10.0645929 ,
       10.04592987, 10.02605554, 10.0741538 , 10.00008683, 10.04075202,
       10.03649303, 10.06943678])]
check:
[['qvfdlpks', 'vpyviyux', 'ypqyinsl', 'rigvlcog', 'qngoiobg', 'zurnglnj', 'hsuemdkh', 'vlfalaxs', 'hsmjolqj', 'uhvkberg', 'ladwcapa', 'qrbkjsql'], array([3, 1, 3, 1, 0, 3, 4, 1, 2, 1, 0, 3]), array([10.03311284, 10.03815753, 10.04005895, 10.09301454, 10.0645929 ,
       10.04592987, 10.02605554, 10.0741538 , 10.00008683, 10.04075202,
       10.03649303, 10.06943678])] 3
['str', 'int32', 'float64']
<bound method DataFrame.count of DataFrame[str: string, int32: bigint, float64: double]>
+--------+-----+------------------+
|     str|int32|           float64|
+--------+-----+------------------+
|qvfdl