In [1]:
# My Standard Spark Session!

# Python libraries:
import os
import sys
import re
from dateutil import parser
# import datetime
from datetime import datetime
from datetime import date
import builtins
import json
import functools
import operator
from itertools import product

# Numpy & Pandas!
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format
pd.options.display.max_columns = None

# Spark!
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *
from pyspark.sql import SparkSession, Row


spark = SparkSession.builder.appName("myapp").getOrCreate()

#     spark = SparkSession.builder.master("yarn")\
#     .config("spark.executor.instances", "32")\
#     .config("spark.executor.cores", "4")\
#     .config("spark.executor.memory", "4G")\
#     .config("spark.driver.memory", "4G")\
#     .config("spark.executor.memoryOverhead","4G")\
#     .config("spark.yarn.queue","Medium")\
#     .appName("myapp")\
#     .getOrCreate()

sc = spark.sparkContext
spark.conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false")
spark.conf.set("spark.debug.maxToStringFields","true")

## DataFrameBuild Class:
DFB = DataFrameBuild(ctx.spark)

In [2]:
# mylib:
my_library = os.path.expanduser('~/.myconfigs')
my_spark = os.path.expanduser('~/spark2_dfanalysis')
sys.path.append(my_library)
sys.path.append(my_spark)

from shared.app_context import *

ctx = ApplicationContext("Dev-Job")
print(ctx.spark)

from builder.DataFrameBuild import *

DFB = DataFrameBuild(ctx.spark)

<pyspark.sql.session.SparkSession object at 0x0000015DF8E849B0>


### Construct some arrays. Put them into a dataframe.

In [3]:
mystr = DFB.build_array("string",num=12,width=8) 
myint = DFB.build_array("integer",num=12,nrange=(0,4))    # inclusive on range
mydoub = DFB.build_array("double",num=12,nrange=(10,10.1))

print(len(mystr),mystr)
print(len(myint),myint)
print(len(mydoub),mydoub)

12 ['noeqgyak', 'tipixkqw', 'lhtsbkpo', 'cmfjbbod', 'jooksore', 'vakjuqgt', 'orluizmb', 'hntxesfm', 'ujwcdavp', 'cwqrwxet', 'diqbzyin', 'ghiyppfg']
12 [4 2 4 4 4 3 0 1 0 1 2 1]
12 [10.03855917 10.01649208 10.00052008 10.09815778 10.00029118 10.0148786
 10.05788824 10.05350295 10.0212721  10.03265851 10.06792971 10.05121041]


In [4]:
df1 = DFB.arrays_to_dataframe([mystr,myint,mydoub,],['strings','integers','doubles'])

In [12]:
df1.limit(4).toPandas()

Unnamed: 0,strings,integers,doubles
0,noeqgyak,4,10.04
1,tipixkqw,2,10.02
2,lhtsbkpo,4,10.0
3,cmfjbbod,4,10.1


### Do it again. Put arrays or lists into a dataframe.

In [6]:
num = 500
df4 = DFB.arrays_to_dataframe(
    [DFB.build_array("string",num=num,width=8),
     DFB.build_array("integer",num=num,nrange=(1,4)),
     DFB.build_array("integer",num=num,nrange=(1,12)),
     DFB.build_array("double",num=num,nrange=(0.0,10000))],
    ['passwords','quarter','month','price'])

In [11]:
df4.limit(4).toPandas()

Unnamed: 0,index,passwords,quarter,month,price
0,1,oejkguml,2,4,1724.64
1,2,ydvsvdtt,1,1,2553.44
2,3,vpbchkyf,4,11,3941.07
3,4,hexvraiy,3,2,2856.28


### Manually use an index column in my dataframe.

In [8]:
num = 5000
df4 = DFB.arrays_to_dataframe(
    [[int(x) for x in np.linspace(1,num,num)],
     DFB.build_array("string",num=num,width=8),
     DFB.build_array("integer",num=num,nrange=(1,4)),
     DFB.build_array("integer",num=num,nrange=(1,12)),
     DFB.build_array("double",num=num,nrange=(0.0,10000))],
    ['index','passwords','quarter','month','price'])

In [10]:
df4.limit(4).toPandas()

Unnamed: 0,index,passwords,quarter,month,price
0,1,oejkguml,2,4,1724.64
1,2,ydvsvdtt,1,1,2553.44
2,3,vpbchkyf,4,11,3941.07
3,4,hexvraiy,3,2,2856.28


In [26]:
df4.orderBy("index").show()

+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| jlpiswsg|      3|    3| 8964.198245068565|
|    2| ejdrdeus|      3|    6| 4245.420764266281|
|    3| tsetpsdf|      1|    2| 5728.924778062631|
|    4| ycemmlsf|      3|   12|  9991.84568292378|
|    5| xjcnmgja|      3|   10| 3297.053065202732|
|    6| vxayjfka|      1|    1|  8711.36575782409|
|    7| vmpqjfmv|      1|    2|1950.3502282619122|
|    8| hkxhwyyn|      4|    8| 7298.712922583214|
|    9| cemveshd|      1|    4| 6771.822248503308|
|   10| nioqczha|      4|    5| 6649.509515929742|
|   11| ggvznopg|      2|    7| 6740.049227920286|
|   12| wgdmmvcs|      1|    6| 5995.662049202913|
|   13| bwianeoc|      3|    2| 6592.478707689565|
|   14| rgepvkmi|      4|    1| 7222.401984581723|
|   15| sfkqsblv|      2|    7| 8119.525369568308|
|   16| fmzcvckd|      1|    6| 8910.197525803118|
|   17| ctgdppne|      2|    8|

In [27]:
df4.select("index").distinct().count()

5000

### Let's do a random car example and pretend we have used car prices.

In [13]:
x = DataFrameBuild(ctx.spark)
num = 5000
lst_cars = [random.choice(['Honda','Toyota','Chevy','Ford','Tesla','Volkswagon','Hyundai','Jeep']) for x in range(num)]
df4 = x.arrays_to_dataframe([[int(x) for x in np.linspace(1,num,num)],
                             x.build_array("string",num=num,width=8),
                             x.build_array("integer",num=num,nrange=(1,4)),
                             x.build_array("integer",num=num,nrange=(1,12)),
                             x.build_array("double",num=num,nrange=(0.0,50000)),
                             lst_cars],
                             ['index','passwords','quarter','month','price','cars'])

In [14]:
df4.limit(4).toPandas()

Unnamed: 0,index,passwords,quarter,month,price,cars
0,1,bzplzrsi,1,11,3384.64,Tesla
1,2,gtpbbeny,3,7,3737.24,Volkswagon
2,3,lwiquodt,2,9,48822.62,Ford
3,4,mxjbqdxm,3,5,15333.94,Tesla


In [15]:
df4.count() 

5000

### Create DataFrames

#### with a list of list of strings, and list of string of length 1

In [16]:
dfx = x.arrays_to_dataframe([mystr],['random_letters'])
print(dfx.count())
dfx.limit(4).toPandas()

# [StructField(str,StringType,true)]
# ['str']

12


Unnamed: 0,random_letters
0,noeqgyak
1,tipixkqw
2,lhtsbkpo
3,cmfjbbod


#### with a list of list of strings, and a string

In [17]:
dfx = x.arrays_to_dataframe([mystr],'random_letters')
print(dfx.count())
dfx.limit(4).toPandas()

# [StructField(str,StringType,true)]
# ['str']

12


Unnamed: 0,random_letters
0,noeqgyak
1,tipixkqw
2,lhtsbkpo
3,cmfjbbod


#### with lists of length 2

In [18]:
print(myint)
print(mystr)
dfx = x.arrays_to_dataframe([myint,mystr],['random_ints','random_letters'])
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

[4 2 4 4 4 3 0 1 0 1 2 1]
['noeqgyak', 'tipixkqw', 'lhtsbkpo', 'cmfjbbod', 'jooksore', 'vakjuqgt', 'orluizmb', 'hntxesfm', 'ujwcdavp', 'cwqrwxet', 'diqbzyin', 'ghiyppfg']
12
+-----------+--------------+
|random_ints|random_letters|
+-----------+--------------+
|          4|      noeqgyak|
|          2|      tipixkqw|
|          4|      lhtsbkpo|
|          4|      cmfjbbod|
|          4|      jooksore|
|          3|      vakjuqgt|
|          0|      orluizmb|
|          1|      hntxesfm|
|          0|      ujwcdavp|
|          1|      cwqrwxet|
|          2|      diqbzyin|
|          1|      ghiyppfg|
+-----------+--------------+



#### with a list and a string

In [19]:
dfx = x.arrays_to_dataframe(mystr,'crazy_strings')
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

12
+-------------+
|crazy_strings|
+-------------+
|     noeqgyak|
|     tipixkqw|
|     lhtsbkpo|
|     cmfjbbod|
|     jooksore|
|     vakjuqgt|
|     orluizmb|
|     hntxesfm|
|     ujwcdavp|
|     cwqrwxet|
|     diqbzyin|
|     ghiyppfg|
+-------------+



#### with an array and a string

In [20]:
dfx = x.arrays_to_dataframe(np.linspace(0,30,31),'integers')
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

31
+--------+
|integers|
+--------+
|     0.0|
|     1.0|
|     2.0|
|     3.0|
|     4.0|
|     5.0|
|     6.0|
|     7.0|
|     8.0|
|     9.0|
|    10.0|
|    11.0|
|    12.0|
|    13.0|
|    14.0|
|    15.0|
|    16.0|
|    17.0|
|    18.0|
|    19.0|
+--------+
only showing top 20 rows



#### with 1 array, no names

In [21]:
dfx = x.arrays_to_dataframe(mystr)
print(dfx.count())
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

12
+--------+
|     str|
+--------+
|noeqgyak|
|tipixkqw|
|lhtsbkpo|
|cmfjbbod|
|jooksore|
|vakjuqgt|
|orluizmb|
|hntxesfm|
|ujwcdavp|
|cwqrwxet|
|diqbzyin|
|ghiyppfg|
+--------+



#### with 3 arrays, no names

In [22]:
dfx = x.arrays_to_dataframe([mystr,myint,mydoub])
print(dfx.count)
dfx.show()

# [StructField(str,StringType,true)]
# ['str']

<bound method DataFrame.count of DataFrame[str: string, int32: bigint, float64: double]>
+--------+-----+------------------+
|     str|int32|           float64|
+--------+-----+------------------+
|noeqgyak|    4|10.038559168717285|
|tipixkqw|    2|10.016492078950616|
|lhtsbkpo|    4|10.000520082614512|
|cmfjbbod|    4|10.098157775191895|
|jooksore|    4|10.000291182712905|
|vakjuqgt|    3|10.014878603120152|
|orluizmb|    0| 10.05788823870134|
|hntxesfm|    1|10.053502949224294|
|ujwcdavp|    0|10.021272097080564|
|cwqrwxet|    1|10.032658507679978|
|diqbzyin|    2|10.067929705764994|
|ghiyppfg|    1|10.051210407823987|
+--------+-----+------------------+

