# Example usage

## Version check

In [1]:
import sparkit

# Show the installed sparkit version so the outputs below can be tied to a release.
print(sparkit.__version__)

0.0.2


In [2]:
from pyspark.sql import DataFrame, Row, SparkSession

In [3]:
# Minimal local session with a single worker thread ("local[1]"); every
# resource setting is pinned to its smallest useful value because the example
# frames below only hold a few rows.
spark_settings = {
    "spark.driver.memory": "2g",
    "spark.executor.cores": "1",
    "spark.executor.instances": "1",
    "spark.sql.shuffle.partitions": "1",
}
session_builder = SparkSession.builder.master("local[1]").appName("example-usage")
for setting, value in spark_settings.items():
    session_builder = session_builder.config(setting, value)
spark = session_builder.getOrCreate()

# From here on only report errors, to keep Spark's log chatter out of the cell outputs.
spark.sparkContext.setLogLevel("ERROR")

23/04/28 14:45:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## `peek`

Default Spark view:

In [4]:
# Three rows with one missing value in each column, so that every view below
# has nulls to render.
example_rows = [
    Row(x=1, y="a"),
    Row(x=3, y=None),
    Row(x=None, y="c"),
]
df = spark.createDataFrame(example_rows)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----+----+
|   x|   y|
+----+----+
|   1|   a|
|   3|null|
|null|   c|
+----+----+



                                                                                

Default pandas view:

In [5]:
# Note in the output that the integer column x is rendered as floats once a
# missing value is present, and that a 0-based pandas index is added.
df.toPandas()

Unnamed: 0,x,y
0,1.0,a
1,3.0,
2,,c


Default `peek` view (shape of the data frame is always printed):

In [6]:
# Passing peek without calling it uses its defaults; the trailing semicolon
# keeps the notebook from also echoing the data frame returned by transform.
df.transform(sparkit.peek);

shape = (3, 2)


x,y
1.0,a
3.0,
,c


Cache data frame and show no rows:

In [7]:
# With n=0 no rows are previewed, so the shape line is the only output.
df.transform(sparkit.peek(n=0, cache=True));

shape = (3, 2)


Print schema and show only one row:

In [8]:
# schema=True prints the schema ahead of the shape; n=1 limits the preview to one row.
df.transform(sparkit.peek(n=1, schema=True));

root
 |-- x: long (nullable = true)
 |-- y: string (nullable = true)

shape = (3, 2)


x,y
1,a


Peek before and after filtering and dropping a column (with `index=True`, each preview shows a consecutive row index starting at 1):

In [9]:
# peek hands a data frame back to the chain, so it can sit between the steps
# of a transformation pipeline and report the intermediate state at each point.
filtered_df = (
    df.transform(sparkit.peek(index=True))  # before: 3 rows, 2 columns
    .where("y IS NOT NULL")
    .drop("x")
    .transform(sparkit.peek(index=True))  # after: 2 rows, 1 column
)

shape = (3, 2)


Unnamed: 0,x,y
1,1.0,a
2,3.0,
3,,c


shape = (2, 1)


Unnamed: 0,y
1,a
2,c


In [10]:
# The chain with peek steps still evaluates to a regular Spark DataFrame.
isinstance(filtered_df, DataFrame)

True

In [11]:
# Shut the session down; the remaining examples use pandas only.
spark.stop()

# `pandastools`

In [12]:
import pandas as pd
from sparkit import pandastools

In [13]:
# Two small frames with the same default index but different column names.
df1 = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
df2 = pd.DataFrame({"c": [5, 7], "d": [6, 8]})

## `join`

In [14]:
# The result shows the columns of both frames side by side, one row per index label.
pandastools.join(df1, df2)

Unnamed: 0,a,b,c,d
0,1,2,5,6
1,3,4,7,8


## `profile`

In [15]:
# One float column with a NaN, one integer column, and one string column with
# a None, so the profile has both numeric and non-numeric columns to summarise.
df = pd.DataFrame(
    {
        "x": [1.0, 3.0, float("nan"), 5.0],
        "y": [2, 2, 6, 6],
        "z": ["a", "b", None, "d"],
    }
)
df

Unnamed: 0,x,y,z
0,1.0,2,a
1,3.0,2,b
2,,6,
3,5.0,6,d


In [16]:
# One profile row per column. The count excludes missing values, the "%"
# columns are fractions of the row count (0.25 = 1 of 4), and the numeric
# statistics stay empty for the non-numeric column z.
pandastools.profile(df)

Unnamed: 0,type,count,isnull,isnull%,unique,unique%,mean,std,skewness,kurtosis,min,5%,25%,50%,75%,95%,max
x,float64,3,1,0.25,3,0.75,3.0,2.0,0.0,,1.0,1.2,2.0,3.0,4.0,4.8,5.0
y,int64,4,0,0.0,2,0.5,4.0,2.309401,0.0,-6.0,2.0,2.0,2.0,4.0,6.0,6.0,6.0
z,object,3,1,0.25,3,0.75,,,,,,,,,,,


## `union`

In [17]:
# The two frames have no column in common, so the stacked result keeps all
# four columns and leaves the cells a frame cannot supply empty.
df1 = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
df2 = pd.DataFrame({"c": [5, 7], "d": [6, 8]})
pandastools.union(df1, df2)

Unnamed: 0,a,b,c,d
0,1.0,2.0,,
1,3.0,4.0,,
0,,,5.0,6.0
1,,,7.0,8.0
