# Example usage

## Version check

In [1]:
import sparkit

# Print the installed sparkit version so the outputs below can be tied to a release.
print(sparkit.__version__)

0.0.2


In [2]:
from pyspark.sql import Row, SparkSession

In [3]:
# Build (or reuse) a small local Spark session for the examples below.
spark = (
    SparkSession.builder
    .master("local[1]")  # local master, single worker thread
    .appName("example-usage")
    # Keep the footprint minimal: this notebook only handles a three-row frame.
    .config("spark.driver.memory", "2g")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "1")
    .config("spark.sql.shuffle.partitions", "1")
    .getOrCreate()
)

# Quieten Spark's logging from this point on; only ERROR-level messages remain.
spark.sparkContext.setLogLevel("ERROR")

23/03/13 21:21:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### `peek`

Default Spark view:

In [4]:
# Three-row example frame; each column contains exactly one missing value.
df = spark.createDataFrame(
    [
        Row(x=1, y="a"),
        Row(x=3, y=None),
        Row(x=None, y="c"),
    ]
)

# Spark's built-in plain-text table rendering.
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----+----+
|   x|   y|
+----+----+
|   1|   a|
|   3|null|
|null|   c|
+----+----+





Default pandas view:

In [5]:
# Convert to a pandas DataFrame; as the cell's last expression it is displayed by the notebook.
df.toPandas()

Unnamed: 0,x,y
0,1.0,a
1,3.0,
2,,c


Default `peek` view:

In [6]:
# Pass `sparkit.peek` itself (no arguments) as the transform function.
# The trailing `;` stops the notebook from also displaying the returned value.
df.transform(sparkit.peek);

shape=(3, 2)


x,y
1.0,a
3.0,
,c


Show no rows:

In [7]:
# `n=0` asks for zero preview rows, so only the shape line appears in the output.
df.transform(sparkit.peek(n=0));

shape=(3, 2)


Show two rows with schema and index:

In [8]:
# `n=2` limits the preview to two rows; `schema=True` and `index=True` additionally
# show the schema and an index column alongside the shape.
df.transform(sparkit.peek(n=2, schema=True, index=True));

shape=(3, 2)

root
 |-- x: long (nullable = true)
 |-- y: string (nullable = true)



Unnamed: 0,x,y
1,1,a
2,3,


Peek before and after filtering:

In [9]:
# `peek` can sit anywhere in a transformation chain: here it is applied once
# before and once after the filter, so both shapes are reported.
filtered_df = (
    df
    .transform(sparkit.peek)  # inspect the unfiltered frame
    .where("x IS NOT NULL")
    .transform(sparkit.peek)  # inspect the result of the filter
)

shape=(3, 2)


x,y
1.0,a
3.0,
,c


shape=(2, 2)


x,y
1,a
3,


In [10]:
# Shut down the Spark session now that the examples are finished.
spark.stop()