# DataFrames 1

Working with Pandas and Spark dataframes

## Step 1 - Initialize Spark

In [None]:
# Reuse an existing `spark` session if the kernel already provides one
# (e.g. when launched through the pyspark shell); otherwise build a local one.
try:
    spark
except NameError:
    import findspark
    findspark.init()  # uses SPARK_HOME
    print("Spark found in : ", findspark.find())

    import pyspark
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    # use a unique temp dir for warehouse dir, so we can run multiple spark sessions in one dir
    import tempfile
    # kept in a notebook-level name so the directory object stays referenced
    # for as long as the session may use it
    tmpdir = tempfile.TemporaryDirectory()

    config = ( SparkConf()
             .setAppName("TestApp")
             .setMaster("local[*]")
             # Spark only recognizes properties under the `spark.` prefix;
             # a bare 'executor.memory' key is never read.
             .set('spark.executor.memory', '2g')
             .set('spark.sql.warehouse.dir', tmpdir.name)
             .set("some_property", "some_value") # another example
             )

    spark = SparkSession.builder.config(conf=config).getOrCreate()

# uiWebUrl can be None when the UI is disabled (spark.ui.enabled=false),
# so guard before parsing the port out of it.
ui_url = spark.sparkContext.uiWebUrl
if ui_url is None:
    print('Spark UI is not running')
else:
    # take the text after the last ':' so IPv6 hosts do not shift the index
    print('Spark UI running on port ' + ui_url.rsplit(':', 1)[-1])

## Step 2 - Create a Pandas DataFrame

Here, we will create a Pandas DF and then convert it to Spark.

In [None]:
import pandas as pd

# Column name -> values; one string, one integer and one float column,
# four rows each.
sample_columns = {
    'col1': ['A', 'B', 'C', 'D'],
    'col2': [10, 20, 30, 40],
    'col3': [1.1, 2.2, 3.3, 4.4],
}

pd_df = pd.DataFrame(sample_columns)
pd_df

## Step 3 - Convert it to Spark DF

In [None]:
# Build a Spark DataFrame from the pandas one; no explicit schema is passed,
# so the column types are left for Spark to work out.
spark_df = spark.createDataFrame(data=pd_df)

# Show the resulting schema, then the rows themselves.
spark_df.printSchema()
spark_df.show()

## Step 4 - Converting from Spark --> Pandas

In [None]:
# describe() is called on the Spark DataFrame, so `summary` is still a Spark DF
summary = spark_df.describe()

# toPandas() turns that Spark result into a pandas DataFrame
summary_pd = summary.toPandas()

# last expression in the cell, so the notebook renders it
summary_pd