In [7]:
import pyspark

# Reuse the active SparkSession if one already exists; otherwise start one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

# Spark API Mini Exercises

Copy the code below to create a pandas dataframe with 20 rows and 3 columns:

```python
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)
```

**1. Spark Dataframe Basics**

    1. Use the starter code above to create a pandas dataframe.
    2. Convert the pandas dataframe to a spark dataframe. From this point
       forward, do all of your work with the spark dataframe, not the pandas
       dataframe.
    3. Show the first 3 rows of the dataframe.
    4. Show the first 7 rows of the dataframe.
    5. View a summary of the data using `.describe`.
    6. Use `.select` to create a new dataframe with just the `n` and `abool`
       columns. View the first 5 rows of this dataframe.
    7. Use `.select` to create a new dataframe with just the `group` and `abool`
       columns. View the first 5 rows of this dataframe.
    8. Use `.select` to create a new dataframe with the `group` column and the
       `abool` column renamed to `a_boolean_value`. Show the first 3 rows of
       this dataframe.
    9. Use `.select` to create a new dataframe with the `group` column and the
       `n` column renamed to `a_numeric_value`. Show the first 6 rows of this
       dataframe.


#### 1. Use the starter code above to create a pandas dataframe.

In [4]:
import pandas as pd
import numpy as np

# Fix the legacy global NumPy seed so the generated values are reproducible.
np.random.seed(13)

# Build the 20-row, 3-column frame. Keyword arguments are evaluated left to
# right, so the random draws happen in the order n, group, abool.
pandas_dataframe = pd.DataFrame(
    dict(
        n=np.random.randn(20),
        group=np.random.choice(["x", "y", "z"], size=20),
        abool=np.random.choice([True, False], size=20),
    )
)

In [5]:
# Display the pandas dataframe to inspect the generated columns (n, group, abool).
pandas_dataframe

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False
5,0.532338,y,False
6,1.350188,z,False
7,0.861211,x,False
8,1.478686,z,True
9,-1.045377,y,True


#### 2. Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [8]:
# Hand the pandas frame to Spark; all later steps work on this Spark dataframe.
df = spark.createDataFrame(data=pandas_dataframe)

In [9]:
# Print the Spark dataframe as a text table (show() prints rather than returns).
df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



#### 3. Show the first 3 rows of the dataframe.

In [10]:
# Print only the first 3 rows.
df.show(n=3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



#### 4. Show the first 7 rows of the dataframe.

In [12]:
# Print only the first 7 rows.
df.show(n=7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



#### 5. View a summary of the data using `.describe`.

In [13]:
# describe() returns another Spark dataframe; evaluating it bare displays only
# its schema, not the statistics (see the next cell for the values).
df.describe()

DataFrame[summary: string, n: string, group: string]

In [14]:
# Render the summary statistics. Only `n` and `group` appear in the result;
# the boolean `abool` column is not included in the describe() output.
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.36640264498852165| null|
| stddev| 0.8905322898155364| null|
|    min| -1.261605945319069|    x|
|    max| 2.1503829673811126|    z|
+-------+-------------------+-----+



#### 6. Use `.select` to create a new dataframe with just the `n` and `abool` columns. View the first 5 rows of this dataframe.

In [16]:
# Quick look at the two-column projection before saving it to a variable.
# show() is called without a row count here, so this is not limited to 5 rows.
df.select(["n", "abool"]).show()

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
|  0.5323378882945463|false|
|  1.3501878997225267|false|
|  0.8612113741693206|false|
|  1.4786857374358966| true|
| -1.0453771305385342| true|
| -0.7889890249515489|false|
|  -1.261605945319069|false|
|  0.5628467852810314| true|
|-0.24332625188556253| true|
|  0.9137407048596775|false|
| 0.31735092273633597|false|
| 0.12730328020698067|false|
|  2.1503829673811126| true|
|  0.6062886568962988|false|
|-0.02677164998644...| true|
+--------------------+-----+



In [17]:
# New dataframe holding just the `n` and `abool` columns.
new_df = df.select(["n", "abool"])

In [18]:
# View the first 5 rows of the projection.
new_df.show(n=5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



#### 7. Use `.select` to create a new dataframe with just the `group` and `abool` columns. View the first 5 rows of this dataframe.

In [19]:
# New dataframe holding just the `group` and `abool` columns.
new_ga = df.select(["group", "abool"])

In [20]:
# View the first 5 rows of the projection.
new_ga.show(n=5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



#### 8. Use `.select` to create a new dataframe with the `group` column and the `abool` column renamed to `a_boolean_value`. Show the first 3 rows of this dataframe.

In [24]:
# Preview the rename: keep `group` and expose `abool` as `a_boolean_value`.
# show() is called without a row count here, so this is not limited to 3 rows.
df.select("group", df["abool"].alias("a_boolean_value")).show()

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
|    y|          false|
|    z|          false|
|    y|          false|
|    z|          false|
|    x|          false|
|    z|           true|
|    y|           true|
|    x|          false|
|    y|          false|
|    y|           true|
|    y|           true|
|    y|          false|
|    x|          false|
|    z|          false|
|    y|           true|
|    x|          false|
|    x|           true|
+-----+---------------+



In [25]:
# New dataframe with `group` plus `abool` renamed to `a_boolean_value`.
df_new_name = df.select("group", df["abool"].alias("a_boolean_value"))

In [26]:
# Show the first 3 rows of the renamed projection.
df_new_name.show(n=3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



#### 9. Use `.select` to create a new dataframe with the `group` column and the `n` column renamed to `a_numeric_value`. Show the first 6 rows of this dataframe.

In [27]:
# New dataframe with `group` plus `n` renamed to `a_numeric_value`.
df_num_val = df.select("group", df["n"].alias("a_numeric_value"))

In [28]:
# Show the first 6 rows of the renamed projection.
df_num_val.show(n=6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows

