<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">


# Dataframe Practice

*Authors: Christoph Rahmede (LDN)*

In [1]:
import pyspark as ps    # for the pyspark suite
from pyspark.sql import SQLContext

In [2]:
# Build (or reuse) a single local SparkSession. Using the builder's
# getOrCreate() keeps this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises an error.
spark = ps.sql.SparkSession.builder.master('local[*]').getOrCreate()
# Derive the context and the legacy SQLContext from that one session so that
# temp views registered through `df`/`spark` are visible to `sqlContext` too.
sc = spark.sparkContext
sqlContext = SQLContext(sc, sparkSession=spark)

## Load the titanic dataset

In [3]:
df = (
    spark.read
    # header: take the column names from the first line of the file.
    # mode=DROPMALFORMED: drop badly formed rows instead of failing the read.
    # inferSchema: let Spark guess column types (not always perfect, but it
    # works well in most cases as of 2.1+).
    .options(header=True, mode="DROPMALFORMED", inferSchema=True)
    .csv("data/titanic_clean.csv")
)

## Print the schema

In [4]:
# Print the column names, inferred types and nullability as a tree.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



## Create a temporary table view for SQL queries

In [5]:
# Register `df` under the name 'titanic' for SQL queries.
# createOrReplaceTempView (rather than createTempView) keeps this cell
# re-runnable: createTempView raises if the view name already exists.
df.createOrReplaceTempView('titanic')

## In all of the following questions, use both DataFrame queries and SQL queries

## Show the top of the dataframe.

In [6]:
# Display the first 20 rows, truncating long cell values (the defaults of show()).
df.show(n=20, truncate=True)

+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   53.1|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   8.05|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|51.8625|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1| 21.075|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|11.133

In [7]:
# Summary statistics (count, mean, stddev, ...) for the columns of `df`.
df.describe().show()

+-------+------------------+------------------+------------------+--------------------+------+-----------------+------------------+-------------------+------------------+--------+
|summary|       PassengerId|          Survived|            Pclass|                Name|   Sex|              Age|             SibSp|              Parch|              Fare|Embarked|
+-------+------------------+------------------+------------------+--------------------+------+-----------------+------------------+-------------------+------------------+--------+
|  count|               712|               712|               712|                 712|   712|              712|               712|                712|               712|     712|
|   mean|448.58988764044943|0.4044943820224719| 2.240168539325843|                null|  null|29.64209269662921|0.5140449438202247|0.43258426966292135| 34.56725140449432|    null|
| stddev| 258.6831910181214|0.4911389472541192|0.8368543166903446|                null|  null|14.492

In [8]:
# Same preview as df.show(), expressed in SQL. Query through `spark`, the
# session that produced `df` and therefore owns the 'titanic' temp view,
# instead of the legacy SQLContext entry point.
spark.sql('SELECT * FROM titanic').show()

+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   53.1|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   8.05|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|51.8625|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1| 21.075|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|11.133

## Determine the number of observations

In [9]:
# Number of rows (observations) in the DataFrame.
df.count()

712

In [10]:
# Row count in SQL, run on the session that owns the 'titanic' view
# rather than through the legacy SQLContext.
spark.sql('SELECT COUNT(*) AS count FROM titanic').show()

+-----+
|count|
+-----+
|  712|
+-----+



## Determine the correlation between `Fare` and `Age`

In [11]:
# Correlation coefficient between the two numeric columns, via the
# DataFrame's statistics accessor (df.corr is an alias of this call).
df.stat.corr('Fare', 'Age')

0.09314251789411518

In [12]:
# Correlation in SQL. The aggregate yields a single row, so the default
# show() is enough; run it on the session that owns the 'titanic' view.
spark.sql('SELECT CORR(Fare, Age) FROM titanic').show()

+-------------------+
|    corr(Fare, Age)|
+-------------------+
|0.09314251789411518|
+-------------------+



## Determine the minimum and maximum ages

In [13]:
# Minimum age as a DataFrame aggregation. This stays inside Spark instead of
# converting to an RDD and pulling every row through Python, and it yields
# None (rather than raising) when there are no non-null ages.
df.agg({'Age': 'min'}).first()[0]

0.42

In [14]:
# Maximum age as a DataFrame aggregation. This stays inside Spark instead of
# converting to an RDD and pulling every row through Python, and it yields
# None (rather than raising) when there are no non-null ages.
df.agg({'Age': 'max'}).first()[0]

80.0

In [15]:
# Minimum and maximum age in one SQL aggregate, run on the session that
# owns the 'titanic' view rather than through the legacy SQLContext.
spark.sql('SELECT MIN(Age), MAX(Age) FROM titanic').show()

+--------+--------+
|min(Age)|max(Age)|
+--------+--------+
|    0.42|    80.0|
+--------+--------+



## Determine the number of passengers with `Age` below 40

In [16]:
# Keep only passengers younger than 40, then count the remaining rows.
df.where(df['Age'] < 40).count()

550

In [17]:
# Count of passengers younger than 40 in SQL, run on the session that
# owns the 'titanic' view rather than through the legacy SQLContext.
spark.sql('SELECT COUNT(*) FROM titanic WHERE Age<40').show()

+--------+
|count(1)|
+--------+
|     550|
+--------+



## Determine the number of survivors and non-survivors in the under 40 age group. Sort by `Survived`.

In [18]:
# Survivors vs. non-survivors among passengers younger than 40,
# one row per value of Survived, ordered by Survived.
(
    df.where(df['Age'] < 40)
    .groupBy('Survived')
    .count()
    .orderBy('Survived')
    .show()
)

+--------+-----+
|Survived|count|
+--------+-----+
|       0|  322|
|       1|  228|
+--------+-----+



In [19]:
# Survivors vs. non-survivors among passengers younger than 40, in SQL.
# Run on `spark`, the session that owns the 'titanic' view, rather than
# through the legacy SQLContext.
spark.sql('''
SELECT Survived, COUNT(*) FROM titanic 
WHERE Age<40 
GROUP BY Survived 
ORDER BY Survived
''').show()

+--------+--------+
|Survived|count(1)|
+--------+--------+
|       0|     322|
|       1|     228|
+--------+--------+

