# First Data Check
---

In [8]:
# Create the Spark entry points (SparkContext and SparkSession).
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Stop a context left over from a previous run of this cell so a fresh one
# can be created. On a fresh kernel `sc` is not defined yet; that NameError
# is the only failure deliberately ignored here, so real errors still surface.
try:
    sc.stop()
except NameError:
    pass

sc = SparkContext()
spark = SparkSession(sparkContext=sc)

23/04/06 12:48:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/04/06 12:48:23 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Import Data

Data source: https://www.kaggle.com/c/titanic/data

In [9]:
# Load the Kaggle Titanic training set: the first line of the file holds the
# column names, and Spark infers each column's type from the data.
titanic = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/SparkData/kaggle-titanic-train.csv')
)

# Preview the first five rows.
titanic.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

## Data type

First, we want to check if string and numeric variables are imported as we expect.

In [10]:
# Print each column's inferred type so it can be compared with what we expect
# from the CSV (the file was read with inferSchema=True).
titanic.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



## Data summary

### Number of variables

In [11]:
# Number of variables = number of columns in the DataFrame.
len(titanic.columns)

12

### Number of observations

In [12]:
# Number of observations = number of rows in the DataFrame.
titanic.count()

891

### Summarise columns

In [13]:
def describe_columns(df):
    """Print the ``describe()`` summary of every column of ``df``, one table per column.

    Parameters
    ----------
    df : DataFrame
        The DataFrame whose columns are summarised. Each column is selected
        from ``df`` itself, so the function works for any DataFrame passed in
        rather than only for the notebook's global ``titanic``.
    """
    for column_name in df.columns:
        print('Column: ' + column_name)
        # Summarise the column of the DataFrame that was passed in.
        df.select(column_name).describe().show()

In [14]:
# Show the per-column summary tables for the Titanic data.
describe_columns(titanic)

Column: PassengerId
+-------+-----------------+
|summary|      PassengerId|
+-------+-----------------+
|  count|              891|
|   mean|            446.0|
| stddev|257.3538420152301|
|    min|                1|
|    max|              891|
+-------+-----------------+

Column: Survived
+-------+-------------------+
|summary|           Survived|
+-------+-------------------+
|  count|                891|
|   mean| 0.3838383838383838|
| stddev|0.48659245426485753|
|    min|                  0|
|    max|                  1|
+-------+-------------------+

Column: Pclass
+-------+------------------+
|summary|            Pclass|
+-------+------------------+
|  count|               891|
|   mean| 2.308641975308642|
| stddev|0.8360712409770491|
|    min|                 1|
|    max|                 3|
+-------+------------------+

Column: Name
+-------+--------------------+
|summary|                Name|
+-------+--------------------+
|  count|                 891|
|   mean|                

### Find columns with missing values

In [15]:
def find_missing_values_columns(df):
    """Print a missing-value report for every column of ``df`` that contains nulls.

    For each column whose non-null count is lower than the total row count,
    the total number of rows, the column's non-null count and the percentage
    of missing values are printed. Columns without missing values print
    nothing.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to inspect.

    Notes
    -----
    Spark's ``summary()`` reports numeric and string columns only; a column
    of any other type is absent from the result row and its lookup fails.
    """
    nrow = df.count()
    # A single summary("count") pass yields the non-null count of every
    # column in one row, instead of running a full describe() job per column.
    count_row = df.summary("count").first()
    for v in df.columns:
        # The summary values come back as strings.
        v_count = int(count_row[v])
        if v_count < nrow:
            missing_percentage = (1 - v_count/nrow) * 100
            print("\n".join([
                "Total observations: " + str(nrow),
                "Total observations of " + v + ": " + str(v_count),
                "Percentage of missing values: " + str(missing_percentage) + "%",
                "----------------------------",
            ]))

In [16]:
# Report which Titanic columns have missing values and how many.
find_missing_values_columns(titanic)

Total observations: 891
Total observations of Age: 714
Percentage of missing values: 19.865319865319865%
----------------------------
Total observations: 891
Total observations of Cabin: 204
Percentage of missing values: 77.1043771043771%
----------------------------
Total observations: 891
Total observations of Embarked: 889
Percentage of missing values: 0.22446689113355678%
----------------------------


In [18]:
# List the column names of the DataFrame.
titanic.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [22]:
# Bring the whole Ticket column back to the driver as a list of Row objects.
# NOTE(review): collect() returns every row of the column; for a quick look
# at the values, take(n) or show(n) would keep the output short.
titanic.select('Ticket').collect()

[Row(Ticket='A/5 21171'),
 Row(Ticket='PC 17599'),
 Row(Ticket='STON/O2. 3101282'),
 Row(Ticket='113803'),
 Row(Ticket='373450'),
 Row(Ticket='330877'),
 Row(Ticket='17463'),
 Row(Ticket='349909'),
 Row(Ticket='347742'),
 Row(Ticket='237736'),
 Row(Ticket='PP 9549'),
 Row(Ticket='113783'),
 Row(Ticket='A/5. 2151'),
 Row(Ticket='347082'),
 Row(Ticket='350406'),
 Row(Ticket='248706'),
 Row(Ticket='382652'),
 Row(Ticket='244373'),
 Row(Ticket='345763'),
 Row(Ticket='2649'),
 Row(Ticket='239865'),
 Row(Ticket='248698'),
 Row(Ticket='330923'),
 Row(Ticket='113788'),
 Row(Ticket='349909'),
 Row(Ticket='347077'),
 Row(Ticket='2631'),
 Row(Ticket='19950'),
 Row(Ticket='330959'),
 Row(Ticket='349216'),
 Row(Ticket='PC 17601'),
 Row(Ticket='PC 17569'),
 Row(Ticket='335677'),
 Row(Ticket='C.A. 24579'),
 Row(Ticket='PC 17604'),
 Row(Ticket='113789'),
 Row(Ticket='2677'),
 Row(Ticket='A./5. 2152'),
 Row(Ticket='345764'),
 Row(Ticket='2651'),
 Row(Ticket='7546'),
 Row(Ticket='11668'),
 Row(Ticket='3

In [25]:
# Fetch only the first Row of the Ticket column; first() avoids collecting
# the entire column to the driver just to read one element.
titanic.select('Ticket').first()

Row(Ticket='A/5 21171')