In [1]:
from pyspark import SparkContext, SQLContext
from pyspark.ml.feature import StringIndexer

In [2]:
# Reuse the active SparkContext if one already exists so this cell can be
# re-run in the same kernel; a second bare SparkContext() call raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# SQLContext wraps the Spark context and provides the DataFrame reader
# (sqlContext.read) used to load the CSV files.
sqlContext = SQLContext(sc)

In [3]:
# Reader settings shared by both files:
#   header=True           -> the first line supplies the column names
#   mode="DROPMALFORMED"  -> rows that cannot be parsed are discarded
#   inferSchema=True      -> column types are detected from the data
csv_options = dict(header=True, mode="DROPMALFORMED", inferSchema=True)

train = sqlContext.read.csv("./data/train.csv", **csv_options)
test = sqlContext.read.csv("./data/test.csv", **csv_options)


We set inferSchema=True to tell sqlContext to automatically detect
the data type of each column in the data frame.
If inferSchema is not set to True, every column is read as a string.

In [4]:
# Shape of the training frame, printed as a (rows, columns) tuple.
print((
    train.count(),       # number of rows
    len(train.columns),  # number of columns
))

(891, 12)


In [5]:
# Print the column names, their inferred types and nullability as a tree.
train.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
# Number of rows in the training frame.
train.count()

891

In [7]:
# Fetch the first row of the training frame as a one-element list of Row objects.
train.take(1)

[Row(PassengerId=1, Survived=0, Pclass=3, Name=u'Braund, Mr. Owen Harris', Sex=u'male', Age=22.0, SibSp=1, Parch=0, Ticket=u'A/5 21171', Fare=7.25, Cabin=None, Embarked=u'S')]

In [8]:
# Render the first four rows as a text table (long strings are truncated).
train.show(n=4)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 4 rows



In [9]:
# Count the rows that have no null in any column, for train and test.
# how='any' is also the default, so both frames are filtered by the same rule.
train.dropna(how='any').count(), test.dropna(how='any').count()

(183, 87)

In [10]:
# Replace nulls with the sentinel value -1.
# The fill value is an integer, so only numeric columns are filled; string
# columns (e.g. Cabin, Embarked) keep their nulls, as the null-free row
# counts and the describe() output later in the notebook show.
# `train` and `test` are rebound here, so the unfilled frames are only
# recoverable by re-running the load cell.
train = train.na.fill(-1)
test = test.na.fill(-1)

In [11]:
# Repeat the null-free row count after the numeric fill. The totals stay well
# below the full row count because the string columns still contain nulls.
train.dropna().count(), test.dropna().count()

(202, 91)

In [12]:
# Summary statistics for every column of the training frame.
# NOTE(review): this runs after nulls were replaced with -1, so numeric
# statistics (for example the Age mean) include the sentinel values.
train.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               891|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null|23.600639730639728|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [13]:
# List the column names of the training frame.
train.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [14]:
# Preview the first four values of the Survived column.
train.select(train.Survived).show(4)

+--------+
|Survived|
+--------+
|       0|
|       1|
|       1|
|       1|
+--------+
only showing top 4 rows



In [15]:
# Show the distinct values of Embarked in the training frame.
# null appears as its own value because the earlier -1 fill left this string
# column unchanged.
train.select('Embarked').distinct().show()

+--------+
|Embarked|
+--------+
|       Q|
|    null|
|       C|
|       S|
+--------+



In [16]:
# Number of distinct Embarked values in train and in test, as a (train, test) tuple.
tuple(frame.select('Embarked').distinct().count() for frame in (train, test))

(4, 3)

In [17]:
# Distinct Embarked values that occur in train but not in test.
diff_cat_in_train_test = (
    train.select('Embarked').distinct()
    .subtract(test.select('Embarked').distinct())
)

In [18]:
# How many distinct Embarked values appear in train but are absent from test.
# NOTE(review): the frame was built from distinct values via subtract(), so the
# extra distinct() here is presumably redundant - confirm before removing.
diff_cat_in_train_test.distinct().count()# For distinct count

1

# Transforming categorical variables to labels

In [19]:
# StringIndexer is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
#
# Encode the Embarked strings as numeric label indices in a new column 'ebk'.
# The indexer is fitted on the training frame only; the fitted model
# (`labeller`) is what gets applied to the data afterwards.
# NOTE(review): Embarked still contains nulls in train at this point (the
# distinct-values output lists null). How StringIndexer treats nulls depends on
# the Spark version - TODO confirm, and fill them or set handleInvalid if the
# transformed frame fails when it is evaluated.
plan_indexer = StringIndexer(inputCol='Embarked', outputCol='ebk')
labeller = plan_indexer.fit(train)

In [20]:
# Apply the fitted indexer to both frames; each result gains the 'ebk' column.
# transform() is lazy, so nothing is computed until an action is run.
Train1, Test1 = (labeller.transform(frame) for frame in (train, test))

In [21]:
# Confirm that the indexed frame now carries the extra 'ebk' column.
Train1.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked',
 'ebk']

In [22]:
# Select the Sex column. No action (such as show()) is called, so the cell
# displays only the lazy DataFrame's repr, not any rows.
Train1.select("Sex")

DataFrame[Sex: string]