Topics covered:
- **schema**
- **count**
- **columns**
- **describe**
- **summary**

In [0]:
# Load the Titanic CSV from DBFS: the first row supplies the column names and
# the column types are inferred from the data.
titanic_path = "dbfs:/FileStore/tables/titanic.csv"
df = spark.read.options(header=True, inferSchema=True).csv(titanic_path)
# Show the DataFrame in the notebook's table view (plain-text alternative: df.show()).
display(df)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

#### **1) schema**

     df.printSchema()

     df.schema
     print(df.schema)

     df.schema.fields

     # Returns dataframe column names and data types
     df.dtypes

In [0]:
# Print the schema in tree format, one entry per column.
df.printSchema()

In [0]:
# The schema as a StructType object; being the cell's last expression, it is shown as the cell result.
df.schema

In [0]:
# Print the StructType's text representation (all StructFields on a single line).
print(df.schema)

StructType([StructField('PassengerId', IntegerType(), True), StructField('Survived', IntegerType(), True), StructField('Pclass', IntegerType(), True), StructField('Name', StringType(), True), StructField('Sex', StringType(), True), StructField('Age', DoubleType(), True), StructField('SibSp', IntegerType(), True), StructField('Parch', IntegerType(), True), StructField('Ticket', StringType(), True), StructField('Fare', DoubleType(), True), StructField('Cabin', StringType(), True), StructField('Embarked', StringType(), True)])


In [0]:
# List of StructField objects, one per column: (name, data type, nullable flag).
df.schema.fields

Out[18]: [StructField('PassengerId', IntegerType(), True),
 StructField('Survived', IntegerType(), True),
 StructField('Pclass', IntegerType(), True),
 StructField('Name', StringType(), True),
 StructField('Sex', StringType(), True),
 StructField('Age', DoubleType(), True),
 StructField('SibSp', IntegerType(), True),
 StructField('Parch', IntegerType(), True),
 StructField('Ticket', StringType(), True),
 StructField('Fare', DoubleType(), True),
 StructField('Cabin', StringType(), True),
 StructField('Embarked', StringType(), True)]

In [0]:
# List of (column name, data type name) tuples, one per DataFrame column.
df.dtypes

Out[17]: [('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'double'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

#### **2) count**

     df.count()
     df.distinct().show()
     df.distinct().count()

In [0]:
# Alternatives for this cell:
#   df.count()            -> total number of rows
#   df.distinct().show()  -> display the de-duplicated rows
# Number of distinct rows in the DataFrame.
df.distinct().count()

Out[21]: 891

#### **3) List of the column names**

     # All three give the same output
     df.columns
     df.schema.names
     df.schema.fieldNames()

In [0]:
# Column names as a Python list of strings.
df.columns

In [0]:
# Column names read from the schema object instead of the DataFrame.
df.schema.names

In [0]:
# Column names via the schema's fieldNames() method; returns a list of strings.
df.schema.fieldNames()

Out[24]: ['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
# df.columns is a plain list, so it can be indexed: this is the name of the first column.
df.columns[0]

In [0]:
# Print each column name on its own line.
# The loop variable is deliberately not named `col`: notebook variables are
# global, so that name would shadow pyspark.sql.functions.col if it is imported.
for column_name in df.columns:
    print(column_name)

#### **4) Number of columns**

     len(df.columns)
     len(df.dtypes)

In [0]:
# Number of columns. df.columns and df.dtypes each hold one entry per column,
# so len(df.dtypes) gives the same value.
len(df.columns)

Out[26]: 12

#### **5) describe**

In [0]:
# Basic statistics (count, mean, stddev, min, max) for the Name column only.
# For a string column mean and stddev are null, and min/max follow string ordering.
# Calling df.describe() with no arguments covers all numeric and string columns.
df.describe("Name").show(truncate=False)

+-------+------------------------------------------------+
|summary|Name                                            |
+-------+------------------------------------------------+
|count  |891                                             |
|mean   |null                                            |
|stddev |null                                            |
|min    |"Andersson, Mr. August Edvard (""Wennerstrom"")"|
|max    |van Melkebeke, Mr. Philemon                     |
+-------+------------------------------------------------+



#### **6) summary**

In [0]:
# With no arguments, summary() reports its default set of statistics for every column:
# count, mean, stddev, min, approximate 25%/50%/75% percentiles, and max.
df.summary().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [0]:
# Restrict the summary to the row count plus the 33rd, 50th and 66th percentiles.
# Percentiles come back as null for columns where they cannot be computed.
requested_stats = ["count", "33%", "50%", "66%"]
df.summary(*requested_stats).show()

+-------+-----------+--------+------+----+----+----+-----+-----+--------+-------+-----+--------+
|summary|PassengerId|Survived|Pclass|Name| Sex| Age|SibSp|Parch|  Ticket|   Fare|Cabin|Embarked|
+-------+-----------+--------+------+----+----+----+-----+-----+--------+-------+-----+--------+
|  count|        891|     891|   891| 891| 891| 714|  891|  891|     891|    891|  204|     889|
|    33%|        295|       0|     2|null|null|23.0|    0|    0| 36928.0| 8.6542| null|    null|
|    50%|        446|       0|     3|null|null|28.0|    0|    0|236171.0|14.4542| null|    null|
|    66%|        589|       1|     3|null|null|34.0|    0|    0|345767.0|   26.0| null|    null|
+-------+-----------+--------+------+----+----+----+-----+-----+--------+-------+-----+--------+



In [0]:
# Per column: "count" is the number of non-null values, "count_distinct" the exact number of distinct values.
df.summary("count", "count_distinct").show()

+--------------+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|       summary|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+--------------+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|         count|        891|     891|   891| 891|891|714|  891|  891|   891| 891|  204|     889|
|count_distinct|        891|       2|     3| 891|  2| 88|    7|    7|   681| 248|  147|       3|
+--------------+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [0]:
# "approx_count_distinct" is an estimate of the distinct count: it can deviate from
# the exact figure and may even exceed the column's row count.
df.summary("count", "approx_count_distinct").show(truncate=False)

+---------------------+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|summary              |PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+---------------------+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|count                |891        |891     |891   |891 |891|714|891  |891  |891   |891 |204  |889     |
|approx_count_distinct|890        |2       |3     |950 |2  |86 |7    |7    |690   |236 |149  |3       |
+---------------------+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [0]:
# Summarise a single column by projecting it first.
# Name is a string column, so only the count is populated; the percentiles are null.
name_only = df.select("Name")
name_only.summary("count", "33%", "50%", "66%").show()

+-------+----+
|summary|Name|
+-------+----+
|  count| 891|
|    33%|null|
|    50%|null|
|    66%|null|
+-------+----+

