In [1]:
%load_ext autoreload
%autoreload 2

![](../images/chap05_0.jpg)

In [2]:
from modules.my_pyspark import *
from modules.my_drawer import MyDrawer
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
drawer = MyDrawer()
spark = MyPySpark(session=True, sql=True)

#### Task 1

In [4]:
file_path = r'data/DallasCouncilVoters.csv'

In [5]:
df = spark.readFile(file_path)

#### Task 2

_Số dòng của dữ liệu_

In [6]:
df.count()

44625

_In schema_

In [7]:
df.printSchema()

root
 |-- DATE: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- VOTER_NAME: string (nullable = true)



_Hiển thị 5 dòng đầu_

In [8]:
df.show(5)

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|
|02/08/2017|Councilmember| Philip T. Kingston|
|02/08/2017|        Mayor|Michael S. Rawlings|
|02/08/2017|Councilmember|       Adam Medrano|
|02/08/2017|Councilmember|       Casey Thomas|
+----------+-------------+-------------------+
only showing top 5 rows



#### Task 3

_Kiểm tra dữ liệu có NaN hay không_

In [9]:
# One aggregate per column: the number of rows for which isnan(<column>) holds.
# `when` is given the column *name* as a plain string value, which PySpark
# wraps as a literal, so `count` sees a non-null value exactly on matching rows.
# NOTE(review): every column is a string according to the schema printed
# earlier, so this only flags values Spark interprets as NaN - confirm that
# this is the intended check.
mask = [
    count(when(isnan(column_name), column_name)).alias(column_name)
    for column_name in df.columns
]

In [10]:
mask

[Column<'count(CASE WHEN isnan(DATE) THEN DATE END) AS `DATE`'>,
 Column<'count(CASE WHEN isnan(TITLE) THEN TITLE END) AS `TITLE`'>,
 Column<'count(CASE WHEN isnan(VOTER_NAME) THEN VOTER_NAME END) AS `VOTER_NAME`'>]

In [11]:
df.select(mask).show()

+----+-----+----------+
|DATE|TITLE|VOTER_NAME|
+----+-----+----------+
|   0|    0|         0|
+----+-----+----------+



> * Không có dữ liệu NaN

_Kiểm tra dữ liệu có null hay không_

In [12]:
# One aggregate per column: the number of rows where that column is null.
# The value handed to `when` is the column *name* (a plain string), which
# PySpark wraps as a literal; `count` therefore tallies the null rows instead
# of skipping them as it would if the null column value itself were returned.
mask1 = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]

In [13]:
mask1

[Column<'count(CASE WHEN (DATE IS NULL) THEN DATE END) AS `DATE`'>,
 Column<'count(CASE WHEN (TITLE IS NULL) THEN TITLE END) AS `TITLE`'>,
 Column<'count(CASE WHEN (VOTER_NAME IS NULL) THEN VOTER_NAME END) AS `VOTER_NAME`'>]

In [14]:
df.select(mask1).show()

+----+-----+----------+
|DATE|TITLE|VOTER_NAME|
+----+-----+----------+
|   0|  195|       503|
+----+-----+----------+



* Có dữ liệu null: cột TITLE có 195 dòng, cột VOTER_NAME có 503 dòng

In [15]:
# Discard every row whose VOTER_NAME is null.
df = df.na.drop(subset='VOTER_NAME')

In [16]:
df.select(mask1).show()

+----+-----+----------+
|DATE|TITLE|VOTER_NAME|
+----+-----+----------+
|   0|    0|         0|
+----+-----+----------+



#### Task 4

In [17]:
distinct_rows = df.distinct().count()

In [18]:
distinct_rows

1273

In [19]:
df.count()

44122

In [20]:
# Keep a single copy of each fully identical row (all columns compared).
df = df.distinct()

In [21]:
df.count()

1273

#### Task 5

In [22]:
df.select('VOTER_NAME').distinct().show(10)

+--------------------+
|          VOTER_NAME|
+--------------------+
|      Tennell Atkins|
|  the  final   20...|
|        Scott Griggs|
|       Scott  Griggs|
|       Sandy Greyson|
| Michael S. Rawlings|
| the final 2018 A...|
|        Kevin Felder|
|        Adam Medrano|
|       Casey  Thomas|
+--------------------+
only showing top 10 rows



#### Task 6

In [23]:
# Keep only plausible names: non-empty and shorter than 20 characters.
# The original predicate repeated the upper bound twice, so the lower bound
# was never checked and empty strings slipped through.
df = df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) < 20')

In [24]:
df.show(5)

+----------+--------------------+------------------+
|      DATE|               TITLE|        VOTER_NAME|
+----------+--------------------+------------------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|
|02/14/2018|       Councilmember|   Lee M. Kleinman|
|04/25/2018|       Councilmember|    Tennell Atkins|
|08/29/2018|       Councilmember|      Kevin Felder|
|10/18/2017|       Councilmember|Jennifer S.  Gates|
+----------+--------------------+------------------+
only showing top 5 rows



#### Task 7

In [25]:
# Remove rows whose VOTER_NAME contains an underscore character.
df = df.where(~df['VOTER_NAME'].contains('_'))

In [26]:
df.select('VOTER_NAME').distinct().show(10, truncate=False)

+-------------------+
|VOTER_NAME         |
+-------------------+
|Tennell Atkins     |
|Scott Griggs       |
|Scott  Griggs      |
|Sandy Greyson      |
|Michael S. Rawlings|
|Kevin Felder       |
|Adam Medrano       |
|Casey  Thomas      |
|Mark  Clayton      |
|Casey Thomas       |
+-------------------+
only showing top 10 rows



#### Task 8

In [27]:
# Split the name on runs of whitespace so repeated spaces between parts do not
# produce empty elements. The pattern is a raw string: '\s' is not a valid
# Python escape sequence and triggers a warning in a regular string literal.
df = df.withColumn('splits', split('VOTER_NAME', r'\s+'))

In [28]:
df.show(5)

+----------+--------------------+------------------+--------------------+
|      DATE|               TITLE|        VOTER_NAME|              splits|
+----------+--------------------+------------------+--------------------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|     [Adam, Medrano]|
|02/14/2018|       Councilmember|   Lee M. Kleinman| [Lee, M., Kleinman]|
|04/25/2018|       Councilmember|    Tennell Atkins|   [Tennell, Atkins]|
|08/29/2018|       Councilmember|      Kevin Felder|     [Kevin, Felder]|
|10/18/2017|       Councilmember|Jennifer S.  Gates|[Jennifer, S., Ga...|
+----------+--------------------+------------------+--------------------+
only showing top 5 rows



#### Task 9

In [29]:
# The first element of the `splits` array is the first name.
df = df.withColumn('first_name', col('splits')[0])

In [30]:
df.show(5)

+----------+--------------------+------------------+--------------------+----------+
|      DATE|               TITLE|        VOTER_NAME|              splits|first_name|
+----------+--------------------+------------------+--------------------+----------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|     [Adam, Medrano]|      Adam|
|02/14/2018|       Councilmember|   Lee M. Kleinman| [Lee, M., Kleinman]|       Lee|
|04/25/2018|       Councilmember|    Tennell Atkins|   [Tennell, Atkins]|   Tennell|
|08/29/2018|       Councilmember|      Kevin Felder|     [Kevin, Felder]|     Kevin|
|10/18/2017|       Councilmember|Jennifer S.  Gates|[Jennifer, S., Ga...|  Jennifer|
+----------+--------------------+------------------+--------------------+----------+
only showing top 5 rows



#### Task 10

In [31]:
# The last element of the `splits` array (index size - 1) is the last name.
df = df.withColumn('last_name', col('splits')[size(col('splits')) - 1])

In [32]:
df.show(5)

+----------+--------------------+------------------+--------------------+----------+---------+
|      DATE|               TITLE|        VOTER_NAME|              splits|first_name|last_name|
+----------+--------------------+------------------+--------------------+----------+---------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|     [Adam, Medrano]|      Adam|  Medrano|
|02/14/2018|       Councilmember|   Lee M. Kleinman| [Lee, M., Kleinman]|       Lee| Kleinman|
|04/25/2018|       Councilmember|    Tennell Atkins|   [Tennell, Atkins]|   Tennell|   Atkins|
|08/29/2018|       Councilmember|      Kevin Felder|     [Kevin, Felder]|     Kevin|   Felder|
|10/18/2017|       Councilmember|Jennifer S.  Gates|[Jennifer, S., Ga...|  Jennifer|    Gates|
+----------+--------------------+------------------+--------------------+----------+---------+
only showing top 5 rows



#### Task 11

In [33]:
# random_val by title: Councilmember -> random draw from rand(), Mayor -> 2,
# any other title -> 0.
# rand() is seeded so that re-running the notebook does not silently change
# the values shown in later cells (the result can still vary if the data is
# partitioned differently).
df = df.withColumn(
    'random_val',
    when(df['TITLE'] == 'Councilmember', rand(seed=42))
    .when(df['TITLE'] == 'Mayor', 2)
    .otherwise(0),
)

In [34]:
df.show()

+----------+--------------------+-------------------+--------------------+----------+---------+-------------------+
|      DATE|               TITLE|         VOTER_NAME|              splits|first_name|last_name|         random_val|
+----------+--------------------+-------------------+--------------------+----------+---------+-------------------+
|04/11/2018|Deputy Mayor Pro Tem|       Adam Medrano|     [Adam, Medrano]|      Adam|  Medrano|                0.0|
|02/14/2018|       Councilmember|    Lee M. Kleinman| [Lee, M., Kleinman]|       Lee| Kleinman| 0.7287254700624713|
|04/25/2018|       Councilmember|     Tennell Atkins|   [Tennell, Atkins]|   Tennell|   Atkins| 0.8107666275802718|
|08/29/2018|       Councilmember|       Kevin Felder|     [Kevin, Felder]|     Kevin|   Felder|  0.581760352824348|
|10/18/2017|       Councilmember| Jennifer S.  Gates|[Jennifer, S., Ga...|  Jennifer|    Gates|0.24122430255090432|
|12/13/2017|       Councilmember|     Sandy  Greyson|    [Sandy, Greyson

#### Task 12

In [35]:
df.filter(df['random_val'] == 0).show(5)

+----------+--------------------+-----------------+--------------------+----------+---------+----------+
|      DATE|               TITLE|       VOTER_NAME|              splits|first_name|last_name|random_val|
+----------+--------------------+-----------------+--------------------+----------+---------+----------+
|04/11/2018|Deputy Mayor Pro Tem|     Adam Medrano|     [Adam, Medrano]|      Adam|  Medrano|       0.0|
|04/12/2017|       Mayor Pro Tem| Monica R. Alonzo|[Monica, R., Alonzo]|    Monica|   Alonzo|       0.0|
|06/28/2017|Deputy Mayor Pro Tem|     Adam Medrano|     [Adam, Medrano]|      Adam|  Medrano|       0.0|
|01/03/2018|Deputy Mayor Pro Tem|     Adam Medrano|     [Adam, Medrano]|      Adam|  Medrano|       0.0|
|01/17/2018|       Mayor Pro Tem|Dwaine R. Caraway|[Dwaine, R., Cara...|    Dwaine|  Caraway|       0.0|
+----------+--------------------+-----------------+--------------------+----------+---------+----------+
only showing top 5 rows



#### Task 13

In [36]:
def getFirstAndMiddle(name):
    """Join every element of `name` except the last one with single spaces.

    Args:
        name: Sequence of name parts (for example the `splits` array), or None.

    Returns:
        The leading parts joined by a space, an empty string when `name` has
        at most one element, or None when `name` is None.
    """
    if name is None:
        # A null array would otherwise raise TypeError on slicing and fail the
        # whole job when this function runs as a UDF.
        return None
    return ' '.join(name[:-1])

In [37]:
udfFirstAndMiddle = udf(getFirstAndMiddle, StringType())

#### Task 14

In [59]:
df = df.withColumn('first_and_middle_name', udfFirstAndMiddle(df['splits']))

In [60]:
df.show(5)

+----------+--------------------+------------------+--------------------+----------+---------+-------------------+---------------------+
|      DATE|               TITLE|        VOTER_NAME|              splits|first_name|last_name|         random_val|first_and_middle_name|
+----------+--------------------+------------------+--------------------+----------+---------+-------------------+---------------------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|     [Adam, Medrano]|      Adam|  Medrano|                0.0|                 Adam|
|02/14/2018|       Councilmember|   Lee M. Kleinman| [Lee, M., Kleinman]|       Lee| Kleinman| 0.3902904485654395|               Lee M.|
|04/25/2018|       Councilmember|    Tennell Atkins|   [Tennell, Atkins]|   Tennell|   Atkins| 0.8023088428290783|              Tennell|
|08/29/2018|       Councilmember|      Kevin Felder|     [Kevin, Felder]|     Kevin|   Felder| 0.5195703231373053|                Kevin|
|10/18/2017|       Councilmember|Jennifer

#### Task 15

In [61]:
df = df.drop('first_name', 'splits')

In [62]:
df.show(5)

+----------+--------------------+------------------+---------+-------------------+---------------------+
|      DATE|               TITLE|        VOTER_NAME|last_name|         random_val|first_and_middle_name|
+----------+--------------------+------------------+---------+-------------------+---------------------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|  Medrano|                0.0|                 Adam|
|02/14/2018|       Councilmember|   Lee M. Kleinman| Kleinman| 0.3902904485654395|               Lee M.|
|04/25/2018|       Councilmember|    Tennell Atkins|   Atkins| 0.8023088428290783|              Tennell|
|08/29/2018|       Councilmember|      Kevin Felder|   Felder| 0.5195703231373053|                Kevin|
|10/18/2017|       Councilmember|Jennifer S.  Gates|    Gates|0.23656514096796266|          Jennifer S.|
+----------+--------------------+------------------+---------+-------------------+---------------------+
only showing top 5 rows



#### Task 16

In [63]:
# Attach an id to every row. The ids are not consecutive: the output of the
# next cell jumps from 1 to 8589934592, so ROW_ID must not be treated as a
# dense 0..n-1 row index.
df = df.withColumn('ROW_ID', monotonically_increasing_id())

In [64]:
df.show(5)

+----------+--------------------+------------------+---------+-------------------+---------------------+----------+
|      DATE|               TITLE|        VOTER_NAME|last_name|         random_val|first_and_middle_name|    ROW_ID|
+----------+--------------------+------------------+---------+-------------------+---------------------+----------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|  Medrano|                0.0|                 Adam|         0|
|02/14/2018|       Councilmember|   Lee M. Kleinman| Kleinman| 0.3902904485654395|               Lee M.|         1|
|04/25/2018|       Councilmember|    Tennell Atkins|   Atkins| 0.8023088428290783|              Tennell|8589934592|
|08/29/2018|       Councilmember|      Kevin Felder|   Felder| 0.5195703231373053|                Kevin|8589934593|
|10/18/2017|       Councilmember|Jennifer S.  Gates|    Gates|0.23656514096796266|          Jennifer S.|8589934594|
+----------+--------------------+------------------+---------+----------

In [65]:
df = df.orderBy(df['ROW_ID'].asc())

In [66]:
df.show(10)

+----------+--------------------+-------------------+---------+-------------------+---------------------+-----------+
|      DATE|               TITLE|         VOTER_NAME|last_name|         random_val|first_and_middle_name|     ROW_ID|
+----------+--------------------+-------------------+---------+-------------------+---------------------+-----------+
|04/11/2018|Deputy Mayor Pro Tem|       Adam Medrano|  Medrano|                0.0|                 Adam|          0|
|02/14/2018|       Councilmember|    Lee M. Kleinman| Kleinman| 0.3902904485654395|               Lee M.|          1|
|04/25/2018|       Councilmember|     Tennell Atkins|   Atkins| 0.8023088428290783|              Tennell| 8589934592|
|08/29/2018|       Councilmember|       Kevin Felder|   Felder| 0.5195703231373053|                Kevin| 8589934593|
|10/18/2017|       Councilmember| Jennifer S.  Gates|    Gates|0.23656514096796266|          Jennifer S.| 8589934594|
|12/13/2017|       Councilmember|     Sandy  Greyson|  G