In [11]:
sc.applicationId

'local-1698598112549'

In [12]:

voter_df = spark.read.csv('./DallasCouncilVoters.csv.gz', header=True)
# Show the distinct VOTER_NAME entries
voter_df.select('VOTER_NAME').distinct().show(40, truncate=False)

# Filter voter_df where the VOTER_NAME is 1-20 characters in length
voter_df = voter_df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) < 20')

# Filter out voter_df where the VOTER_NAME contains an underscore
voter_df = voter_df.filter(~ F.col('VOTER_NAME').contains('_'))

# Show the distinct VOTER_NAME entries again
voter_df.select('VOTER_NAME').distinct().show(40, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|VOTER_NAME                                                                                                                                                                                                                                                                                                                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
# Add a new column called splits separated on whitespace
voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, '\s+'))

# Create a new column called first_name based on the first item in splits
voter_df = voter_df.withColumn('first_name', voter_df.splits.getItem(0))

# Get the last entry of the splits list and create a column called last_name
voter_df = voter_df.withColumn('last_name', voter_df.splits.getItem(F.size('splits') - 1))

# Drop the splits column
voter_df = voter_df.drop('splits')

# Show the voter_df DataFrame
voter_df.show()



+----------+-------------+-------------------+----------+---------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|
+----------+-------------+-------------------+----------+---------+
|02/08/2017|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|
|02/08/2017|Councilmember| Philip T. Kingston|    Philip| Kingston|
|02/08/2017|        Mayor|Michael S. Rawlings|   Michael| Rawlings|
|02/08/2017|Councilmember|       Adam Medrano|      Adam|  Medrano|
|02/08/2017|Councilmember|       Casey Thomas|     Casey|   Thomas|
|02/08/2017|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|
|02/08/2017|Councilmember|       Scott Griggs|     Scott|   Griggs|
|02/08/2017|Councilmember|   B. Adam  McGough|        B.|  McGough|
|02/08/2017|Councilmember|       Lee Kleinman|       Lee| Kleinman|
|02/08/2017|Councilmember|      Sandy Greyson|     Sandy|  Greyson|
|02/08/2017|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|
|02/08/2017|Councilmember| Philip T. Kingston|  

In [14]:
voter_df = voter_df\
    .withColumn(
        'random_val',
        F.when(voter_df.TITLE == 'Councilmember', F.rand())
    )

# Show some of the DataFrame rows, noting whether the when clause worked
voter_df.show()

+----------+-------------+-------------------+----------+---------+--------------------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|          random_val|
+----------+-------------+-------------------+----------+---------+--------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|  0.9690986256922961|
|02/08/2017|Councilmember| Philip T. Kingston|    Philip| Kingston| 0.07957600724517866|
|02/08/2017|        Mayor|Michael S. Rawlings|   Michael| Rawlings|                NULL|
|02/08/2017|Councilmember|       Adam Medrano|      Adam|  Medrano|   0.862825952912005|
|02/08/2017|Councilmember|       Casey Thomas|     Casey|   Thomas| 0.31965254454710423|
|02/08/2017|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|  0.7330886586134099|
|02/08/2017|Councilmember|       Scott Griggs|     Scott|   Griggs|  0.9508436682943732|
|02/08/2017|Councilmember|   B. Adam  McGough|        B.|  McGough|0.052998255468878264|
|02/08/2017|Councilme

In [15]:
# Add a column to voter_df for a voter based on their position
voter_df = voter_df\
    .withColumn(
        'random_val',
        F.when(voter_df.TITLE == 'Councilmember', F.rand())
        .when(voter_df.TITLE == 'Mayor', 2)
        .otherwise(0)
    )

# Show some of the DataFrame rows
voter_df.show(3, truncate=False)

# Use the .filter() clause with random_val
voter_df.filter(voter_df.random_val == 0).show(3, truncate=False)

+----------+-------------+-------------------+----------+---------+-------------------+
|DATE      |TITLE        |VOTER_NAME         |first_name|last_name|random_val         |
+----------+-------------+-------------------+----------+---------+-------------------+
|02/08/2017|Councilmember|Jennifer S. Gates  |Jennifer  |Gates    |0.924981669860554  |
|02/08/2017|Councilmember|Philip T. Kingston |Philip    |Kingston |0.14246533629399083|
|02/08/2017|Mayor        |Michael S. Rawlings|Michael   |Rawlings |2.0                |
+----------+-------------+-------------------+----------+---------+-------------------+
only showing top 3 rows

+----------+--------------------+-----------------+----------+---------+----------+
|DATE      |TITLE               |VOTER_NAME       |first_name|last_name|random_val|
+----------+--------------------+-----------------+----------+---------+----------+
|04/25/2018|Deputy Mayor Pro Tem|Adam Medrano     |Adam      |Medrano  |0.0       |
|04/25/2018|Mayor Pro T

In [16]:
voter_df.dtypes

[('DATE', 'string'),
 ('TITLE', 'string'),
 ('VOTER_NAME', 'string'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('random_val', 'double')]

In [17]:
voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, ' '))
voter_df.show(3, truncate=False)

+----------+-------------+-------------------+----------+---------+-------------------+-----------------------+
|DATE      |TITLE        |VOTER_NAME         |first_name|last_name|random_val         |splits                 |
+----------+-------------+-------------------+----------+---------+-------------------+-----------------------+
|02/08/2017|Councilmember|Jennifer S. Gates  |Jennifer  |Gates    |0.924981669860554  |[Jennifer, S., Gates]  |
|02/08/2017|Councilmember|Philip T. Kingston |Philip    |Kingston |0.14246533629399083|[Philip, T., Kingston] |
|02/08/2017|Mayor        |Michael S. Rawlings|Michael   |Rawlings |2.0                |[Michael, S., Rawlings]|
+----------+-------------+-------------------+----------+---------+-------------------+-----------------------+
only showing top 3 rows



In [18]:
def getFirstAndMiddle(names):
  # Return a space separated string of names
  return ' '.join(names)

# Define the method as a UDF
udfFirstAndMiddle = F.udf(getFirstAndMiddle, StringType())

# Create a new column using your UDF
voter_df = voter_df.withColumn('first_and_middle_name', udfFirstAndMiddle(voter_df.splits))

# Drop the unecessary columns then show the DataFrame
voter_df = voter_df.drop('first_name')
voter_df = voter_df.drop('splits')
voter_df.show()

+----------+-------------+-------------------+---------+-------------------+---------------------+
|      DATE|        TITLE|         VOTER_NAME|last_name|         random_val|first_and_middle_name|
+----------+-------------+-------------------+---------+-------------------+---------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|    Gates|  0.924981669860554|    Jennifer S. Gates|
|02/08/2017|Councilmember| Philip T. Kingston| Kingston|0.14246533629399083|   Philip T. Kingston|
|02/08/2017|        Mayor|Michael S. Rawlings| Rawlings|                2.0|  Michael S. Rawlings|
|02/08/2017|Councilmember|       Adam Medrano|  Medrano|0.19061674559226627|         Adam Medrano|
|02/08/2017|Councilmember|       Casey Thomas|   Thomas|0.06330632057892627|         Casey Thomas|
|02/08/2017|Councilmember|Carolyn King Arnold|   Arnold|   0.86755437491524|  Carolyn King Arnold|
|02/08/2017|Councilmember|       Scott Griggs|   Griggs| 0.7363428634166084|         Scott Griggs|
|02/08/201

In [19]:
df = spark.read.csv('./DallasCouncilVotes.csv.gz', header=True)

# Select all the unique council voters
voter_df_single = df.select(df["VOTER NAME"]).distinct()

# Count the rows in voter_df
print("\nThere are %d rows in the voter_df DataFrame.\n" % voter_df_single.count())

# Add a ROW_ID
voter_df_single = voter_df_single.withColumn('ROW_ID', F.monotonically_increasing_id())

# Show the rows with 10 highest IDs in the set
voter_df_single.orderBy(voter_df_single.ROW_ID.desc()).show(10)


There are 36 rows in the voter_df DataFrame.

+--------------------+------+
|          VOTER NAME|ROW_ID|
+--------------------+------+
|                NULL|    35|
|        Lee Kleinman|    34|
|  the  final  201...|    33|
|         Erik Wilson|    32|
|  the  final   20...|    31|
| Carolyn King Arnold|    30|
| Rickey D.  Callahan|    29|
|   the   final  2...|    28|
|    Monica R. Alonzo|    27|
|     Lee M. Kleinman|    26|
+--------------------+------+
only showing top 10 rows



In [20]:
# Print the number of partitions in each DataFrame

print("\nThere are %d partitions in the voter_df DataFrame.\n" % voter_df.rdd.getNumPartitions())
print("\nThere are %d partitions in the voter_df_single DataFrame.\n" % voter_df_single.rdd.getNumPartitions())

# Add a ROW_ID field to each DataFrame
voter_df = voter_df.withColumn('ROW_ID', F.monotonically_increasing_id())
voter_df_single = voter_df_single.withColumn('ROW_ID', F.monotonically_increasing_id())

# Show the top 10 IDs in each DataFrame 
voter_df.orderBy(voter_df.ROW_ID.desc()).show(2,truncate=False)
voter_df_single.orderBy(voter_df_single.ROW_ID.desc()).show(10)


There are 1 partitions in the voter_df DataFrame.


There are 1 partitions in the voter_df_single DataFrame.

+----------+-------------+--------------+---------+-------------------+---------------------+------+
|DATE      |TITLE        |VOTER_NAME    |last_name|random_val         |first_and_middle_name|ROW_ID|
+----------+-------------+--------------+---------+-------------------+---------------------+------+
|11/20/2018|Councilmember|Mark  Clayton |Clayton  |0.5241081050917347 |Mark  Clayton        |43911 |
|11/20/2018|Councilmember|Tennell Atkins|Atkins   |0.14371968397544677|Tennell Atkins       |43910 |
+----------+-------------+--------------+---------+-------------------+---------------------+------+
only showing top 2 rows

+--------------------+------+
|          VOTER NAME|ROW_ID|
+--------------------+------+
|                NULL|    35|
|        Lee Kleinman|    34|
|  the  final  201...|    33|
|         Erik Wilson|    32|
|  the  final   20...|    31|
| Carolyn King Arn

In [21]:
voter_df = voter_df.withColumn("DATE", F.to_date(voter_df["DATE"], "dd/MM/yyyy"))
voter_df_march = voter_df.filter(F.month(voter_df.DATE) == 3)
voter_df_april = voter_df.filter(F.month(voter_df.DATE) == 4)

In [22]:
voter_df_march.show(3, truncate=False)

+----------+-------------+-------------------+---------+-------------------+---------------------+------+
|DATE      |TITLE        |VOTER_NAME         |last_name|random_val         |first_and_middle_name|ROW_ID|
+----------+-------------+-------------------+---------+-------------------+---------------------+------+
|2018-03-10|Councilmember|Sandy  Greyson     |Greyson  |0.9315991259857322 |Sandy  Greyson       |272   |
|2018-03-10|Councilmember|Jennifer S.  Gates |Gates    |0.04711424363909489|Jennifer S.  Gates   |323   |
|2018-03-10|Councilmember|Philip T.  Kingston|Kingston |0.9582637445693514 |Philip T.  Kingston  |354   |
+----------+-------------+-------------------+---------+-------------------+---------------------+------+
only showing top 3 rows



In [23]:
# Determine the highest ROW_ID and save it in previous_max_ID
previous_max_ID = voter_df_march\
    .select('ROW_ID')\
    .rdd\
    .max()[0]


# Add a ROW_ID column to voter_df_april starting at the desired value
voter_df_april = voter_df_april\
    .withColumn(
        'ROW_ID',
        F.monotonically_increasing_id() + previous_max_ID
    )

# Show the ROW_ID from both DataFrames and compare
voter_df_march.select('ROW_ID').show(3)
voter_df_april.select('ROW_ID').show(3)

+------+
|ROW_ID|
+------+
|   272|
|   323|
|   354|
+------+
only showing top 3 rows

+------+
|ROW_ID|
+------+
| 41004|
| 41005|
| 41006|
+------+
only showing top 3 rows



In [24]:

from pyspark.sql.functions import to_timestamp, col, to_date, date_format

def convert_date_format(df, column, input_format, output_format, start_date=None, end_date=None):
    """
    Takes a PySpark DataFrame, a string column, input and output date formats
    and optional start and end date parameters to filter the data.
    Returns a new DataFrame with the converted date column and the filtered data.
    """
    # Convert column to timestamp column using input format
    df = df.withColumn(column+" to_timestamp", to_timestamp(col(column), input_format))
    df = df.withColumn(column+' to_date', to_date(col(column), input_format))
    df = df.withColumn(column+' output_format', date_format(col(column+' to_date'), output_format))

    # Check if start and end date parameters have been specified
    if start_date and end_date:
        # Filter data based on date range
        df = df.filter((col(column) >= start_date) & (col(column) <= end_date))
    
    # Create a new date column with the specified output format
    df = df.withColumn(column+"timestamp to_string", col(column).cast("timestamp").cast("string"))
    
    return df

# creating input dataframe
data = [("2022-01-05", "John"),
        ("2021-12-31", "Mary"),
        ("2022-02-14", "Peter"),
        ("2022-01-15", "Jane")]

schema = ["date", "name"]
df = spark.createDataFrame(data, schema)
convert_date_format(df, "date", "yyyy-MM-dd", "dd/MM/yyyy", "2022-01-01 00:00:00", "2022-01-31 23:59:59").show(3, truncate=False)


+----------+----+-------------------+------------+------------------+-----------------------+
|date      |name|date to_timestamp  |date to_date|date output_format|datetimestamp to_string|
+----------+----+-------------------+------------+------------------+-----------------------+
|2022-01-05|John|2022-01-05 00:00:00|2022-01-05  |05/01/2022        |2022-01-05 00:00:00    |
|2022-01-15|Jane|2022-01-15 00:00:00|2022-01-15  |15/01/2022        |2022-01-15 00:00:00    |
+----------+----+-------------------+------------+------------------+-----------------------+



In [25]:
# creating input dataframe
data = [("220105", "John"),
        ("211231", "Mary"),
        ("220214", "Peter"),
        ("220115", "Jane")]

schema = ["date", "name"]
df = spark.createDataFrame(data, schema)

# Convert date to timestamp
df = df.withColumn("date_ts", to_timestamp(col("date"), "yyMMdd"))
df = df.withColumn("subtract_21_months", F.add_months(F.current_timestamp(), -21))
df.show( truncate=False)
# Find records before 25 months
df_filtered = df.filter(df.date_ts <= df.subtract_21_months)
df_filtered.show(truncate=False)

+------+-----+-------------------+------------------+
|date  |name |date_ts            |subtract_21_months|
+------+-----+-------------------+------------------+
|220105|John |2022-01-05 00:00:00|2022-01-29        |
|211231|Mary |2021-12-31 00:00:00|2022-01-29        |
|220214|Peter|2022-02-14 00:00:00|2022-01-29        |
|220115|Jane |2022-01-15 00:00:00|2022-01-29        |
+------+-----+-------------------+------------------+

+------+----+-------------------+------------------+
|date  |name|date_ts            |subtract_21_months|
+------+----+-------------------+------------------+
|220105|John|2022-01-05 00:00:00|2022-01-29        |
|211231|Mary|2021-12-31 00:00:00|2022-01-29        |
|220115|Jane|2022-01-15 00:00:00|2022-01-29        |
+------+----+-------------------+------------------+



In [26]:
df = spark.createDataFrame(data, schema)

# Convert date to timestamp
df = df.withColumn("date_ts", F.unix_timestamp(F.col("date"), "yyMMdd"))
df = df.withColumn("subtract_21_months", F.unix_timestamp(F.add_months(F.current_timestamp(), -21)))
df.show( truncate=False)
# Find records before 21 months
df_filtered = df.filter(df.date_ts <= df.subtract_21_months)

# Show results
df_filtered.show(truncate=False)

+------+-----+----------+------------------+
|date  |name |date_ts   |subtract_21_months|
+------+-----+----------+------------------+
|220105|John |1641321000|1643394600        |
|211231|Mary |1640889000|1643394600        |
|220214|Peter|1644777000|1643394600        |
|220115|Jane |1642185000|1643394600        |
+------+-----+----------+------------------+

+------+----+----------+------------------+
|date  |name|date_ts   |subtract_21_months|
+------+----+----------+------------------+
|220105|John|1641321000|1643394600        |
|211231|Mary|1640889000|1643394600        |
|220115|Jane|1642185000|1643394600        |
+------+----+----------+------------------+

