In [1]:
import findspark
from pyspark.sql import SparkSession, Row, DataFrame, functions as F
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
from pyspark.sql.functions import col, expr, lit, substring, concat,\
                                  concat_ws, when, coalesce
from functools import reduce

In [2]:
findspark.init()

In [3]:
spark = SparkSession.builder.master('local[*]').getOrCreate()

24/07/27 10:08:06 WARN Utils: Your hostname, Eduardos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.15.17 instead (on interface en0)
24/07/27 10:08:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/27 10:08:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df = spark.read.csv('dados/banklist.csv', sep=',', inferSchema=True,
                    header=True, encoding='ISO-8859-1')

In [5]:
df.count()

568

In [6]:
len(df.columns)

7

In [7]:
df.columns

['Bank Name',
 'City',
 'State',
 'Cert',
 'Acquiring Institution',
 'Closing Date',
 'Fund']

In [8]:
df.createOrReplaceTempView('banklist')

In [9]:
spark.sql('''
    SELECT `Bank Name`, City, `Closing Date`
    FROM banklist
''').show(4, truncate=False)

+------------------------+-------------+------------+
|Bank Name               |City         |Closing Date|
+------------------------+-------------+------------+
|Citizens Bank           |Sac City     |3-Nov-23    |
|Heartland Tri-State Bank|Elkhart      |28-Jul-23   |
|First Republic Bank     |San Francisco|1-May-23    |
|Signature Bank          |New York     |12-Mar-23   |
+------------------------+-------------+------------+
only showing top 4 rows



In [10]:
df.describe().show()

24/07/27 10:08:08 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+--------------------+-------+-----+------------------+---------------------+------------+------------------+
|summary|           Bank Name|   City|State|              Cert|Acquiring Institution|Closing Date|              Fund|
+-------+--------------------+-------+-----+------------------+---------------------+------------+------------------+
|  count|                 568|    568|  568|               568|                  568|         568|               568|
|   mean|                NULL|   NULL| NULL|31660.663732394365|                 NULL|        NULL|10041.323943661971|
| stddev|                NULL|   NULL| NULL| 16478.38085982673|                 NULL|        NULL|1111.4143367151717|
|    min|1st American Stat...|Acworth|   AL|                91|      1st United Bank|    1-Aug-08|              4645|
|    max|               ebank|Wyoming|   WY|             59017|  Your Community Bank|    9-Sep-11|             10545|
+-------+--------------------+-------+-----+------------

In [11]:
df.describe('City', 'State').show()

+-------+-------+-----+
|summary|   City|State|
+-------+-------+-----+
|  count|    568|  568|
|   mean|   NULL| NULL|
| stddev|   NULL| NULL|
|    min|Acworth|   AL|
|    max|Wyoming|   WY|
+-------+-------+-----+



In [12]:
df.dtypes

[('Bank Name', 'string'),
 ('City', 'string'),
 ('State', 'string'),
 ('Cert', 'int'),
 ('Acquiring Institution', 'string'),
 ('Closing Date', 'string'),
 ('Fund', 'int')]

In [13]:
df.schema

StructType([StructField('Bank Name', StringType(), True), StructField('City', StringType(), True), StructField('State', StringType(), True), StructField('Cert', IntegerType(), True), StructField('Acquiring Institution', StringType(), True), StructField('Closing Date', StringType(), True), StructField('Fund', IntegerType(), True)])

In [14]:
df.printSchema()

root
 |-- Bank Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Cert: integer (nullable = true)
 |-- Acquiring Institution: string (nullable = true)
 |-- Closing Date: string (nullable = true)
 |-- Fund: integer (nullable = true)



In [15]:
df.dropDuplicates()

DataFrame[Bank Name: string, City: string, State: string, Cert: int, Acquiring Institution: string, Closing Date: string, Fund: int]

In [16]:
df.select(*['Bank Name', 'City']).show(2)

+--------------------+--------+
|           Bank Name|    City|
+--------------------+--------+
|       Citizens Bank|Sac City|
|Heartland Tri-Sta...| Elkhart|
+--------------------+--------+
only showing top 2 rows



In [17]:
df.select(*(set(df.columns) - {'Fund'})).show(5)

+-----+---------------------+--------------------+-------------+-----+------------+
|State|Acquiring Institution|           Bank Name|         City| Cert|Closing Date|
+-----+---------------------+--------------------+-------------+-----+------------+
|   IA| Iowa Trust & Savi...|       Citizens Bank|     Sac City| 8758|    3-Nov-23|
|   KS| Dream First Bank,...|Heartland Tri-Sta...|      Elkhart|25851|   28-Jul-23|
|   CA| JPMorgan Chase Ba...| First Republic Bank|San Francisco|59017|    1-May-23|
|   NY|  Flagstar Bank, N.A.|      Signature Bank|     New York|57053|   12-Mar-23|
|   CA| FirstCitizens Ba...| Silicon Valley Bank|  Santa Clara|24735|   10-Mar-23|
+-----+---------------------+--------------------+-------------+-----+------------+
only showing top 5 rows



In [18]:
df2 = df.withColumnRenamed('Bank Name', 'bank_name')\
        .withColumnRenamed('Acquiring Institution', 'acquiring_institution')\
        .withColumnRenamed('Closing Date', 'closing_date')\
        .withColumnRenamed('City', 'city')\
        .withColumnRenamed('State', 'state')\
        .withColumnRenamed('Cert', 'cert')\
        .withColumnRenamed('Fund', 'fund')

In [19]:
df2.printSchema()

root
 |-- bank_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- cert: integer (nullable = true)
 |-- acquiring_institution: string (nullable = true)
 |-- closing_date: string (nullable = true)
 |-- fund: integer (nullable = true)



In [20]:
df2 = df.withColumn('city_state', concat(col('city'), lit(' - '), col('state')))

In [21]:
df2.show(2)

+--------------------+--------+-----+-----+---------------------+------------+-----+-------------+
|           Bank Name|    City|State| Cert|Acquiring Institution|Closing Date| Fund|   city_state|
+--------------------+--------+-----+-----+---------------------+------------+-----+-------------+
|       Citizens Bank|Sac City|   IA| 8758| Iowa Trust & Savi...|    3-Nov-23|10545|Sac City - IA|
|Heartland Tri-Sta...| Elkhart|   KS|25851| Dream First Bank,...|   28-Jul-23|10544| Elkhart - KS|
+--------------------+--------+-----+-----+---------------------+------------+-----+-------------+
only showing top 2 rows



In [22]:
df2 = df2.withColumn('country', lit('US'))

In [23]:
df2.show(2)

+--------------------+--------+-----+-----+---------------------+------------+-----+-------------+-------+
|           Bank Name|    City|State| Cert|Acquiring Institution|Closing Date| Fund|   city_state|country|
+--------------------+--------+-----+-----+---------------------+------------+-----+-------------+-------+
|       Citizens Bank|Sac City|   IA| 8758| Iowa Trust & Savi...|    3-Nov-23|10545|Sac City - IA|     US|
|Heartland Tri-Sta...| Elkhart|   KS|25851| Dream First Bank,...|   28-Jul-23|10544| Elkhart - KS|     US|
+--------------------+--------+-----+-----+---------------------+------------+-----+-------------+-------+
only showing top 2 rows



In [24]:
df2 = df2.drop('cert')

In [25]:
df2.show(2)

+--------------------+--------+-----+---------------------+------------+-----+-------------+-------+
|           Bank Name|    City|State|Acquiring Institution|Closing Date| Fund|   city_state|country|
+--------------------+--------+-----+---------------------+------------+-----+-------------+-------+
|       Citizens Bank|Sac City|   IA| Iowa Trust & Savi...|    3-Nov-23|10545|Sac City - IA|     US|
|Heartland Tri-Sta...| Elkhart|   KS| Dream First Bank,...|   28-Jul-23|10544| Elkhart - KS|     US|
+--------------------+--------+-----+---------------------+------------+-----+-------------+-------+
only showing top 2 rows



In [26]:
df2 = reduce(DataFrame.drop, ['city_state', 'country'], df2)

In [27]:
df2.show(2)

+--------------------+--------+-----+---------------------+------------+-----+
|           Bank Name|    City|State|Acquiring Institution|Closing Date| Fund|
+--------------------+--------+-----+---------------------+------------+-----+
|       Citizens Bank|Sac City|   IA| Iowa Trust & Savi...|    3-Nov-23|10545|
|Heartland Tri-Sta...| Elkhart|   KS| Dream First Bank,...|   28-Jul-23|10544|
+--------------------+--------+-----+---------------------+------------+-----+
only showing top 2 rows



In [28]:
df3 = df.where(df['state'] == 'NE')

In [29]:
df3.show(2)

+-------------------+-------+-----+-----+---------------------+------------+-----+
|          Bank Name|   City|State| Cert|Acquiring Institution|Closing Date| Fund|
+-------------------+-------+-----+-----+---------------------+------------+-----+
| Ericson State Bank|Ericson|   NE|18265| Farmers and Merch...|   14-Feb-20|10535|
|Mid City Bank, Inc.|  Omaha|   NE|19397|         Premier Bank|    4-Nov-11|10410|
+-------------------+-------+-----+-----+---------------------+------------+-----+
only showing top 2 rows



In [30]:
df4 = df.where(df['CERT'].between(1000, 2000))

In [31]:
df4.show(2)

+-------------------+----------+-----+----+---------------------+------------+-----+
|          Bank Name|      City|State|Cert|Acquiring Institution|Closing Date| Fund|
+-------------------+----------+-----+----+---------------------+------------+-----+
|Fayette County Bank|Saint Elmo|   IL|1802| United Fidelity B...|   26-May-17|10528|
|     Heartland Bank|   Leawood|   KS|1361|         Metcalf Bank|   20-Jul-12|10452|
+-------------------+----------+-----+----+---------------------+------------+-----+
only showing top 2 rows



In [32]:
df5 = df.where(df['state'].isin('NE', 'IL'))

In [33]:
df5.show(2)

+--------------------+-------+-----+-----+---------------------+------------+-----+
|           Bank Name|   City|State| Cert|Acquiring Institution|Closing Date| Fund|
+--------------------+-------+-----+-----+---------------------+------------+-----+
|  Ericson State Bank|Ericson|   NE|18265| Farmers and Merch...|   14-Feb-20|10535|
|Washington Federa...|Chicago|   IL|30570|   Royal Savings Bank|   15-Dec-17|10530|
+--------------------+-------+-----+-----+---------------------+------------+-----+
only showing top 2 rows



In [34]:
df6 = df.where((df['state'] == 'NE') & (df['city'] == 'Ericson'))
df6.show()

+------------------+-------+-----+-----+---------------------+------------+-----+
|         Bank Name|   City|State| Cert|Acquiring Institution|Closing Date| Fund|
+------------------+-------+-----+-----+---------------------+------------+-----+
|Ericson State Bank|Ericson|   NE|18265| Farmers and Merch...|   14-Feb-20|10535|
+------------------+-------+-----+-----+---------------------+------------+-----+

