In [1]:
#!pip install pyspark
#!pip install findspark

In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

KeyboardInterrupt: 

In [None]:
df = spark.sql('''select 'Sucesso total, estamos online!' as hello''')
df.show()

+--------------------+
|               hello|
+--------------------+
|Sucesso total, es...|
+--------------------+



In [None]:
from pyspark.sql import Row, DataFrame
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
from pyspark.sql.functions import col, expr, lit, substring, concat, concat_ws, when, coalesce
from pyspark.sql import functions as F
from functools import reduce

### Data Manipulation using Spark

In [None]:
df = spark.read.csv('banklist.csv', sep= ',', inferSchema = True, header = True)

print('df.count  :', df.count())
print('df.col ct :', len(df.columns))
print('df.columns  :', df.columns)

df.count  : 553
df.col ct : 7
df.columns  : ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date', 'Updated Date']


In [None]:
df.createOrReplaceTempView("banklist")

df_check = spark.sql('''select `Bank Name`, City, `Closing Date` from banklist''')
df_check.show(4, truncate=False)

+------------------------------------------------------+------------------+------------+
|Bank Name                                             |City              |Closing Date|
+------------------------------------------------------+------------------+------------+
|Fayette County Bank                                   |Saint Elmo        |26-May-17   |
|Guaranty Bank, (d/b/a BestBank in Georgia & Michigan) |Milwaukee         |5-May-17    |
|First NBC Bank                                        |New Orleans       |28-Apr-17   |
|Proficio Bank                                         |Cottonwood Heights|3-Mar-17    |
+------------------------------------------------------+------------------+------------+
only showing top 4 rows



# DataFrame Basic Operations

In [None]:
df.describe().show()

+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+
|summary|           Bank Name|   City|  ST|             CERT|Acquiring Institution|Closing Date|Updated Date|
+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+
|  count|                 553|    553| 553|              553|                  553|         553|         553|
|   mean|                NULL|   NULL|NULL|31729.65280289331|                 NULL|        NULL|        NULL|
| stddev|                NULL|   NULL|NULL|16420.59489355429|                 NULL|        NULL|        NULL|
|    min|1st American Stat...|Acworth|  AL|               91|      1st United Bank|    1-Aug-08|    1-Aug-13|
|    max|               ebank|Wyoming|  WY|            58701|  Your Community Bank|    9-Sep-11|    9-Sep-12|
+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+



In [None]:
df.describe('City', 'ST').show()

+-------+-------+----+
|summary|   City|  ST|
+-------+-------+----+
|  count|    553| 553|
|   mean|   NULL|NULL|
| stddev|   NULL|NULL|
|    min|Acworth|  AL|
|    max|Wyoming|  WY|
+-------+-------+----+



### Count, Columns and Schema

In [None]:
print('Total de linhas :', df.count())
print('Total de colunas :', len(df.columns))
print('Colunas :', df.columns)
print('Tipo de dados :', df.dtypes)
print('Schema :', df.schema)


Total de linhas : 553
Total de colunas : 7
Colunas : ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date', 'Updated Date']
Tipo de dados : [('Bank Name', 'string'), ('City', 'string'), ('ST', 'string'), ('CERT', 'int'), ('Acquiring Institution', 'string'), ('Closing Date', 'string'), ('Updated Date', 'string')]
Schema : StructType([StructField('Bank Name', StringType(), True), StructField('City', StringType(), True), StructField('ST', StringType(), True), StructField('CERT', IntegerType(), True), StructField('Acquiring Institution', StringType(), True), StructField('Closing Date', StringType(), True), StructField('Updated Date', StringType(), True)])


In [None]:
df.printSchema

# Remove Duplicates

In [None]:
df = df.dropDuplicates()

print('Total de linhas :', df.count())
print('Total de colunas :', df.columns)


Total de linhas : 553
Total de colunas : ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date', 'Updated Date']


# Select Specific Columns

In [None]:
df2 = df.select(*['Bank Name', 'City'])
df2.show(2)

+-------------+----------+
|    Bank Name|      City|
+-------------+----------+
|       InBank|Oak Forest|
|Bank of Alamo|     Alamo|
+-------------+----------+
only showing top 2 rows



# Select Multiple Columns

In [None]:
col_1 = list(set(df.columns) - {'CERT', 'ST'})
df2 = df.select(*col_1)
df2.show()

+------------+---------------+------------+--------------------+---------------------+
|Closing Date|           City|Updated Date|           Bank Name|Acquiring Institution|
+------------+---------------+------------+--------------------+---------------------+
|    4-Sep-09|     Oak Forest|   17-Oct-15|              InBank| MB Financial Bank...|
|    8-Nov-02|          Alamo|   18-Mar-05|       Bank of Alamo|          No Acquirer|
|    2-Aug-13|     Fort Myers|    9-Feb-17|First Community B...|              C1 Bank|
|   24-Oct-14|        Chicago|    6-Jan-16|The National Repu...|  State Bank of Texas|
|   26-Oct-12|         Berwyn|   24-Jan-13|           NOVA Bank|          No Acquirer|
|   20-Jul-12|      Woodstock|    6-Jun-16|First Cherokee St...| Community & South...|
|    1-Mar-02|     Boca Raton|    9-Apr-08|Net 1st National ...|       Bank Leumi USA|
|    8-Jun-12|     Whiteville|   21-Mar-14|       Waccamaw Bank| First Community Bank|
|   16-Jul-10|      Clewiston|   11-Jul-16|

# Rename Columns

In [None]:
df2 = df \
  .withColumnRenamed('Bank Name', 'bank_name') \
  .withColumnRenamed('Acquiring Institution', 'acq_institution') \
  .withColumnRenamed('Closing Date', 'closing_date') \
  .withColumnRenamed('ST', 'state') \
  .withColumnRenamed('CERT', 'cert') #\

df2.show(2)

+-------------+----------+-----+-----+--------------------+------------+------------+
|    bank_name|      City|state| cert|     acq_institution|closing_date|Updated Date|
+-------------+----------+-----+-----+--------------------+------------+------------+
|       InBank|Oak Forest|   IL|20203|MB Financial Bank...|    4-Sep-09|   17-Oct-15|
|Bank of Alamo|     Alamo|   TN| 9961|         No Acquirer|    8-Nov-02|   18-Mar-05|
+-------------+----------+-----+-----+--------------------+------------+------------+
only showing top 2 rows



# Add Columns

In [None]:
df2 = df.withColumn('state', col('ST'))
df2.show(2)

+-------------+----------+---+-----+---------------------+------------+------------+-----+
|    Bank Name|      City| ST| CERT|Acquiring Institution|Closing Date|Updated Date|state|
+-------------+----------+---+-----+---------------------+------------+------------+-----+
|       InBank|Oak Forest| IL|20203| MB Financial Bank...|    4-Sep-09|   17-Oct-15|   IL|
|Bank of Alamo|     Alamo| TN| 9961|          No Acquirer|    8-Nov-02|   18-Mar-05|   TN|
+-------------+----------+---+-----+---------------------+------------+------------+-----+
only showing top 2 rows



# Add constant column

In [None]:
df2 = df.withColumn('country', lit('US'))
df2.show(2)

+-------------+----------+---+-----+---------------------+------------+------------+-------+
|    Bank Name|      City| ST| CERT|Acquiring Institution|Closing Date|Updated Date|country|
+-------------+----------+---+-----+---------------------+------------+------------+-------+
|       InBank|Oak Forest| IL|20203| MB Financial Bank...|    4-Sep-09|   17-Oct-15|     US|
|Bank of Alamo|     Alamo| TN| 9961|          No Acquirer|    8-Nov-02|   18-Mar-05|     US|
+-------------+----------+---+-----+---------------------+------------+------------+-------+
only showing top 2 rows



# Drop columns

In [None]:
df2 = df.drop('CERT')
df2.show(2)

+-------------+----------+---+---------------------+------------+------------+
|    Bank Name|      City| ST|Acquiring Institution|Closing Date|Updated Date|
+-------------+----------+---+---------------------+------------+------------+
|       InBank|Oak Forest| IL| MB Financial Bank...|    4-Sep-09|   17-Oct-15|
|Bank of Alamo|     Alamo| TN|          No Acquirer|    8-Nov-02|   18-Mar-05|
+-------------+----------+---+---------------------+------------+------------+
only showing top 2 rows



# Drop multiple columns

In [None]:
df2 = df.drop(*'CERT', 'Updated Date', 'ST')
df2.show(2)

+-------------+----------+-----+---------------------+------------+
|    Bank Name|      City| CERT|Acquiring Institution|Closing Date|
+-------------+----------+-----+---------------------+------------+
|       InBank|Oak Forest|20203| MB Financial Bank...|    4-Sep-09|
|Bank of Alamo|     Alamo| 9961|          No Acquirer|    8-Nov-02|
+-------------+----------+-----+---------------------+------------+
only showing top 2 rows



In [None]:
df2 = reduce(DataFrame.drop, ['CERT', 'ST'], df)
df2.show(2)

+-------------+----------+---------------------+------------+------------+
|    Bank Name|      City|Acquiring Institution|Closing Date|Updated Date|
+-------------+----------+---------------------+------------+------------+
|       InBank|Oak Forest| MB Financial Bank...|    4-Sep-09|   17-Oct-15|
|Bank of Alamo|     Alamo|          No Acquirer|    8-Nov-02|   18-Mar-05|
+-------------+----------+---------------------+------------+------------+
only showing top 2 rows



# Filter data

In [None]:
# Equal to values
df2 = df.where(df['ST']=='NE')

# Between values
df3 = df.where(df['CERT'].between('1000', '2000'))

# Is inside multiple values
df4 = df.where(df['ST'].isin('NE', 'IL'))

print('Total de linhas  :', df.count())
print('Total de linhas df2  :', df2.count())
print('Total de linhas df3  :', df3.count())
print('Total de linhas df4:', df4.count())


Total de linhas  : 553
Total de linhas df2  : 3
Total de linhas df3  : 9
Total de linhas df4: 71


# Filter data using logical operators

In [None]:
df2 = df.where((df['ST'] == 'NE') & (df['City'] == 'Ericson'))
df2.show(3)

+---------+----+---+----+---------------------+------------+------------+
|Bank Name|City| ST|CERT|Acquiring Institution|Closing Date|Updated Date|
+---------+----+---+----+---------------------+------------+------------+
+---------+----+---+----+---------------------+------------+------------+



# Replace values in DataFrame

In [None]:
# Pre replace
df.show(2)

# Post replace
print('Replace 7 in the above dataframe with 17 at all instances')
df.na.replace(7,17).show(2)

+-------------+----------+---+-----+---------------------+------------+------------+
|    Bank Name|      City| ST| CERT|Acquiring Institution|Closing Date|Updated Date|
+-------------+----------+---+-----+---------------------+------------+------------+
|       InBank|Oak Forest| IL|20203| MB Financial Bank...|    4-Sep-09|   17-Oct-15|
|Bank of Alamo|     Alamo| TN| 9961|          No Acquirer|    8-Nov-02|   18-Mar-05|
+-------------+----------+---+-----+---------------------+------------+------------+
only showing top 2 rows

Replace 7 in the above dataframe with 17 at all instances
+-------------+----------+---+-----+---------------------+------------+------------+
|    Bank Name|      City| ST| CERT|Acquiring Institution|Closing Date|Updated Date|
+-------------+----------+---+-----+---------------------+------------+------------+
|       InBank|Oak Forest| IL|20203| MB Financial Bank...|    4-Sep-09|   17-Oct-15|
|Bank of Alamo|     Alamo| TN| 9961|          No Acquirer|    8-Nov