---
# Task A - Data ingestion

**Spark setup**

In [1]:
# Basic imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the SparkSession used throughout this notebook.
# Driver and executor memory are set to 8g each and shuffles use 16 partitions.
spark = (
    SparkSession.builder
    .appName("CrimesFix")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.sql.shuffle.partitions", "16")
    .getOrCreate()
)

**Reading and checking data**

In [3]:
# Reading data
file_transactions = '/home/jovyan/code/Projecto/Datasets/3_crimes.csv'

! head $file_transactions
df_crimes = spark.read.csv(file_transactions, header=True, sep=',', inferSchema=True)


ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
11037294,JA371270,03/18/2015 12:00:00 PM,0000X W WACKER DR,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,BANK,false,false,0111,001,42,32,11,,,2015,08/01/2017 03:52:26 PM,,,
11646293,JC213749,12/20/2018 03:00:00 PM,023XX N LOCKWOOD AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,APARTMENT,false,false,2515,025,36,19,11,,,2018,04/06/2019 04:04:43 PM,,,
11645836,JC212333,05/01/2016 12:25:00 AM,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,false,false,0824,008,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,
11645959,JC211511,12/20/2018 04:00:00 PM,045XX N ALBANY AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,false,false,1724,017,33,14,08A,,,2018,04/06/2019 04:04:43 PM,,,
11645601,JC212935,06/01/2014 12:01:00 AM,087XX S SANG

In [None]:
# Preview the first 10 rows, report the total row count, then print the inferred schema
df_crimes.show(n=10)
n_rows_total = df_crimes.count()
print(f'df_crimes - number of rows is {n_rows_total}.')
df_crimes.printSchema()

+--------+-----------+--------------------+--------------------+----+------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|                Date|               Block|IUCR|      Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+--------------------+--------------------+----+------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|11037294|   JA371270|03/18/2015 12:00:...|   0000X W WACKER DR|1153|DECEPTIVE PRACTICE|FINANCIAL IDENTIT...|                BANK| false|   false| 111|       1|  42|            32|      11| 

In [None]:
# Check for fully duplicated rows: compare the total row count with the
# row count that would remain after dropDuplicates()
n_rows_total = df_crimes.count()
n_rows_distinct = df_crimes.dropDuplicates().count()
print(f'df_crimes - number of rows is {n_rows_total}; after dropDuplicates() applied would be {n_rows_distinct}.')

df_crimes - number of rows is 8104658; after dropDuplicates() applied would be 8104658.


In [None]:
# Check for null values: how many rows would be left if every row
# containing at least one null were dropped
n_rows_without_nulls = df_crimes.dropna(how='any').count()
print(f'''df_crimes - number of rows after dropna(how='any') would be {n_rows_without_nulls}.''')

df_crimes - number of rows after dropna(how='any') would be 7399802.


In [None]:
# Roughly 700 thousand rows contain at least one null (8104658 rows in total vs
# 7399802 after dropna), so let's check how many nulls are in each column.
# All per-column counts are computed in ONE aggregation (a single pass over the
# data) instead of launching a separate filter().count() job for every column.
print('Checking nulls at each column of df_crimes')
dict_nulls_retail = df_crimes.select(
    [
        # when() without otherwise() yields NULL for rows that do not match and
        # count() ignores NULLs, so this counts exactly the rows where the
        # column is null.
        F.count(F.when(df_crimes[col].isNull(), 1)).alias(col)
        for col in df_crimes.columns
    ]
).first().asDict()  # single result row -> {column name: null count}
dict_nulls_retail

Checking nulls at each column of df_crimes


{'ID': 0,
 'Case Number': 0,
 'Date': 0,
 'Block': 0,
 'IUCR': 0,
 'Primary Type': 0,
 'Description': 0,
 'Location Description': 13257,
 'Arrest': 0,
 'Domestic': 0,
 'Beat': 0,
 'District': 47,
 'Ward': 614831,
 'Community Area': 613455,
 'FBI Code': 0,
 'X Coordinate': 89642,
 'Y Coordinate': 89642,
 'Year': 0,
 'Updated On': 0,
 'Latitude': 89642,
 'Longitude': 89642,
 'Location': 89642}

In [11]:
# Drop every row that has a null in any column.
# Per the counts above this removes about 705 thousand rows (8104658 -> 7399802);
# Ward and Community Area hold the most nulls (~614k each).
df_crimes = df_crimes.na.drop(how='any')

In [12]:
# Report the row count after the null rows were removed
n_rows_after_dropna = df_crimes.count()
print(f'df_crimes - number of rows is {n_rows_after_dropna}.')

df_crimes - number of rows is 7399802.


In [13]:
# Re-check the per-column null counts after dropna(); every column should report 0.
# The counts are computed in ONE aggregation (a single pass over the data)
# instead of launching a separate filter().count() job for every column.
print('Checking nulls at each column of df_crimes')
dict_nulls_retail = df_crimes.select(
    [
        # when() without otherwise() yields NULL for rows that do not match and
        # count() ignores NULLs, so this counts exactly the rows where the
        # column is null.
        F.count(F.when(df_crimes[col].isNull(), 1)).alias(col)
        for col in df_crimes.columns
    ]
).first().asDict()  # single result row -> {column name: null count}
dict_nulls_retail

Checking nulls at each column of df_crimes


{'ID': 0,
 'Case Number': 0,
 'Date': 0,
 'Block': 0,
 'IUCR': 0,
 'Primary Type': 0,
 'Description': 0,
 'Location Description': 0,
 'Arrest': 0,
 'Domestic': 0,
 'Beat': 0,
 'District': 0,
 'Ward': 0,
 'Community Area': 0,
 'FBI Code': 0,
 'X Coordinate': 0,
 'Y Coordinate': 0,
 'Year': 0,
 'Updated On': 0,
 'Latitude': 0,
 'Longitude': 0,
 'Location': 0}