# Pandas API on Spark

In [30]:
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

## Conceptual foundations

### Configuration

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
spark.sql("set spark.sql.ansi.enabled=false")

DataFrame[key: string, value: string]

In [9]:
print("Spark version:", spark.version)
print("pyspark.pandas version:", ps.sys.version)

print(ps.options.display.max_rows)

Spark version: 4.0.0
pyspark.pandas version: 3.13.5 | packaged by conda-forge | (main, Jun 16 2025, 08:17:35) [GCC 13.3.0]
1000


## Cluster and data access

### Create simple dataframe

In [10]:
df = ps.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
print(type(df))
df.head()

<class 'pyspark.pandas.frame.DataFrame'>


Unnamed: 0,x,y
0,1,10
1,2,20
2,3,30


### Loading data

In [11]:
shark_incidents_dirty_df = ps.read_csv("shark-incidents.csv", index_col="UIN")
shark_incidents_df = ps.read_parquet("shark-incidents.parquet", index_col="uin")

shark_incidents_df.head()

Unnamed: 0_level_0,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_°c,personal_protective_device,deterrent_brand_and_type,data_source,reference,Unnamed: 59_level_0
uin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1
1,1,1791,fatal,NSW,near sydney,-33.866667,151.2,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,,swimming,,,,,,,,,torso,major lacerations,female,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,book,"shark&survl, whitley 1958, book ref 1793",
2,3,1803,injured,WA,"hamelin bay, faure island",-25.833333,113.883333,coastal,,tiger shark,Galeocerdo cuvier,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,swam at victim,Y,,,,male,,,,,,,,pushed at shark,,,,,0.0,,1.0,,,,,,,,,,,,book,"balgridge,green,taylor,whitley 1940",
3,1,1807,injured,NSW,"cockle bay, sydney harbour",-33.866667,151.2,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour",,,,unprovoked,,1.0,swimming,,,,,,bit victim on wrist,,,"arm, hand",minor lacerations,male,,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,sydney gazette 18.1.1807,
4,1,1820,fatal,TAS,"sweetwater point, pitt water",-42.8,147.533333,coastal,,,,,,,,provoked,,1.0,swimming,,,,catch,,bit victim on leg,N,,leg,major lacerations,male,,,,,,,,,,,,,1.0,,,,,,100.0,,,,,,,,witness account,"shark&survl, c. black researcher",
5,1,1825,injured,NSW,"kirribili point, sydney harbour",-33.85,151.216667,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,bit legs,,,leg,minor lacerations,male,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,maitland daily mercury 13.11.1899,


### Creating dataframes programmatically

In [None]:
df_spark = ps.DataFrame({"name": ["Alice", "Bob"], "age": [25, 32]})
df_spark.head()

In [None]:
df_local = pd.DataFrame({"city": ["London", "Paris"], "country": ["UK", "FR"]})
df_from_pandas = ps.from_pandas(df_local)
df_from_pandas.head()

### Inspecting data

In [12]:
shark_incidents_df.head(5)

Unnamed: 0_level_0,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_°c,personal_protective_device,deterrent_brand_and_type,data_source,reference,Unnamed: 59_level_0
uin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1
1,1,1791,fatal,NSW,near sydney,-33.866667,151.2,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,,swimming,,,,,,,,,torso,major lacerations,female,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,book,"shark&survl, whitley 1958, book ref 1793",
2,3,1803,injured,WA,"hamelin bay, faure island",-25.833333,113.883333,coastal,,tiger shark,Galeocerdo cuvier,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,swam at victim,Y,,,,male,,,,,,,,pushed at shark,,,,,0.0,,1.0,,,,,,,,,,,,book,"balgridge,green,taylor,whitley 1940",
3,1,1807,injured,NSW,"cockle bay, sydney harbour",-33.866667,151.2,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour",,,,unprovoked,,1.0,swimming,,,,,,bit victim on wrist,,,"arm, hand",minor lacerations,male,,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,sydney gazette 18.1.1807,
4,1,1820,fatal,TAS,"sweetwater point, pitt water",-42.8,147.533333,coastal,,,,,,,,provoked,,1.0,swimming,,,,catch,,bit victim on leg,N,,leg,major lacerations,male,,,,,,,,,,,,,1.0,,,,,,100.0,,,,,,,,witness account,"shark&survl, c. black researcher",
5,1,1825,injured,NSW,"kirribili point, sydney harbour",-33.85,151.216667,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,bit legs,,,leg,minor lacerations,male,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,maitland daily mercury 13.11.1899,


In [13]:
shark_incidents_df.info()

<class 'pyspark.pandas.frame.DataFrame'>
Index: 1196 entries, 1 to 1196
Data columns (total 59 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   incident_month               1196 non-null   int64  
 1   incident_year                1196 non-null   int64  
 2   victim_injury                1196 non-null   object 
 3   state                        1196 non-null   object 
 4   location                     1193 non-null   object 
 5   latitude                     1196 non-null   float64
 6   longitude                    1196 non-null   float64
 7   site_category                1196 non-null   object 
 8   site_category_comment        23 non-null     object 
 9   shark_common_name            1144 non-null   object 
 10  shark_scientific_name        1144 non-null   object 
 11  shark_identification_method  994 non-null    object 
 12  shark_identification_source  138 non-null    object 
 13  shark_length_m      

In [14]:
shark_incidents_df.describe()

Unnamed: 0,incident_month,incident_year,latitude,longitude,shark_length_m,no_sharks,victim_age,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,air_temperature_°c,Unnamed: 18
count,1196.0,1196.0,1196.0,1196.0,579.0,1107.0,699.0,97.0,83.0,514.0,518.0,230.0,91.0,228.0,69.0,356.0,40.0,1.0
mean,5.915552,1966.895485,-28.584045,143.191359,2.698446,1.03523,28.164521,1.556701,3.289157,1281.745136,2.129923,59.37,20.981319,5.961404,8.347826,3201.761236,24.175,415438758.0
std,4.090449,48.260971,7.931424,13.194388,1.209384,0.347591,13.794858,1.561043,5.511695,410.957788,5.485205,253.578818,4.12709,9.286013,14.188672,21638.80955,4.914069,
min,1.0,1791.0,-43.6523,96.816667,0.3,1.0,0.0,0.0,0.0,130.0,0.0,0.1,0.3,0.5,0.0,1.0,10.0,415438758.0
25%,2.0,1931.0,-33.866667,141.6,1.8,1.0,17.0,1.0,1.0,930.0,0.0,3.0,19.0,1.0,1.0,30.0,22.0,415438758.0
50%,5.0,1983.0,-31.816667,149.983333,2.6,1.0,25.0,1.0,2.0,1300.0,0.0,10.0,21.0,2.0,5.0,80.0,25.0,415438758.0
75%,10.0,2010.0,-24.383333,152.183333,3.5,1.0,36.0,2.0,3.0,1620.0,1.0,30.0,23.0,7.0,10.0,200.0,27.0,415438758.0
max,12.0,2022.0,-9.4,159.15,6.0,10.0,84.0,12.0,40.0,2330.0,45.0,2160.0,29.0,80.0,100.0,280000.0,35.0,415438758.0


## Transformations

### Selecting and filtering

In [15]:
subset_df = shark_incidents_df[["shark_common_name", "victim_injury"]]
subset_df.head()

Unnamed: 0_level_0,shark_common_name,victim_injury
uin,Unnamed: 1_level_1,Unnamed: 2_level_1
1,white shark,fatal
2,tiger shark,injured
3,bull shark,injured
4,,fatal
5,bull shark,injured


In [16]:
fatal_df = subset_df[subset_df["victim_injury"] == "fatal"]
fatal_df.head()

Unnamed: 0_level_0,shark_common_name,victim_injury
uin,Unnamed: 1_level_1,Unnamed: 2_level_1
1,white shark,fatal
4,,fatal
10,whaler shark,fatal
11,whaler shark,fatal
12,bull shark,fatal


### Column expressions

In [17]:
shark_incidents_df["was_fatal"] = shark_incidents_df["victim_injury"] == "fatal"
shark_incidents_df.head()

Unnamed: 0_level_0,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_°c,personal_protective_device,deterrent_brand_and_type,data_source,reference,Unnamed: 59_level_0,was_fatal
uin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1,1,1791,fatal,NSW,near sydney,-33.866667,151.2,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,,swimming,,,,,,,,,torso,major lacerations,female,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,book,"shark&survl, whitley 1958, book ref 1793",,True
2,3,1803,injured,WA,"hamelin bay, faure island",-25.833333,113.883333,coastal,,tiger shark,Galeocerdo cuvier,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,swam at victim,Y,,,,male,,,,,,,,pushed at shark,,,,,0.0,,1.0,,,,,,,,,,,,book,"balgridge,green,taylor,whitley 1940",,False
3,1,1807,injured,NSW,"cockle bay, sydney harbour",-33.866667,151.2,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour",,,,unprovoked,,1.0,swimming,,,,,,bit victim on wrist,,,"arm, hand",minor lacerations,male,,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,sydney gazette 18.1.1807,,False
4,1,1820,fatal,TAS,"sweetwater point, pitt water",-42.8,147.533333,coastal,,,,,,,,provoked,,1.0,swimming,,,,catch,,bit victim on leg,N,,leg,major lacerations,male,,,,,,,,,,,,,1.0,,,,,,100.0,,,,,,,,witness account,"shark&survl, c. black researcher",,True
5,1,1825,injured,NSW,"kirribili point, sydney harbour",-33.85,151.216667,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,bit legs,,,leg,minor lacerations,male,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,maitland daily mercury 13.11.1899,,False


### Handling missing data

In [18]:
shark_incidents_dirty_df.tail()

Unnamed: 0_level_0,Incident.month,Incident.year,Victim.injury,State,Location,Latitude,Longitude,Site.category,Site.category.comment,Shark.common.name,Shark.scientific.name,Shark.identification.method,Shark.identification.source,Shark.length.m,Basis.for.length,Provoked/unprovoked,Provocative.act,No.sharks,Victim.activity,Fish.speared?,Commercial.dive.activity,Object.of.bite,Present.at.time.of.bite,Direction.first.strike,Shark.behaviour,Victim.aware.of.shark,Shark.captured,Injury.location,Injury.severity,Victim.gender,Victim.age,Victim.clothing,Clothing.coverage,Dominant.clothing.colour,Other.clothing.colour,Clothing.pattern,Fin.colour,Diversionary.action.taken,Diversionary.action.outcome,People <3m,People 3-15m,Time.of.incident,Depth.of.incident.m,Teeth.recovered,Time.in.water.min,Water.temperature.°C,Total.water.depth.m,Water.visability.m,Distance.to.shore.m,Spring.or.neap.tide,Tidal.cycle,Wind.condition,Weather.condition,Air.temperature.°C,Personal.protective.device,Deterrent.brand.and.type,Data.source,Reference,_c59
UIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [19]:
shark_incidents_dirty_df.isnull().sum(axis=0)

Incident.month                   11
Incident.year                    11
Victim.injury                    10
State                            10
Location                         13
Latitude                         10
Longitude                        10
Site.category                    10
Site.category.comment          1183
Shark.common.name                63
Shark.scientific.name            63
Shark.identification.method     213
Shark.identification.source    1069
Shark.length.m                  628
Basis.for.length                725
Provoked/unprovoked              15
Provocative.act                1055
No.sharks                       100
Victim.activity                  36
Fish.speared?                  1207
Commercial.dive.activity       1091
Object.of.bite                  921
Present.at.time.of.bite         617
Direction.first.strike          905
Shark.behaviour                 227
Victim.aware.of.shark           593
Shark.captured                 1139
Injury.location             

In [20]:
clean_df = shark_incidents_dirty_df.dropna(subset=["Shark.common.name"])
clean_df.tail()

Unnamed: 0_level_0,Incident.month,Incident.year,Victim.injury,State,Location,Latitude,Longitude,Site.category,Site.category.comment,Shark.common.name,Shark.scientific.name,Shark.identification.method,Shark.identification.source,Shark.length.m,Basis.for.length,Provoked/unprovoked,Provocative.act,No.sharks,Victim.activity,Fish.speared?,Commercial.dive.activity,Object.of.bite,Present.at.time.of.bite,Direction.first.strike,Shark.behaviour,Victim.aware.of.shark,Shark.captured,Injury.location,Injury.severity,Victim.gender,Victim.age,Victim.clothing,Clothing.coverage,Dominant.clothing.colour,Other.clothing.colour,Clothing.pattern,Fin.colour,Diversionary.action.taken,Diversionary.action.outcome,People <3m,People 3-15m,Time.of.incident,Depth.of.incident.m,Teeth.recovered,Time.in.water.min,Water.temperature.°C,Total.water.depth.m,Water.visability.m,Distance.to.shore.m,Spring.or.neap.tide,Tidal.cycle,Wind.condition,Weather.condition,Air.temperature.°C,Personal.protective.device,Deterrent.brand.and.type,Data.source,Reference,_c59
UIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1
1191,2,2022,fatal,NSW,"little bay, sydney",-30.883,153.083,coastal,bay to open ocean,white shark,Carcharodon carcharias,,government official,4.5,,unprovoked,,1,swimming,,,,fishing,below,video online (graphic),N,N,,,male,35.0,,,,,,,,,,,1700.0,,,,,,,50.0,,,,,,,,,https://www.watoday.com.au/national/nsw/one-of...,
1192,2,2022,injured,QLD,"Redcliffe, Brisbane",-27.2333,153.2667,ocean/pelagic,,tiger shark,Galeocerdo cuvier,,witness/victim,3.5,,unprovoked,,1,other:floating,,,,fishing,,,,,leg,major lacerations,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1193,2,2022,injured,WA,"wylie bay, esperance",-33.84848,121.9184,coastal,bay to open ocean,white shark,Carcharodon carcharias,,government official,3.3,,unprovoked,,1,other:floating,,,,,,,,,torso,major lacerations,female,20.0,,,,,,,,,,,1200.0,,,,,,,,,,,,,,,,https://www.dailymail.co.uk/news/article-10483...,
1194,2,2022,injured,NSW,"shelley beach, manly",-33.8006,151.2982,coastal,bay to open ocean,wobbegong,Orectolobidae,victim,,,,unprovoked,,1,diving,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1195,3,2022,injured,NSW,crowdy head beach,-32.0,152.7512,coastal,surf beach,wobbegong,Orectolobidae,,government official,,,unprovoked,,1,swimming,,,,baitfish,,,,,left leg,minor lacerations,male,32.0,swimwear,,black,,,,,,,,1720.0,,,40.0,20.0,1.2,,50.0,,,,,,,,,,


### Sorting and renaming

In [22]:
longest_df = shark_incidents_df.sort_values("shark_length_m", ascending=False)
longest_df.head()

Unnamed: 0_level_0,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_°c,personal_protective_device,deterrent_brand_and_type,data_source,reference,Unnamed: 59_level_0,was_fatal
uin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
345,2,1936,fatal,NSW,"manly, south steyne, sydney",-33.783333,151.283333,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,6.0,witness/victim,unprovoked,,1,swimming,,,,swimming,,,N,,other: body not recovered,,male,14.0,,,,,,,,,,1.0,1500.0,0.0,,,23.0,2.0,,100.0,,,,,,,,media outlet,"sharpe, green, shark&survl, the canberra times...",,True
596,2,1982,fatal,TAS,south cape bay,-43.566667,146.883333,coastal,,white shark,Carcharodon carcharias,direct observation,,6.0,"witness/victim, other: comparison to boat",provoked,,1,spearfishing,,,,"spearfishing, boating",,bit victim on body,N,,other: body not recovered,,male,32.0,wetsuit/drysuit,full body,black,,,,,,,,,0.0,,,,,,50.0,,,moderate breeze,,,speargun,,media outlet,"green, shark&survl, c.black, daily mirror 1.3....",,True
606,6,1984,uninjured,TAS,cape raoul,-43.233333,147.783333,coastal,,white shark,Carcharodon carcharias,direct observation,,6.0,witness/victim,provoked,enticed shark,1,diving,,,Catch,catch,,swam towards victim,Y,,,,male,,,,,,,,pushed at shark,,,,,,,,,5.0,,,,,,,,,,media outlet,the mercury 16.6.1984,,False
609,10,1984,uninjured,TAS,"the lanterns, fortescue bay",-43.133333,148.0,coastal,,white shark,Carcharodon carcharias,direct observation,,6.0,witness/victim,provoked,enticed shark,1,diving,,Y,Catch,catch,,circled victim,Y,,,,male,,wetsuit/drysuit,full body,black,,,,hid in rock cave,,,,,23.0,,5.0,,,23.0,,,,,,,,,media outlet,the mercury 15.10.1984,,False
613,3,1985,fatal,SA,"wiseman's beach, port lincoln",-34.466667,136.016667,coastal,,white shark,Carcharodon carcharias,direct observation,,6.0,witness/victim,unprovoked,,1,snorkelling,,,,snorkelling,behind,bit victim,N,,torso,major lacerations,female,33.0,wetsuit/drysuit,full body,black,,,yellow,,,,,1230.0,,,,,,,150.0,,,,,,,,media outlet,"papers, taylor.",,True


In [23]:
renamed_df = shark_incidents_dirty_df.rename(columns={"Shark.common.name": "species"})
renamed_df.head()

Unnamed: 0_level_0,Incident.month,Incident.year,Victim.injury,State,Location,Latitude,Longitude,Site.category,Site.category.comment,species,Shark.scientific.name,Shark.identification.method,Shark.identification.source,Shark.length.m,Basis.for.length,Provoked/unprovoked,Provocative.act,No.sharks,Victim.activity,Fish.speared?,Commercial.dive.activity,Object.of.bite,Present.at.time.of.bite,Direction.first.strike,Shark.behaviour,Victim.aware.of.shark,Shark.captured,Injury.location,Injury.severity,Victim.gender,Victim.age,Victim.clothing,Clothing.coverage,Dominant.clothing.colour,Other.clothing.colour,Clothing.pattern,Fin.colour,Diversionary.action.taken,Diversionary.action.outcome,People <3m,People 3-15m,Time.of.incident,Depth.of.incident.m,Teeth.recovered,Time.in.water.min,Water.temperature.°C,Total.water.depth.m,Water.visability.m,Distance.to.shore.m,Spring.or.neap.tide,Tidal.cycle,Wind.condition,Weather.condition,Air.temperature.°C,Personal.protective.device,Deterrent.brand.and.type,Data.source,Reference,_c59
UIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1
1,1,1791,fatal,NSW,near sydney,-33.86666667,151.2,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,,swimming,,,,,,,,,torso,major lacerations,female,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,book,"shark&survl, whitley 1958, book ref 1793",
2,3,1803,injured,WA,"hamelin bay, faure island",-25.83333333,113.8833333,coastal,,tiger shark,Galeocerdo cuvier,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,swam at victim,Y,,,,male,,,,,,,,pushed at shark,,,,,0.0,,1.0,,,,,,,,,,,,book,"balgridge,green,taylor,whitley 1940",
3,1,1807,injured,NSW,"cockle bay, sydney harbour",-33.86666667,151.2,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour",,,,unprovoked,,1.0,swimming,,,,,,bit victim on wrist,,,"arm, hand",minor lacerations,male,,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,sydney gazette 18.1.1807,
4,1,1820,fatal,TAS,"sweetwater point, pitt water",-42.8,147.5333333,coastal,,,,,,,,provoked,,1.0,swimming,,,,catch,,bit victim on leg,N,,leg,major lacerations,male,,,,,,,,,,,,,1.0,,,,,,100.0,,,,,,,,witness account,"shark&survl, c. black researcher",
5,1,1825,injured,NSW,"kirribili point, sydney harbour",-33.85,151.2166667,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,bit legs,,,leg,minor lacerations,male,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,maitland daily mercury 13.11.1899,


## Aggregations, joins and dates

### GroupBy and aggregations

In [24]:
length_df = (
    shark_incidents_df.groupby("shark_common_name")
    .agg(largest=("shark_length_m", "max"), average=("shark_length_m", "median"))
    .sort_values(["average"], ascending=False)
)

length_df.head()

Unnamed: 0_level_0,largest,average
shark_common_name,Unnamed: 1_level_1,Unnamed: 2_level_1
white shark,6.0,3.0
tiger shark,5.0,3.0
hammerhead shark,4.0,3.0
broadnose sevengill shark,3.0,2.6
sevengill shark,2.6,2.6


### Joins

In [25]:
shark_incidents_df.merge(
    length_df, how="left", left_on="shark_common_name", right_index=True
).head()

Unnamed: 0_level_0,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_°c,personal_protective_device,deterrent_brand_and_type,data_source,reference,Unnamed: 59_level_0,was_fatal,largest,average
uin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1
1,1,1791,fatal,NSW,near sydney,-33.866667,151.2,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,,swimming,,,,,,,,,torso,major lacerations,female,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,book,"shark&survl, whitley 1958, book ref 1793",,True,6.0,3.0
2,3,1803,injured,WA,"hamelin bay, faure island",-25.833333,113.883333,coastal,,tiger shark,Galeocerdo cuvier,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,swam at victim,Y,,,,male,,,,,,,,pushed at shark,,,,,0.0,,1.0,,,,,,,,,,,,book,"balgridge,green,taylor,whitley 1940",,False,5.0,3.0
3,1,1807,injured,NSW,"cockle bay, sydney harbour",-33.866667,151.2,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour",,,,unprovoked,,1.0,swimming,,,,,,bit victim on wrist,,,"arm, hand",minor lacerations,male,,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,sydney gazette 18.1.1807,,False,4.0,2.25
4,1,1820,fatal,TAS,"sweetwater point, pitt water",-42.8,147.533333,coastal,,,,,,,,provoked,,1.0,swimming,,,,catch,,bit victim on leg,N,,leg,major lacerations,male,,,,,,,,,,,,,1.0,,,,,,100.0,,,,,,,,witness account,"shark&survl, c. black researcher",,True,,
5,1,1825,injured,NSW,"kirribili point, sydney harbour",-33.85,151.216667,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,bit legs,,,leg,minor lacerations,male,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,maitland daily mercury 13.11.1899,,False,4.0,2.25


### Dates and times

In [26]:
shark_incidents_df["incident_day"] = 1

shark_incidents_df["incident_date"] = ps.to_datetime(
    shark_incidents_df.rename(
        columns={
            "incident_day": "day",
            "incident_month": "month",
            "incident_year": "year",
        }
    )
).head()

shark_incidents_df.head()

Unnamed: 0_level_0,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_°c,personal_protective_device,deterrent_brand_and_type,data_source,reference,Unnamed: 59_level_0,was_fatal,incident_day,incident_date
uin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1
1,1,1791,fatal,NSW,near sydney,-33.866667,151.2,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,,swimming,,,,,,,,,torso,major lacerations,female,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,book,"shark&survl, whitley 1958, book ref 1793",,True,1,1791-01-01
2,3,1803,injured,WA,"hamelin bay, faure island",-25.833333,113.883333,coastal,,tiger shark,Galeocerdo cuvier,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,swam at victim,Y,,,,male,,,,,,,,pushed at shark,,,,,0.0,,1.0,,,,,,,,,,,,book,"balgridge,green,taylor,whitley 1940",,False,1,1803-03-01
3,1,1807,injured,NSW,"cockle bay, sydney harbour",-33.866667,151.2,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour",,,,unprovoked,,1.0,swimming,,,,,,bit victim on wrist,,,"arm, hand",minor lacerations,male,,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,sydney gazette 18.1.1807,,False,1,1807-01-01
4,1,1820,fatal,TAS,"sweetwater point, pitt water",-42.8,147.533333,coastal,,,,,,,,provoked,,1.0,swimming,,,,catch,,bit victim on leg,N,,leg,major lacerations,male,,,,,,,,,,,,,1.0,,,,,,100.0,,,,,,,,witness account,"shark&survl, c. black researcher",,True,1,1820-01-01
5,1,1825,injured,NSW,"kirribili point, sydney harbour",-33.85,151.216667,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,bit legs,,,leg,minor lacerations,male,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,maitland daily mercury 13.11.1899,,False,1,1825-01-01


In [28]:
shark_incidents_df.dtypes

incident_month             int64
incident_year              int64
victim_injury             object
state                     object
location                  object
                       ...      
reference                 object
                           int64
was_fatal                   bool
incident_day               int32
incident_date     datetime64[ns]
Length: 62, dtype: object

## Advanced operations

### Window functions

In [29]:
shark_incidents_df["length_rank"] = shark_incidents_df.groupby("shark_common_name")[
    "shark_length_m"
].rank(method="dense", ascending=False)

shark_incidents_df.head()

Unnamed: 0_level_0,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_°c,personal_protective_device,deterrent_brand_and_type,data_source,reference,Unnamed: 59_level_0,was_fatal,incident_day,incident_date,length_rank
uin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1
1,1,1791,fatal,NSW,near sydney,-33.866667,151.2,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,,swimming,,,,,,,,,torso,major lacerations,female,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,book,"shark&survl, whitley 1958, book ref 1793",,True,1,1791-01-01,39.0
2,3,1803,injured,WA,"hamelin bay, faure island",-25.833333,113.883333,coastal,,tiger shark,Galeocerdo cuvier,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,swam at victim,Y,,,,male,,,,,,,,pushed at shark,,,,,0.0,,1.0,,,,,,,,,,,,book,"balgridge,green,taylor,whitley 1940",,False,1,1803-03-01,25.0
3,1,1807,injured,NSW,"cockle bay, sydney harbour",-33.866667,151.2,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour",,,,unprovoked,,1.0,swimming,,,,,,bit victim on wrist,,,"arm, hand",minor lacerations,male,,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,sydney gazette 18.1.1807,,False,1,1807-01-01,24.0
4,1,1820,fatal,TAS,"sweetwater point, pitt water",-42.8,147.533333,coastal,,,,,,,,provoked,,1.0,swimming,,,,catch,,bit victim on leg,N,,leg,major lacerations,male,,,,,,,,,,,,,1.0,,,,,,100.0,,,,,,,,witness account,"shark&survl, c. black researcher",,True,1,1820-01-01,6.0
5,1,1825,injured,NSW,"kirribili point, sydney harbour",-33.85,151.216667,estuary/harbour,,bull shark,Carcharhinus leucas,"bite analysis, shark behaviour, geographical l...",,,,unprovoked,,1.0,swimming,,,,,,bit legs,,,leg,minor lacerations,male,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,media outlet,maitland daily mercury 13.11.1899,,False,1,1825-01-01,24.0


### Interoperating with PySpark

In [31]:
shark_incidents_sdf = shark_incidents_df.to_spark(index_col="uin")
shark_incidents_sdf.printSchema()

shark_incidents_extended_sdf = shark_incidents_sdf.withColumn(
    "shark_length_ft", F.col("shark_length_m") * 3.28084
)

shark_incidents_extended_df = shark_incidents_extended_sdf.pandas_api(index_col="uin")
shark_incidents_extended_df.sort_values("shark_length_m", ascending=False).head()

root
 |-- uin: long (nullable = true)
 |-- incident_month: long (nullable = true)
 |-- incident_year: long (nullable = true)
 |-- victim_injury: string (nullable = true)
 |-- state: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- site_category: string (nullable = true)
 |-- site_category_comment: string (nullable = true)
 |-- shark_common_name: string (nullable = true)
 |-- shark_scientific_name: string (nullable = true)
 |-- shark_identification_method: string (nullable = true)
 |-- shark_identification_source: string (nullable = true)
 |-- shark_length_m: double (nullable = true)
 |-- basis_for_length: string (nullable = true)
 |-- provoked_unprovoked: string (nullable = true)
 |-- provocative_act: string (nullable = true)
 |-- no_sharks: long (nullable = true)
 |-- victim_activity: string (nullable = true)
 |-- fish_speared_: string (nullable = true)
 |-- commercial_dive_activity: 

Unnamed: 0_level_0,incident_month,incident_year,victim_injury,state,location,latitude,longitude,site_category,site_category_comment,shark_common_name,shark_scientific_name,shark_identification_method,shark_identification_source,shark_length_m,basis_for_length,provoked_unprovoked,provocative_act,no_sharks,victim_activity,fish_speared_,commercial_dive_activity,object_of_bite,present_at_time_of_bite,direction_first_strike,shark_behaviour,victim_aware_of_shark,shark_captured,injury_location,injury_severity,victim_gender,victim_age,victim_clothing,clothing_coverage,dominant_clothing_colour,other_clothing_colour,clothing_pattern,fin_colour,diversionary_action_taken,diversionary_action_outcome,people_<3m,people_3_15m,time_of_incident,depth_of_incident_m,teeth_recovered,time_in_water_min,water_temperature_°c,total_water_depth_m,water_visability_m,distance_to_shore_m,spring_or_neap_tide,tidal_cycle,wind_condition,weather_condition,air_temperature_°c,personal_protective_device,deterrent_brand_and_type,data_source,reference,Unnamed: 59_level_0,was_fatal,incident_day,incident_date,length_rank,shark_length_ft
uin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
345,2,1936,fatal,NSW,"manly, south steyne, sydney",-33.783333,151.283333,coastal,,white shark,Carcharodon carcharias,"bite analysis, shark behaviour, geographical l...",,6.0,witness/victim,unprovoked,,1,swimming,,,,swimming,,,N,,other: body not recovered,,male,14.0,,,,,,,,,,1.0,1500.0,0.0,,,23.0,2.0,,100.0,,,,,,,,media outlet,"sharpe, green, shark&survl, the canberra times...",,True,1,NaT,1.0,19.68504
596,2,1982,fatal,TAS,south cape bay,-43.566667,146.883333,coastal,,white shark,Carcharodon carcharias,direct observation,,6.0,"witness/victim, other: comparison to boat",provoked,,1,spearfishing,,,,"spearfishing, boating",,bit victim on body,N,,other: body not recovered,,male,32.0,wetsuit/drysuit,full body,black,,,,,,,,,0.0,,,,,,50.0,,,moderate breeze,,,speargun,,media outlet,"green, shark&survl, c.black, daily mirror 1.3....",,True,1,NaT,1.0,19.68504
606,6,1984,uninjured,TAS,cape raoul,-43.233333,147.783333,coastal,,white shark,Carcharodon carcharias,direct observation,,6.0,witness/victim,provoked,enticed shark,1,diving,,,Catch,catch,,swam towards victim,Y,,,,male,,,,,,,,pushed at shark,,,,,,,,,5.0,,,,,,,,,,media outlet,the mercury 16.6.1984,,False,1,NaT,1.0,19.68504
609,10,1984,uninjured,TAS,"the lanterns, fortescue bay",-43.133333,148.0,coastal,,white shark,Carcharodon carcharias,direct observation,,6.0,witness/victim,provoked,enticed shark,1,diving,,Y,Catch,catch,,circled victim,Y,,,,male,,wetsuit/drysuit,full body,black,,,,hid in rock cave,,,,,23.0,,5.0,,,23.0,,,,,,,,,media outlet,the mercury 15.10.1984,,False,1,NaT,1.0,19.68504
613,3,1985,fatal,SA,"wiseman's beach, port lincoln",-34.466667,136.016667,coastal,,white shark,Carcharodon carcharias,direct observation,,6.0,witness/victim,unprovoked,,1,snorkelling,,,,snorkelling,behind,bit victim,N,,torso,major lacerations,female,33.0,wetsuit/drysuit,full body,black,,,yellow,,,,,1230.0,,,,,,,150.0,,,,,,,,media outlet,"papers, taylor.",,True,1,NaT,1.0,19.68504


## Performance and Exporting

### Performance considerations

In [34]:
shark_incidents_sdf = shark_incidents_df.to_spark(index_col="uin")

# Caching underlying Spark DataFrame
shark_incidents_sdf.cache()

# Repartitoning for performance
shark_incidents_repartitioned_sdf = shark_incidents_sdf.repartition("shark_common_name")
shark_incidents_repartitioned_df = shark_incidents_repartitioned_sdf.pandas_api()

### Exporting Data

In [None]:
shark_incidents_df.to_parquet("parquet/", index_col="uin")
shark_incidents_df.to_csv("csv/", index_col="uin")