# Combine Cal Fire and Kaggle Data Sets

## Imports and Globals

In [None]:
import datetime
import math

import pandas as pd
from tqdm import tqdm

In [None]:
# Mount Google Drive (Colab only); every file path used in this notebook
# lives under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Input CSVs (one per data source) on the mounted Drive.
# Expect files to be sorted by fire start date
CAL_FIRE_FILEPATH = "/content/drive/My Drive/ML6140  - Project/Raw Data/Cal Fire/CalFire_IncidentData_Reformatted.csv"
KAGGLE_FIRE_FILEPATH = "/content/drive/My Drive/ML6140  - Project/Raw Data/Kaggle_DATA/california_fires_mod.csv"

In [None]:
# Output CSV locations. COMBINED_OUTFILE_PATH is written by the last cell of
# this notebook. DEDUPED_OUTFILE_PATH is not referenced by the cells shown
# here -- presumably it is used by a later de-duplication step (TODO confirm).
COMBINED_OUTFILE_PATH = "/content/drive/My Drive/ML6140  - Project/Raw Data/CombinedFires/combinedFires.csv"
DEDUPED_OUTFILE_PATH = "/content/drive/My Drive/ML6140  - Project/Raw Data/CombinedFires/combinedFires_deduped.csv"

## Find matches between Cal Fire and Kaggle during overlap period

### Helpers

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2):
  """
  Great-circle distance between two points, in kilometers.

  Both points are given as latitude/longitude in decimal degrees, and the
  Earth is modeled as a sphere of radius 6371 km.
  """
  earth_radius_km = 6371.0

  # The trigonometric functions below work in radians.
  phi1 = math.radians(lat1)
  lam1 = math.radians(lon1)
  phi2 = math.radians(lat2)
  lam2 = math.radians(lon2)

  delta_phi = phi2 - phi1
  delta_lam = lam2 - lam1

  # Haversine of the central angle between the two points.
  hav = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lam / 2)**2

  # Recover the central angle (radians) and scale it to an arc length.
  central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))
  return earth_radius_km * central_angle

### Functions for matching

In [None]:
# def match_names(name1, name2):
#     # Ensure name1 and name2 are strings, replace NaNs with empty strings
#     name1 = str(name1).strip().lower() if pd.notna(name1) else ''
#     name2 = str(name2).strip().lower() if pd.notna(name2) else ''

#     # Adjust the threshold as needed
#     return fuzz.partial_ratio(name1, name2) > 80

def match_names(cf_name, kaggle_name):
  """
  Return True if the Cal Fire incident name appears inside the Kaggle name.

  Cal Fire names in this notebook look like "River Fire" while Kaggle names
  look like "RIVER", so a trailing " fire" is removed from the Cal Fire name
  (only when it is actually present) and the comparison ignores case.

  Args:
    cf_name: Cal Fire incident name, e.g. "Pala Fire".
    kaggle_name: Kaggle incident name, e.g. "PALA # 4".

  Returns:
    bool: True when the cleaned Cal Fire name is a substring of the Kaggle
    name. False when either name is not a string (e.g. NaN) or when nothing
    is left of the Cal Fire name after cleaning.
  """
  if not isinstance(cf_name, str) or not isinstance(kaggle_name, str):
    return False

  suffix = " fire"
  cf_name = cf_name.strip().lower()
  if cf_name.endswith(suffix):
    # Only drop the suffix when it is there; blindly slicing off 5 characters
    # mangles names such as "Lake Complex".
    cf_name = cf_name[:-len(suffix)].strip()
  elif cf_name == "fire":
    cf_name = ""

  # An empty string is a substring of everything, so it must never match.
  if not cf_name:
    return False

  kaggle_name = kaggle_name.lower()
  # TODO: would be better with regex to enforce "River" only matches with "River" and not also with "Riverside"
  return cf_name in kaggle_name

In [None]:
def match_times(year1, month1, day1, year2, month2, day2, day_threshold=1):
    """
    Return True if two calendar dates are at most `day_threshold` days apart.

    The difference is measured on real calendar dates, so dates that straddle
    a month or year boundary (Jan 31 vs Feb 1, Dec 31 vs Jan 1) are compared
    correctly.

    Args:
        year1, month1, day1: first date (ints, or floats holding whole numbers).
        year2, month2, day2: second date.
        day_threshold: maximum allowed absolute difference in days.

    Returns:
        bool: True when the dates are within `day_threshold` days of each other.
    """
    try:
        date1 = datetime.date(int(year1), int(month1), int(day1))
        date2 = datetime.date(int(year2), int(month2), int(day2))
    except (TypeError, ValueError, OverflowError):
        # Components that do not form a valid date (NaN, day 0, ...): fall back
        # to the plain component comparison instead of raising.
        if year1 != year2 or month1 != month2:
            return False
        return abs(day1 - day2) <= day_threshold

    return abs((date1 - date2).days) <= day_threshold

In [None]:
def match_locations(loc1, loc2, max_distance_km=0.0111):
    """
    Tell whether two points lie within `max_distance_km` of each other.

    loc1 and loc2 are (latitude, longitude) pairs; the separation is the
    haversine distance between them in kilometers.
    """
    lat_a, lon_a = loc1[0], loc1[1]
    lat_b, lon_b = loc2[0], loc2[1]
    return haversine_distance(lat_a, lon_a, lat_b, lon_b) <= max_distance_km

In [None]:
def match_sizes(size1, size2, ratio_threshold=0.05):
    """
    Return True if two sizes agree to within `ratio_threshold`.

    The smaller size divided by the larger must be at least
    `1 - ratio_threshold`. A size of zero only matches another zero.

    Args:
        size1, size2: the two sizes to compare (same unit).
        ratio_threshold: allowed relative shortfall of the smaller size.

    Returns:
        bool: True when the sizes are considered equal; always False when
        either size is NaN.
    """
    # min()/max() give order-dependent answers with NaN (e.g. min(5, nan) and
    # max(5, nan) are both 5), which would report a perfect match.
    if math.isnan(size1) or math.isnan(size2):
        return False

    # Avoid division by zero for very small sizes
    if size1 == 0 or size2 == 0:
        return size1 == size2

    # Calculate the ratio of the smaller size to the larger size
    size_ratio = min(size1, size2) / max(size1, size2)

    # Compare the size ratio to the threshold
    return size_ratio >= (1 - ratio_threshold)

In [None]:
def dist_to_radius_ratio(lat1, lon1, lat2, lon2, acres1, acres2):
  """
  Express the gap between two fires as a multiple of the smaller fire's radius.

  The smaller of the two burned areas (a negative value is treated as 0) is
  converted from acres to km2 and modeled as a circle. Returns the haversine
  distance between the two points divided by that circle's radius, or None
  when the radius is zero.
  """
  separation_km = haversine_distance(lat1, lon1, lat2, lon2)
  smaller_acres = max(min(acres1, acres2), 0)
  area_km2 = smaller_acres / 247.1  # acres -> km2
  radius_km = (area_km2 / math.pi) ** .5  # from area = pi * r**2
  if not radius_km:
    return None
  return separation_km / radius_km

In [None]:
def dist_to_radius_ratio_valid(lat1, lon1, lat2, lon2, acres1, acres2, ratio_max=0.01):
  """
  Decide whether two fires are close enough relative to their size.

  Returns True when dist_to_radius_ratio(...) is at most `ratio_max`. When
  that ratio is falsy (None because the radius is zero, or exactly 0), the
  result is True only if the haversine distance between the points is zero.
  """
  ratio = dist_to_radius_ratio(lat1, lon1, lat2, lon2, acres1, acres2)
  if not ratio:
    # No usable ratio: only identical coordinates count as a match.
    return not haversine_distance(lat1, lon1, lat2, lon2)
  return ratio <= ratio_max

### Main

### Read in and format DFs

In [None]:
# Load the Cal Fire incidents as-is.
cal_fire_full_df = pd.read_csv(CAL_FIRE_FILEPATH)

# Kaggle column name -> Cal Fire column name, so both frames share one schema.
# Add or remove columns as needed
kaggle_to_cal_fire_columns = {
    'FIRE_NAME': 'incident_name',
    'DISCOVERY_YEAR': 'incident_created_year',
    'DISCOVERY_MONTH': 'incident_created_month',
    'DISCOVERY_DAY': 'incident_created_day',
    'DISCOVERY_HOUR': 'incident_created_hour',
    'DISCOVERY_MINUTE': 'incident_created_minute',
    'CONT_YEAR': 'incident_extinguished_year',
    'CONT_MONTH': 'incident_extinguished_month',
    'CONT_DAY': 'incident_extinguished_day',
    'CONT_HOUR': 'incident_extinguished_hour',
    'CONT_MINUTE': 'incident_extinguished_minute',
    'FIRE_SIZE': 'incident_acres_burned',
    'LATITUDE': 'incident_latitude',
    'LONGITUDE': 'incident_longitude'
}
kaggle_fire_full_df = pd.read_csv(KAGGLE_FIRE_FILEPATH).rename(columns=kaggle_to_cal_fire_columns)

# Restrict both frames to fires created in 2013-2015 (inclusive), the overlap period.
cal_fire_in_overlap = cal_fire_full_df['incident_created_year'].between(2013, 2015)
kaggle_fire_in_overlap = kaggle_fire_full_df['incident_created_year'].between(2013, 2015)
cal_fire_overlap_df = cal_fire_full_df[cal_fire_in_overlap]
kaggle_fire_overlap_df = kaggle_fire_full_df[kaggle_fire_in_overlap]

### Inspect DFs

In [None]:
# Row count and first rows of the Cal Fire incidents created in 2013-2015.
print(len(cal_fire_overlap_df))
cal_fire_overlap_df.head()

316


Unnamed: 0,incident_name,incident_created_year,incident_created_month,incident_created_day,incident_created_hour,incident_created_minute,incident_acres_burned,incident_longitude,incident_latitude,incident_extinguished_year,incident_extinguished_month,incident_extinguished_day,incident_extinguished_hour,incident_extinguished_minute
0,River Fire,2013,2,24,8,16,407.0,-118.01651,36.602575,2013.0,2.0,28.0,20.0,0.0
1,Fawnskin Fire,2013,4,20,17,30,30.0,-116.941311,34.288877,2013.0,4.0,22.0,9.0,0.0
2,Gold Fire,2013,4,30,12,59,274.0,-119.635004,37.116295,2013.0,5.0,1.0,7.0,0.0
3,Silverado Fire,2013,4,30,23,44,75.0,-122.350844,38.441792,2013.0,5.0,1.0,17.0,15.0
4,Yellow Fire,2013,5,1,2,1,125.0,-122.655616,38.638828,2013.0,5.0,3.0,6.0,15.0


In [None]:
# Number of overlap-period Cal Fire incidents per creation year.
cal_fire_overlap_df["incident_created_year"].value_counts()

2013    141
2015     99
2014     76
Name: incident_created_year, dtype: int64

In [None]:
# How often each first character occurs among Cal Fire incident names.
cal_fire_overlap_df["incident_name"].str[:1].value_counts()

C    40
S    34
B    27
M    21
L    20
P    19
D    17
R    15
G    14
W    14
F    14
H    12
T    11
O     8
A     7
K     6
N     6
V     5
E     5
3     4
I     4
J     4
5     3
U     2
1     1
Y     1
2     1
Q     1
Name: incident_name, dtype: int64

In [None]:
# Count of Cal Fire overlap rows whose incident name is missing.
len(cal_fire_overlap_df[pd.isna(cal_fire_overlap_df["incident_name"])])

0

In [None]:
# Row count and first rows of the Kaggle fires created in 2013-2015.
print(len(kaggle_fire_overlap_df))
kaggle_fire_overlap_df.head()

22576


Unnamed: 0,incident_name,incident_created_year,incident_created_month,incident_created_day,incident_created_hour,incident_created_minute,STAT_CAUSE_DESCR,incident_extinguished_year,incident_extinguished_month,incident_extinguished_day,incident_extinguished_hour,incident_extinguished_minute,incident_acres_burned,incident_latitude,incident_longitude,STATE
166974,,2013,1,1,0.0,11.0,Campfire,,,,,,0.1,32.701186,-117.103192,CA
166975,WALTERS,2013,1,1,12.0,56.0,Miscellaneous,2013.0,1.0,1.0,15.0,0.0,0.3,33.313447,-114.862947,CA
166976,,2013,1,1,16.0,36.0,Missing/Undefined,,,,,,0.12,34.572792,-118.045025,CA
166977,,2013,1,1,16.0,36.0,Missing/Undefined,,,,,,0.12,34.95,-118.166666,CA
166978,,2013,1,2,13.0,10.0,Campfire,,,,,,0.01,37.95543,-121.339826,CA


In [None]:
# Number of overlap-period Kaggle fires per creation year.
kaggle_fire_overlap_df["incident_created_year"].value_counts()

2013    8734
2015    7362
2014    6480
Name: incident_created_year, dtype: int64

In [None]:
# Keep only Kaggle rows that have a name (.str.startswith returns NaN for
# missing names, which cannot be used as a boolean mask), then show the fires
# whose name starts with "S".
kf_droppedNa = kaggle_fire_overlap_df[pd.notnull(kaggle_fire_overlap_df["incident_name"])]
kf_droppedNa[kf_droppedNa["incident_name"].str.startswith("S")]

Unnamed: 0,incident_name,incident_created_year,incident_created_month,incident_created_day,incident_created_hour,incident_created_minute,STAT_CAUSE_DESCR,incident_extinguished_year,incident_extinguished_month,incident_extinguished_day,incident_extinguished_hour,incident_extinguished_minute,incident_acres_burned,incident_latitude,incident_longitude,STATE
167053,SOLEDAD,2013,1,16,10.0,58.0,Miscellaneous,2013.0,1.0,16.0,11.0,45.0,1.00,34.436944,-118.369167,CA
167120,SPRINGS,2013,1,26,14.0,15.0,Missing/Undefined,2013.0,1.0,26.0,18.0,30.0,15.00,37.309280,-118.306640,CA
167205,SHADOW,2013,2,13,13.0,7.0,Miscellaneous,2013.0,2.0,13.0,13.0,45.0,0.50,34.435278,-118.378889,CA
167239,SANDRAIL,2013,2,15,15.0,42.0,Equipment Use,2013.0,2.0,15.0,15.0,48.0,0.10,32.968300,-115.169600,CA
167246,SILVER,2013,2,16,13.0,21.0,Miscellaneous,2013.0,2.0,16.0,13.0,30.0,0.25,34.432778,-118.420556,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189489,S STANISLAUS AV /W MOUNT,2015,12,7,14.0,22.0,Debris Burning,2015.0,12.0,7.0,14.0,57.0,0.10,36.428560,-120.338298,CA
189512,SHADY,2015,12,16,3.0,10.0,Missing/Undefined,2015.0,12.0,16.0,3.0,34.0,1.00,33.663848,-116.173449,CA
189527,SOLIMAR,2015,12,25,0.0,45.0,Missing/Undefined,,,,,,1288.00,34.314167,-119.352778,CA
189534,S DIVISION DR BIG_BEAR_L,2015,12,25,21.0,38.0,Miscellaneous,,,,,,0.01,34.252214,-116.866547,CA


In [None]:
# How often each first character occurs among Kaggle incident names.
kaggle_fire_overlap_df["incident_name"].str[:1].value_counts()

S    1682
C    1559
M    1333
B    1268
H    1081
P     972
R     932
L     765
W     687
D     620
T     585
A     568
G     566
F     540
E     417
O     387
V     306
N     292
I     263
J     238
1     236
K     212
3     104
2      99
Y      64
Q      54
U      48
5      46
4      46
9      34
6      32
7      23
Z      18
8      13
X       3
#       3
_       1
{       1
0       1
Name: incident_name, dtype: int64

In [None]:
# Count of Kaggle overlap rows whose incident name is missing.
len(kaggle_fire_overlap_df[pd.isna(kaggle_fire_overlap_df["incident_name"])])

6477

Ideas:
- Search by name. That eliminates misses more quickly than year.
- Use str.startswith instead of fuzz match--better accuracy and more flexibility
- Iteratively increase cutoff for location/acres/time

In [None]:
# Demonstration of why substring matching is used instead of fuzzy matching.
# NOTE(review): `fuzz` is not imported in any cell shown in this notebook --
# presumably it came from fuzzywuzzy/thefuzz; add that import to the imports
# cell before running this cell on a fresh kernel.
temp_name_cal_fire_example = "Pala Fire"
temp_name_kaggle_example = "# PALA TEMECULA RD MP 2.5 P"
temp_name_kaggle_example2 = "PALA # 4"  # probably the matching fire (by eye)
print("should match, but wouldn't at threshold 80")
print(f"fuzz('{temp_name_cal_fire_example}', '{temp_name_kaggle_example}'): ", fuzz.partial_ratio(temp_name_cal_fire_example, temp_name_kaggle_example))  # wouldn't match at threshold 80
print(f"fuzz('{temp_name_cal_fire_example}', '{temp_name_kaggle_example2}'): ", fuzz.partial_ratio(temp_name_cal_fire_example, temp_name_kaggle_example2))  # wouldn't match
# `[:-4]` drops "Fire" but keeps the space in front of it.
print(f"'{temp_name_kaggle_example}' startswith '{temp_name_cal_fire_example}': ", temp_name_kaggle_example.lower().startswith(temp_name_cal_fire_example[:-4].lower()))  # in practice, would also strip
print()
print(f"'{temp_name_kaggle_example2}' startswith '{temp_name_cal_fire_example}': ", temp_name_kaggle_example2.lower().startswith(temp_name_cal_fire_example[:-4].lower()))
print(f"'{temp_name_cal_fire_example}' in '{temp_name_kaggle_example}': ", temp_name_cal_fire_example[:-4].lower() in temp_name_kaggle_example.lower())
# The label now names the string that is actually tested (example2).
print(f"'{temp_name_cal_fire_example}' in '{temp_name_kaggle_example2}': ", temp_name_cal_fire_example[:-4].lower() in temp_name_kaggle_example2.lower())
print()
print("shouldn't match, but do")
print(f"fuzz('# GOLF MECULA RD MP 2.5 P', '{temp_name_kaggle_example}'): ", fuzz.partial_ratio("# GOLF MECULA RD MP 2.5 P", temp_name_kaggle_example))

should match, but wouldn't at threshold 80
fuzz('Pala Fire', '# PALA TEMECULA RD MP 2.5 P'):  22
fuzz('Pala Fire', 'PALA # 4'):  25
'# PALA TEMECULA RD MP 2.5 P' startswith 'Pala Fire':  False

'PALA # 4' startswith 'Pala Fire':  True
'Pala Fire' in '# PALA TEMECULA RD MP 2.5 P':  True
'Pala Fire' in '# PALA TEMECULA RD MP 2.5 P':  True

shouldn't match, but do
fuzz('# GOLF MECULA RD MP 2.5 P', '# PALA TEMECULA RD MP 2.5 P'):  80


### Perform matching

In [None]:
def get_matches(cal_fires_to_match_df, max_distance_km=0.0111, dist_to_radius_ratio_threshold=0.02, kaggle_df=None):
  """
  Find, for each Cal Fire incident, the Kaggle fires that look like the same fire.

  A Kaggle row is a candidate when its name contains the Cal Fire name (see
  match_names) or when it has no name at all. A candidate is a match when its
  creation date passes match_times AND it is either within `max_distance_km`
  of the Cal Fire incident or passes dist_to_radius_ratio_valid.

  Args:
    cal_fires_to_match_df: Cal Fire incidents to look up.
    max_distance_km: absolute distance (km) that always counts as co-located.
    dist_to_radius_ratio_threshold: passed as `ratio_max` to
      dist_to_radius_ratio_valid.
    kaggle_df: Kaggle fires to search. Defaults to the notebook-level
      `kaggle_fire_overlap_df`, which keeps existing calls working.

  Returns:
    dict: Cal Fire index label -> list of matching Kaggle rows (pandas
    Series). Incidents without any match have no entry.
  """
  if kaggle_df is None:
    kaggle_df = kaggle_fire_overlap_df

  matches = {}

  # Named rows are filtered by name for every incident (the biggest reduction
  # in search size); unnamed rows cannot be filtered that way, so they are
  # always kept as candidates.
  has_name = pd.notnull(kaggle_df["incident_name"])
  kaggle_name_not_null = kaggle_df[has_name]
  kaggle_null_name = kaggle_df[~has_name]

  for cf_index, cf_row in tqdm(cal_fires_to_match_df.iterrows(), total=len(cal_fires_to_match_df)):  # tqdm for progress bar
    cf_name = cf_row["incident_name"]

    name_mask = kaggle_name_not_null["incident_name"].apply(lambda k_name: match_names(cf_name, k_name))
    candidates = pd.concat([kaggle_name_not_null[name_mask], kaggle_null_name])

    iter_matches = []
    for _, k_row in candidates.iterrows():
      if not match_times(cf_row['incident_created_year'], cf_row['incident_created_month'], cf_row['incident_created_day'],
                         k_row['incident_created_year'], k_row['incident_created_month'], k_row['incident_created_day']):
        continue

      # Either essentially the same coordinates, or close relative to fire size.
      close_enough = (
          haversine_distance(cf_row['incident_latitude'], cf_row['incident_longitude'],
                             k_row['incident_latitude'], k_row['incident_longitude']) <= max_distance_km
          or dist_to_radius_ratio_valid(cf_row['incident_latitude'], cf_row['incident_longitude'],
                                        k_row['incident_latitude'], k_row['incident_longitude'],
                                        cf_row['incident_acres_burned'], k_row['incident_acres_burned'],
                                        ratio_max=dist_to_radius_ratio_threshold)
      )
      if close_enough:
        iter_matches.append(k_row)

    if iter_matches:
      matches[cf_index] = iter_matches

  return matches

In [None]:
# Get matches
# Threshold rationale: at a distance-to-radius ratio of .3 about 80% of the two
# circles' areas overlap, at .4 about 75%.
# https://www.123calculus.com/en/two-circles-calculator-page-7-60-400.html
dist_to_radius_ratio_threshold = .4
# Every Cal Fire incident in the overlap period is a candidate for matching.
cal_fires_to_match_df = cal_fire_overlap_df
matches = get_matches(cal_fires_to_match_df,
                      dist_to_radius_ratio_threshold=dist_to_radius_ratio_threshold)

100%|██████████| 316/316 [02:29<00:00,  2.11it/s]


View matches

In [None]:
# Build a readable list of (Cal Fire row, first Kaggle match, comparison stats).
matches_as_list = []
for cf_index, kaggle_candidates in matches.items():
  # Keys of `matches` are index labels (from iterrows), so look them up with
  # .loc; .iloc only gives the same row while the index happens to be 0..n-1.
  cal_fire_row = cal_fire_overlap_df.loc[cf_index]
  # Accept both shapes of `matches`: a list of rows per key, or a single row
  # per key once the lists have been collapsed.
  kaggle_row = kaggle_candidates[0] if isinstance(kaggle_candidates, list) else kaggle_candidates

  cf_lat, cf_lon = cal_fire_row["incident_latitude"], cal_fire_row["incident_longitude"]
  k_lat, k_lon = kaggle_row["incident_latitude"], kaggle_row["incident_longitude"]
  cf_acres = cal_fire_row["incident_acres_burned"]
  k_acres = kaggle_row["incident_acres_burned"]

  pair = {
      "cal fire": cal_fire_row,
      "kaggle": kaggle_row
  }
  pair["stats"] = {
      "distance": haversine_distance(cf_lat, cf_lon, k_lat, k_lon),
      "distance to radius ratio": dist_to_radius_ratio(cf_lat, cf_lon, k_lat, k_lon, cf_acres, k_acres),
      "size ratio": min(cf_acres, k_acres) / max(cf_acres, k_acres),
      # Difference in day-of-month only (Cal Fire minus Kaggle).
      "days off": cal_fire_row["incident_created_day"] - kaggle_row["incident_created_day"]
  }
  matches_as_list.append(pair)
matches_as_list

[{'cal fire': incident_name                   Cherry Fire
  incident_created_year                  2013
  incident_created_month                    5
  incident_created_day                     20
  incident_created_hour                    13
  incident_created_minute                  47
  incident_acres_burned                  25.0
  incident_longitude              -116.907213
  incident_latitude                  33.96201
  incident_extinguished_year           2013.0
  incident_extinguished_month             5.0
  incident_extinguished_day              20.0
  incident_extinguished_hour             18.0
  incident_extinguished_minute           50.0
  Name: 15, dtype: object,
  'kaggle': incident_name                              CHERRY
  incident_created_year                        2013
  incident_created_month                          5
  incident_created_day                           19
  incident_created_hour                        13.0
  incident_created_minute                      

In [None]:
# Collapse each list of candidate rows to its single row, failing loudly when
# an incident matched more than one Kaggle fire.
for key, candidates in list(matches.items()):
  if not isinstance(candidates, list):
    # Already collapsed by an earlier run of this cell; leave it alone so the
    # cell can be re-run safely.
    continue
  if len(candidates) > 1:
    raise Exception(f"Multiple matches for key: {key}")
  matches[key] = candidates[0]

View unmatched

In [None]:
# Get unmatched fires
# Derived from the untouched overlap frame rather than from itself, so
# re-running this cell does not raise KeyError for labels already dropped.
cal_fires_to_match_df = cal_fire_overlap_df.drop(index=list(matches.keys()))

In [None]:
# Display the Cal Fire incidents left after dropping the matched ones.
cal_fires_to_match_df

Unnamed: 0,incident_name,incident_created_year,incident_created_month,incident_created_day,incident_created_hour,incident_created_minute,incident_acres_burned,incident_longitude,incident_latitude,incident_extinguished_year,incident_extinguished_month,incident_extinguished_day,incident_extinguished_hour,incident_extinguished_minute
0,River Fire,2013,2,24,8,16,407.0,-118.016510,36.602575,2013.0,2.0,28.0,20.0,0.0
1,Fawnskin Fire,2013,4,20,17,30,30.0,-116.941311,34.288877,2013.0,4.0,22.0,9.0,0.0
2,Gold Fire,2013,4,30,12,59,274.0,-119.635004,37.116295,2013.0,5.0,1.0,7.0,0.0
3,Silverado Fire,2013,4,30,23,44,75.0,-122.350844,38.441792,2013.0,5.0,1.0,17.0,15.0
4,Yellow Fire,2013,5,1,2,1,125.0,-122.655616,38.638828,2013.0,5.0,3.0,6.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,Munjar Fire,2015,10,2,14,38,104.0,-121.893800,39.870700,2015.0,10.0,2.0,17.0,50.0
312,Cienega Fire,2015,10,12,16,0,690.0,-121.327340,36.708540,2015.0,10.0,16.0,18.0,0.0
313,Gibraltar Fire,2015,10,29,5,30,21.0,-119.632160,34.482800,2015.0,11.0,9.0,12.0,0.0
314,Potrero Fire,2015,11,7,1,45,50.0,-118.879444,34.150000,2015.0,11.0,7.0,16.0,45.0


## Combine Datasets, substituting Cal Fire data for matching fires

Replace Kaggle matches with Cal Fire data. Add unmatched 2013-2015 Cal Fire fires to data set. Add the Kaggle and Cal Fire fires from outside the overlap range.

In [None]:
# Index labels of the Kaggle rows that were matched to a Cal Fire row
# (each matched row is a Series whose .name is its label in the Kaggle frame).
kaggle_matches_to_drop = [kaggle_row.name for kaggle_row in matches.values()]

In [None]:
# All Cal Fire rows, followed by every Kaggle row that was not matched to one.
kaggle_fires_unmatched_df = kaggle_fire_full_df.drop(kaggle_matches_to_drop)
combined_df = pd.concat([cal_fire_full_df, kaggle_fires_unmatched_df])

In [None]:
# Display the combined frame (Cal Fire rows first, then Kaggle rows).
combined_df

Unnamed: 0,incident_name,incident_created_year,incident_created_month,incident_created_day,incident_created_hour,incident_created_minute,incident_acres_burned,incident_longitude,incident_latitude,incident_extinguished_year,incident_extinguished_month,incident_extinguished_day,incident_extinguished_hour,incident_extinguished_minute,STAT_CAUSE_DESCR,STATE
0,River Fire,2013,2,24,8.0,16.0,407.00,-118.016510,36.602575,2013.0,2.0,28.0,20.0,0.0,,
1,Fawnskin Fire,2013,4,20,17.0,30.0,30.00,-116.941311,34.288877,2013.0,4.0,22.0,9.0,0.0,,
2,Gold Fire,2013,4,30,12.0,59.0,274.00,-119.635004,37.116295,2013.0,5.0,1.0,7.0,0.0,,
3,Silverado Fire,2013,4,30,23.0,44.0,75.00,-122.350844,38.441792,2013.0,5.0,1.0,17.0,15.0,,
4,Yellow Fire,2013,5,1,2.0,1.0,125.00,-122.655616,38.638828,2013.0,5.0,3.0,6.0,15.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189545,PINON,2015,12,29,14.0,52.0,0.10,-117.358313,33.808482,2015.0,12.0,29.0,15.0,10.0,Miscellaneous,CA
189546,,2015,12,30,17.0,23.0,0.01,-120.166666,35.050000,,,,,,Missing/Undefined,CA
189547,ACADEMY 2,2015,12,30,17.0,51.0,0.10,-119.556721,36.536611,,,,,,Debris Burning,CA
189548,LEIGH,2015,12,30,18.0,16.0,0.10,-117.007382,32.701123,,,,,,Missing/Undefined,CA


## Write to file

Clean a bit

In [None]:
# Drop the columns that only the Kaggle data provides.
extra_columns = ["STAT_CAUSE_DESCR", "STATE"]
# errors="ignore" lets this cell be re-run after the columns are already gone.
combined_df = combined_df.drop(columns=extra_columns, errors="ignore")

In [None]:
# Display the combined frame after the extra columns were dropped.
combined_df

Unnamed: 0,incident_name,incident_created_year,incident_created_month,incident_created_day,incident_created_hour,incident_created_minute,incident_acres_burned,incident_longitude,incident_latitude,incident_extinguished_year,incident_extinguished_month,incident_extinguished_day,incident_extinguished_hour,incident_extinguished_minute
0,River Fire,2013,2,24,8.0,16.0,407.00,-118.016510,36.602575,2013.0,2.0,28.0,20.0,0.0
1,Fawnskin Fire,2013,4,20,17.0,30.0,30.00,-116.941311,34.288877,2013.0,4.0,22.0,9.0,0.0
2,Gold Fire,2013,4,30,12.0,59.0,274.00,-119.635004,37.116295,2013.0,5.0,1.0,7.0,0.0
3,Silverado Fire,2013,4,30,23.0,44.0,75.00,-122.350844,38.441792,2013.0,5.0,1.0,17.0,15.0
4,Yellow Fire,2013,5,1,2.0,1.0,125.00,-122.655616,38.638828,2013.0,5.0,3.0,6.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189545,PINON,2015,12,29,14.0,52.0,0.10,-117.358313,33.808482,2015.0,12.0,29.0,15.0,10.0
189546,,2015,12,30,17.0,23.0,0.01,-120.166666,35.050000,,,,,
189547,ACADEMY 2,2015,12,30,17.0,51.0,0.10,-119.556721,36.536611,,,,,
189548,LEIGH,2015,12,30,18.0,16.0,0.10,-117.007382,32.701123,,,,,


In [None]:
# Column dtypes of the combined frame, as a check before writing it out.
combined_df.dtypes

incident_name                    object
incident_created_year             int64
incident_created_month            int64
incident_created_day              int64
incident_created_hour           float64
incident_created_minute         float64
incident_acres_burned           float64
incident_longitude              float64
incident_latitude               float64
incident_extinguished_year      float64
incident_extinguished_month     float64
incident_extinguished_day       float64
incident_extinguished_hour      float64
incident_extinguished_minute    float64
dtype: object

In [None]:
# Lower-case the incident names, leaving missing names missing.
# (str(x).lower() would turn NaN into the literal string "nan".)
incident_names = combined_df["incident_name"]
combined_df["incident_name"] = incident_names.where(
    incident_names.isna(), incident_names.astype(str).str.lower()
)

Write to file

In [None]:
# Order rows by creation date, then by name within a day.
sort_columns = ["incident_created_year", "incident_created_month", "incident_created_day", "incident_name"]
combined_df = combined_df.sort_values(sort_columns)

In [None]:
# Write the combined frame to Drive; index=False leaves the row labels out of the file.
combined_df.to_csv(COMBINED_OUTFILE_PATH, index=False)