In [1]:
import os
import re
#import datetime as dt

import numpy as np
import pandas as pd

In [2]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [3]:
# Get all files in the Details folder.
# os.listdir() returns entries in arbitrary order, so the list is sorted to keep
# positional lookups such as files[43] or files[-1] reproducible between runs.
details_path = "../Data/Details/"
files = sorted(
    f"{details_path}{filename}"
    for filename in os.listdir(details_path)
    if os.path.isfile(os.path.join(details_path, filename))
)

In [4]:
# Load the state FIPS lookup (all columns as strings), discard the saved index
# column and rename the remaining columns to their TOR_OTHER_* names.
state_fips_path = "../Data/cleaned_state.csv"
state_fips_df = (
    pd.read_csv(state_fips_path, dtype=str)
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"STATEFP": "TOR_OTHER_STATE_FIPS", "STATE": "TOR_OTHER_CZ_STATE"})
)
state_fips_df.head()

Unnamed: 0,TOR_OTHER_STATE_FIPS,TOR_OTHER_CZ_STATE
0,1,AL
1,2,AK
2,4,AZ
3,5,AR
4,6,CA


In [5]:
# Folder the combined tornado CSV/JSON exports are written to
output_path = "../Data/Cleaned/"
# Starting value of the tornado ID counter handed to identify_tornadoes()
tornado_id = 1

In [6]:
# Timezone data: one row per raw CZ_TIMEZONE code, written as
# (raw code, normalised code, offset from UTC in hours).
_timezone_rows = [
    ('CST', 'CST-6', -6),
    ('MST', 'MST-7', -7),
    ('EST', 'EST-5', -5),
    ('PST', 'PST-8', -8),
    ('UNK', 'UNK', 0),
    ('CDT', 'CDT-5', -5),
    ('EDT', 'EDT-4', -4),
    ('MDT', 'MDT-6', -6),
    ('GMT', 'GMT-0', 0),
    ('HST', 'HST-10', -10),
    ('PDT', 'PDT-7', -7),
    ('CSC', 'CST-6', -6),
    ('AST', 'AST-4', -4),
    ('EST-5', 'EST-5', -5),
    ('MST-7', 'MST-7', -7),
    ('CST-6', 'CST-6', -6),
    ('PST-8', 'PST-8', -8),
    ('AST-4', 'AST-4', -4),
    ('HST-10', 'HST-10', -10),
    ('AKST-9', 'AKST-9', -9),
]

# The three parallel lists are still exposed under their original names
tz_codes = [code for code, _, _ in _timezone_rows]
tz_recode = [recode for _, recode, _ in _timezone_rows]
tz_offsets = [offset for _, _, offset in _timezone_rows]

timezones_df = pd.DataFrame(
    _timezone_rows,
    columns=["CZ_TIMEZONE", "CZ_TIMEZONE_RECODE", "UTC_OFFSET"],
)

timezones_df

Unnamed: 0,CZ_TIMEZONE,CZ_TIMEZONE_RECODE,UTC_OFFSET
0,CST,CST-6,-6
1,MST,MST-7,-7
2,EST,EST-5,-5
3,PST,PST-8,-8
4,UNK,UNK,0
5,CDT,CDT-5,-5
6,EDT,EDT-4,-4
7,MDT,MDT-6,-6
8,GMT,GMT-0,0
9,HST,HST-10,-10


In [94]:
def _damage_to_dollars(damage):
    """Convert damage strings such as "250K", ".5K", "1.5M" or "2B" to whole dollars.

    Missing values count as "0.00K". A value that does not end in K, M or B is
    treated as thousands. A value whose numeric part cannot be parsed becomes 0.

    Returns an int64 Series aligned with ``damage``.
    """
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    text = damage.fillna("0.00K").astype(str)

    # Append the default "K" multiplier where no K/M/B suffix is present
    missing_suffix = text.str.match(r'\d*[.]?\d*[^KMB]\Z').astype(bool)
    text = text.mask(missing_suffix, text + "K")

    amount = pd.to_numeric(
        text.str.extract(r'(\d*[.]?\d*)[KMB]', expand=False), errors="coerce"
    ).astype("float64")
    scale = text.str.extract(r'\d*[.]?\d*([KMB])', expand=False).map(multipliers).astype("float64")

    # round() before the integer cast so float noise cannot drop a dollar
    return (amount * scale).fillna(0).round().astype("int64")


def process_file(file, timezones=None):
    """Read one Storm Events details CSV and return its cleaned tornado records.

    Steps: keep rows whose EVENT_TYPE is "Tornado", build the 5-character FIPS
    code, convert the local begin/end times to UTC epoch seconds, derive the
    F/EF level, total deaths and injuries, and convert the damage strings to
    integer dollar amounts.

    Parameters
    ----------
    file : str
        Path of the details CSV to read.
    timezones : pandas.DataFrame, optional
        Lookup with a "CZ_TIMEZONE" column (raw code) and a "UTC_OFFSET" column
        (hours). Defaults to the notebook-level ``timezones_df``.

    Returns
    -------
    pandas.DataFrame
        One row per tornado record, indexed 0..n-1, with the columns EVENT_ID,
        FIPS, BEGIN_TIMESTAMP, END_TIMESTAMP, DEATHS, INJURIES, DAMAGE_PROPERTY,
        DAMAGE_CROPS, TOR_F_SCALE, TOR_F_LEVEL, TOR_LENGTH, TOR_WIDTH, the
        BEGIN_/END_ range, azimuth, location, lat and lon columns, and
        EVENT_NARRATIVE.

    Raises
    ------
    ValueError
        If a tornado row has a CZ_TIMEZONE value that is not in the lookup, since
        its UTC timestamp could not be computed.
    """
    print(f"Processing file {file}")
    tz_table = timezones_df if timezones is None else timezones

    # Read file. FIPS, damage and location columns are kept as strings
    raw_df = pd.read_csv(file, dtype={"STATE_FIPS": str,
                                      "CZ_FIPS": str,
                                      "TOR_OTHER_CZ_FIPS": str,
                                      "DAMAGE_PROPERTY": str,
                                      "DAMAGE_CROPS": str,
                                      "BEGIN_AZIMUTH": str,
                                      "BEGIN_LOCATION": str,
                                      "END_AZIMUTH": str,
                                      "END_LOCATION": str})

    # Filter tornadoes. copy() gives an independent frame, so the assignments
    # below are not chained assignments on a slice of raw_df
    tornado_df = raw_df[raw_df["EVENT_TYPE"] == "Tornado"].copy()

    # Renumber Puerto Rico & Virgin Islands state FIPS
    tornado_df.loc[tornado_df["STATE"] == "PUERTO RICO", "STATE_FIPS"] = "72"
    tornado_df.loc[tornado_df["STATE"] == "VIRGIN ISLANDS", "STATE_FIPS"] = "78"

    # Combine State and County FIPS
    tornado_df["FIPS"] = tornado_df["STATE_FIPS"].str.zfill(2) + tornado_df["CZ_FIPS"].str.zfill(3)

    # Look up each row's UTC offset
    offsets = tz_table.drop_duplicates("CZ_TIMEZONE").set_index("CZ_TIMEZONE")["UTC_OFFSET"]
    offset_hours = tornado_df["CZ_TIMEZONE"].map(offsets)
    unknown_codes = tornado_df.loc[offset_hours.isna(), "CZ_TIMEZONE"].unique()
    if len(unknown_codes) > 0:
        raise ValueError(
            f"Unrecognised CZ_TIMEZONE value(s) in {file}: {sorted(map(str, unknown_codes))}"
        )
    utc_offset = pd.to_timedelta(offset_hours.astype("float64"), unit="h")

    # Convert BEGIN_DATE_TIME and END_DATE_TIME to UTC UNIX (POSIX) timestamps
    two_digit_year = re.compile(r'-\d{2}')
    years = tornado_df["YEAR"].astype(str)
    epoch = pd.Timestamp("1970-01-01")
    one_second = pd.Timedelta(seconds=1)
    for source_col, target_col in (("BEGIN_DATE_TIME", "BEGIN_TIMESTAMP"),
                                   ("END_DATE_TIME", "END_TIMESTAMP")):
        # Replace the 2-digit year with the 4-digit YEAR column so the century is unambiguous
        four_digit = pd.Series(
            [two_digit_year.sub(f"-{year}", stamp)
             for stamp, year in zip(tornado_df[source_col], years)],
            index=tornado_df.index, dtype=object)
        local_time = pd.to_datetime(four_digit, format="%d-%b-%Y %H:%M:%S")
        # local = UTC + offset, hence UTC = local - offset. Dividing by one second
        # gives epoch seconds regardless of the datetime resolution pandas picked
        tornado_df[target_col] = (local_time - utc_offset - epoch) // one_second

    # Get F/EF scale codes; a missing rating is recorded as "EFU"
    tornado_df["TOR_F_SCALE"] = tornado_df["TOR_F_SCALE"].fillna("EFU")
    tornado_df["TOR_F_LEVEL"] = tornado_df["TOR_F_SCALE"].str.extract(r'F(\w)', expand=False)

    # Accumulate Deaths & Injuries
    tornado_df["DEATHS"] = tornado_df["DEATHS_DIRECT"] + tornado_df["DEATHS_INDIRECT"]
    tornado_df["INJURIES"] = tornado_df["INJURIES_DIRECT"] + tornado_df["INJURIES_INDIRECT"]

    # Convert DAMAGE_PROPERTY and DAMAGE_CROPS to integer dollar amounts
    tornado_df["DAMAGE_PROPERTY"] = _damage_to_dollars(tornado_df["DAMAGE_PROPERTY"])
    tornado_df["DAMAGE_CROPS"] = _damage_to_dollars(tornado_df["DAMAGE_CROPS"])

    # Trim columns and hand back a fresh 0..n-1 index
    return tornado_df[[
        'EVENT_ID', 'FIPS',
        'BEGIN_TIMESTAMP', 'END_TIMESTAMP',
        'DEATHS', 'INJURIES', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS',
        'TOR_F_SCALE', 'TOR_F_LEVEL', 'TOR_LENGTH', 'TOR_WIDTH',
        'BEGIN_RANGE', 'BEGIN_AZIMUTH', 'BEGIN_LOCATION',
        'END_RANGE', 'END_AZIMUTH', 'END_LOCATION',
        'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
        'EVENT_NARRATIVE']].reset_index(drop=True)


In [97]:
enabled = True
write = True

if enabled:
    # Clean every yearly file, stack the results and order them by start time
    df_list = [process_file(file) for file in files]
    details_full_df = (
        pd.concat(df_list)
        .sort_values("BEGIN_TIMESTAMP")
        .reset_index(drop=True)
    )
    print("File processing complete")

    if write:
        # Export the combined table as CSV and as index-oriented JSON
        csv_target = f"{output_path}/Tornadoes_1950_2024.csv"
        json_target = f"{output_path}/Tornadoes_1950_2024.json"
        details_full_df.to_csv(csv_target, index=False)
        details_full_df.to_json(json_target, orient='index')
        print(f"File written to {csv_target}")
        print(f"File written to {json_target}")

Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1950_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1951_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1952_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1953_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1954_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1955_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1956_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1957_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1958_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1959_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1960_c20210803.csv
Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1961_c20210803.csv
Processing file ../Data/Deta

In [99]:
details_full_df["EVENT_NARRATIVE"]

0                                                      NaN
1                                                      NaN
2                                                      NaN
3                                                      NaN
4                                                      NaN
                               ...                        
78491    This tornado developed west of the S 560 Road ...
78492    An EF-0 tornado formed shortly before 11pm CDT...
78493    An EF-0 tornado formed at 1130pm CDT just east...
78494    This tornado developed north of Highway 62 ove...
78495    At the Oregon State University Ship Operations...
Name: EVENT_NARRATIVE, Length: 78496, dtype: object

In [145]:
# FIPS codes singled out for inspection. The list is only referenced by the
# commented-out debugging export at the end of this cell.
# NOTE(review): presumably these are codes that did not match the county FIPS
# reference file - confirm before relying on the list.
bad_fips = ['02101', '02155', '02181', '06000', '12000', '13597', '15000',
            '16000', '29677', '30000', '32000', '37000', '40012', '40018', '40020',
            '40026', '40028', '40030', '40032', '40038', '40040', '40046', '40048',
            '40050', '40052', '46000', '46001', '46131', '51039', '51123', '51780',
            '53000', '96010', '99003', '99005', '99008', '99009', '99010', '99011',
            '99013', '99021', '99069', '99079', '99081', '99085', '99091', '99097',
            '99099', '99127', '99139']

#details_full_df[details_full_df["FIPS"].isin(bad_fips)].to_csv("../Data/Cleaned/DEBUGGING.csv", index=False)

In [166]:
# Load the reference FIPS codes as strings
fips_path = "../../../Data/fips_data.csv"
fips_df = pd.read_csv(fips_path, dtype=str)
fips_list = list(fips_df["FIPS"])

# Show the tornado records whose FIPS code is absent from the reference list
has_known_fips = details_full_df["FIPS"].isin(fips_list)
details_full_df.loc[~has_known_fips]

Unnamed: 0,EVENT_ID,FIPS,BEGIN_TIMESTAMP,END_TIMESTAMP,DEATHS,INJURIES,DAMAGE_PROPERTY,DAMAGE_CROPS,TOR_F_SCALE,TOR_F_LEVEL,...,BEGIN_AZIMUTH,BEGIN_LOCATION,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EVENT_NARRATIVE


In [148]:
details_full_df[details_full_df['BEGIN_LAT'].isna()]

Unnamed: 0,EVENT_ID,FIPS,BEGIN_DATE_TIME,STATE,CZ_NAME,BEGIN_TIMESTAMP,END_TIMESTAMP,DEATHS,INJURIES,DAMAGE_PROPERTY,...,BEGIN_AZIMUTH,BEGIN_LOCATION,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EVENT_NARRATIVE
665,10017020,19075,02-JUN-1952 18:30:00,IOWA,GRUNDY,-554772600,-554772600,1,2,0,...,,,0.0,,,,,,,
5227,9979090,01047,04-NOV-1959 15:20:00,ALABAMA,DALLAS,-320553600,-320553600,0,0,0,...,,,0.0,,,,,,,
34759,10317175,06007,07-JAN-1993 15:00:00,CALIFORNIA,BUTTE,726447600,726447600,0,0,0,...,,Biggs,0.0,,,,,,,A barn roof was tossed 75 feet and 2 vehicles ...
34760,10320946,13309,08-JAN-1993 04:00:00,GEORGIA,WHEELER,726483600,726483600,0,0,0,...,,Alamo,0.0,,,,,,,A small tornado developed at Alamo and moved e...
34762,10319606,12057,08-JAN-1993 14:00:00,FLORIDA,HILLSBOROUGH,726519600,726519600,0,0,0,...,,,0.0,,,,,,,A small tornado touched down briefly blowing a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73264,986081,39153,21-OCT-2021 16:08:00,OHIO,SUMMIT,1634850480,1634850660,0,0,0,...,,,,,,,,,,A brief EF0 tornado with estimated maximum win...
73265,986070,39151,21-OCT-2021 16:10:00,OHIO,STARK,1634850600,1634850900,0,0,0,...,,,,,,,,,,An EF1 tornado with estimated maximum winds of...
73266,986136,39133,21-OCT-2021 16:21:00,OHIO,PORTAGE,1634851260,1634851320,0,0,0,...,,,,,,,,,,A very brief EF0 tornado with estimated maximu...
73267,986122,39155,21-OCT-2021 17:15:00,OHIO,TRUMBULL,1634854500,1634854620,0,0,0,...,,,,,,,,,,A brief EF1 tornado with estimated maximum win...


In [38]:
def find_next_tornado_segment(df, current_index):
    """Find the row that continues the tornado segment at ``current_index``.

    A row continues the segment when its CZ_FIPS, WFO and BEGIN_TIMESTAMP equal
    the current row's TOR_OTHER_CZ_FIPS, TOR_OTHER_WFO and END_TIMESTAMP.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the six columns named above.
    current_index : int
        Position (as used by ``iloc``) of the current segment in ``df``.

    Returns
    -------
    The index label of the first matching row, or -1 when there is none. The
    unmatched lookup values are printed in that case. The input is a position
    and the result is a label, so the two only line up on a 0..n-1 index.
    """
    #print("____________FIND NEXT SEGMENT____________")

    # The lookup is read-only, so the frame is used as-is rather than deep-copied
    # on every call, which made per-row use quadratic
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)

    next_fips = frame["TOR_OTHER_CZ_FIPS"].iloc[current_index]
    next_wfo = frame["TOR_OTHER_WFO"].iloc[current_index]
    next_timestamp = frame["END_TIMESTAMP"].iloc[current_index]

    is_continuation = (
        (frame["CZ_FIPS"] == next_fips)
        & (frame["WFO"] == next_wfo)
        & (frame["BEGIN_TIMESTAMP"] == next_timestamp)
    )
    found = frame[is_continuation]

    if len(found) == 0:
        print(f"""    Next FIPS: {next_fips}
    Next WFO: {next_wfo}
    Next Timestamp: {next_timestamp}""")
        print("Next segment not found.")
        return -1

    return found.index[0]

In [39]:
def identify_segments(df):
    """Return the index labels of the rows whose TOR_OTHER_CZ_FIPS is not missing.

    Prints a banner first. ``df`` itself is left unchanged.
    """
    print("____________IDENTIFY SEGMENTS____________")
    has_other_county = df["TOR_OTHER_CZ_FIPS"].notna()
    return df.loc[has_other_county].index

In [40]:
def identify_tornadoes(df, tornado_id):
    """Give every segment of the same tornado the same TORNADO_ID.

    Rows are visited in order. A row whose TORNADO_ID is still 0 starts a new
    tornado and receives the current ``tornado_id``. While the row names a
    continuation (TOR_OTHER_CZ_FIPS is set), the next segment is looked up with
    ``find_next_tornado_segment`` and given the same ID, so a tornado that
    crosses several counties is followed to its last segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a TORNADO_ID column (0 meaning "not assigned yet") plus
        the columns ``find_next_tornado_segment`` reads. It is not modified.
    tornado_id : int
        First ID to hand out.

    Returns
    -------
    (pandas.DataFrame, int)
        A copy of ``df`` with TORNADO_ID filled in, and the next unused ID.
    """
    print("____________IDENTIFY TORNADOES____________")

    work_df = pd.DataFrame(df).copy()
    # Work on a 0..n-1 index so positions and labels coincide: the segment lookup
    # takes a position but returns a label. The caller's index is restored below.
    original_index = work_df.index
    work_df = work_df.reset_index(drop=True)

    id_col = work_df.columns.get_loc("TORNADO_ID")
    other_fips_col = work_df.columns.get_loc("TOR_OTHER_CZ_FIPS")

    for position in range(len(work_df)):
        # Read the live value: iterrows() may yield copies taken before the loop,
        # which would hide IDs assigned to linked segments further down
        if work_df.iat[position, id_col] != 0:
            continue

        work_df.iat[position, id_col] = tornado_id

        # Follow the chain of continuation segments; `visited` guards against cycles
        visited = {position}
        current = position
        while pd.notna(work_df.iat[current, other_fips_col]):
            next_segment_index = find_next_tornado_segment(work_df, current)
            if next_segment_index == -1 or next_segment_index in visited:
                break
            work_df.iat[next_segment_index, id_col] = tornado_id
            print(f"Next segment Tornado ID: {work_df.iat[next_segment_index, id_col]}")
            visited.add(next_segment_index)
            current = next_segment_index

        tornado_id = tornado_id + 1

    work_df.index = original_index
    return work_df, tornado_id

In [89]:
# TESTING
enabled = True

if enabled:
    # Clean a single yearly file and order its records by start time
    result = process_file(files[43]).sort_values("BEGIN_TIMESTAMP")
    
    #print("DAMAGE_PROPERTY: ", result["DAMAGE_PROPERTY"].unique())
    #print("DAMAGE_CROPS: ", result["DAMAGE_CROPS"].unique())
    #print(result["FIPS"])
    #print(result["TOR_OTHER_FIPS"].unique())
    #print(result.columns)

    #print(result.dtypes)
    #result2, TEMP = identify_tornadoes(result, tornado_id)
    #print(result2)
    #print(f"Unique tornado count: {len(result2["TORNADO_ID"].unique())}")
    #print(result2["TORNADO_ID"])
    #result2.to_csv("tornado_id.csv")

Processing file ../Data/Details/StormEvents_details-ftp_v1.0_d1993_c20220425.csv
Empty DataFrame
Columns: [BEGIN_YEARMONTH, BEGIN_DAY, BEGIN_TIME, END_YEARMONTH, END_DAY, END_TIME, EPISODE_ID, EVENT_ID, STATE, STATE_FIPS, YEAR, MONTH_NAME, EVENT_TYPE, CZ_TYPE, CZ_FIPS, CZ_NAME, WFO, BEGIN_DATE_TIME, CZ_TIMEZONE, END_DATE_TIME, INJURIES_DIRECT, INJURIES_INDIRECT, DEATHS_DIRECT, DEATHS_INDIRECT, DAMAGE_PROPERTY, DAMAGE_CROPS, SOURCE, MAGNITUDE, MAGNITUDE_TYPE, FLOOD_CAUSE, CATEGORY, TOR_F_SCALE, TOR_LENGTH, TOR_WIDTH, TOR_OTHER_WFO, TOR_OTHER_CZ_STATE, TOR_OTHER_CZ_FIPS, TOR_OTHER_CZ_NAME, BEGIN_RANGE, BEGIN_AZIMUTH, BEGIN_LOCATION, END_RANGE, END_AZIMUTH, END_LOCATION, BEGIN_LAT, BEGIN_LON, END_LAT, END_LON, EPISODE_NARRATIVE, EVENT_NARRATIVE, DATA_SOURCE]
Index: []

[0 rows x 51 columns]
DAMAGE_PROPERTY RAW:  DAMAGE_PROPERTY
0       226
500K    116
50K     105
5K       76
5M       51
.5K      25
50M       9
.05K      4
3         1
30        1
06        1
1K        1
Name: count, dtype:

In [13]:
enabled = False

if enabled:
    result = process_file(files[-1])
    # Renumber the rows after sorting: identify_tornadoes addresses rows both by
    # position and by label, and the two only agree on a 0..n-1 index
    result = result.sort_values("BEGIN_TIMESTAMP").reset_index(drop=True)
    #print(result.dtypes)
    result2, TEMP = identify_tornadoes(result, tornado_id)
    print(result2)
    # Single quotes inside the f-string keep this valid on Python versions before 3.12
    print(f"Unique tornado count: {len(result2['TORNADO_ID'].unique())}")
    print(result2["TORNADO_ID"])
    result2.to_csv("tornado_id.csv")

In [None]:
#details_full_df[details_full_df["CZ_TIMEZONE"] == "GMT-0"]

Unnamed: 0,EVENT_ID,CZ_FIPS,WFO,CZ_TIMEZONE,BEGIN_TIMESTAMP,END_TIMESTAMP,DEATHS,INJURIES,DAMAGE_PROPERTY,DAMAGE_CROPS,...,BEGIN_LOCATION,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EVENT_NARRATIVE,TORNADO_ID
452,10033097,49,,GMT-0,-57084720,-57084720,0,0,0,0,...,,0.0,,,32.3,-92.75,,,,0


In [None]:
#details_full_df[details_full_df["CZ_TIMEZONE"] == "UNK"]

KeyError: 'CZ_TIMEZONE'

In [16]:
details_full_df.dtypes

EVENT_ID               int64
CZ_FIPS                Int64
WFO                   object
CZ_TIMEZONE           object
BEGIN_TIMESTAMP        int64
END_TIMESTAMP          int64
DEATHS                 int64
INJURIES               int64
DAMAGE_PROPERTY        int64
DAMAGE_CROPS           int64
TOR_F_SCALE           object
TOR_LENGTH           float64
TOR_WIDTH            float64
TOR_OTHER_CZ_FIPS      Int64
TOR_OTHER_WFO         object
BEGIN_RANGE          float64
BEGIN_AZIMUTH         object
BEGIN_LOCATION        object
END_RANGE            float64
END_AZIMUTH           object
END_LOCATION          object
BEGIN_LAT            float64
BEGIN_LON            float64
END_LAT              float64
END_LON              float64
EVENT_NARRATIVE       object
TORNADO_ID             int64
dtype: object

In [121]:
print(len(details_full_df["EVENT_ID"].unique()))
print(len(details_full_df))

78496
78496


In [18]:
#detail_data_clean_pd = detail_data_clean_pd.drop(columns=["MONTH_NAME",  "SOURCE",
#                                                          "MAGNITUDE", "MAGNITUDE_TYPE", "FLOOD_CAUSE", "CATEGORY",
#                                                          "DATA_SOURCE"])
#detail_data_clean_pd.columns

In [19]:
#detail_data_clean_pd["BEGIN_YEARMONTH"] = detail_data_clean_pd["BEGIN_YEARMONTH"].astype(str)
#detail_data_clean_pd["BEGIN_YEARMONTH"]

In [20]:
#pattern = r"(\d{4})"
#detail_data_clean_pd["BEGIN_YEAR"] = detail_data_clean_pd["BEGIN_YEARMONTH"].str.extract(pattern)
#detail_data_clean_pd["BEGIN_YEAR"]

In [21]:
#pattern = r"\d{4}(\d{2})"
#detail_data_clean_pd["BEGIN_MONTH"] = detail_data_clean_pd["BEGIN_YEARMONTH"].str.extract(pattern)
#detail_data_clean_pd["BEGIN_MONTH"]

In [22]:
#detail_data_clean_pd["END_YEARMONTH"] = detail_data_clean_pd["END_YEARMONTH"].astype(str)
#detail_data_clean_pd["END_YEARMONTH"]

In [23]:
#pattern = r"(\d{4})"
#detail_data_clean_pd["END_YEAR"] = detail_data_clean_pd["END_YEARMONTH"].str.extract(pattern)
#detail_data_clean_pd["END_YEAR"]

In [24]:
#pattern = r"\d{4}(\d{2})"
#detail_data_clean_pd["END_MONTH"] = detail_data_clean_pd["END_YEARMONTH"].str.extract(pattern)
#detail_data_clean_pd["END_MONTH"]

In [25]:
#detail_data_clean_pd["BEGIN_YEAR"] = detail_data_clean_pd["BEGIN_YEAR"].astype("Int64")
#detail_data_clean_pd["BEGIN_MONTH"] = detail_data_clean_pd["BEGIN_MONTH"].astype("Int64")
#detail_data_clean_pd["END_YEAR"] = detail_data_clean_pd["END_YEAR"].astype("Int64")
#detail_data_clean_pd["END_MONTH"] = detail_data_clean_pd["END_MONTH"].astype("Int64")
#detail_data_clean_pd = detail_data_clean_pd.drop(columns=["BEGIN_YEARMONTH", "END_YEARMONTH"])

In [26]:
#detail_data_clean_pd["BEGIN_LOC"] = detail_data_clean_pd['BEGIN_RANGE'].astype(str) + " miles " + detail_data_clean_pd['BEGIN_AZIMUTH'] + " of " + detail_data_clean_pd['BEGIN_LOCATION'] + ", " + detail_data_clean_pd['STATE']
#detail_data_clean_pd["BEGIN_LOC"]

In [27]:
#detail_data_clean_pd["END_LOC"] = detail_data_clean_pd['END_RANGE'].astype(str) + " miles " + detail_data_clean_pd['BEGIN_AZIMUTH'] + " of " + detail_data_clean_pd['END_LOCATION'] + ", " + detail_data_clean_pd['STATE']
#detail_data_clean_pd["END_LOC"]