# Storm Data Analysis

In [None]:
# Read in data
import pandas as pd

# Load the 2006 storm-events CSV, decoding it as ISO-8859-1
storm_csv = './../Data/Stormdata_2006.csv'
df = pd.read_csv(storm_csv, encoding='iso-8859-1')

# Quick sanity check: dimensions first, then the column names
for summary in (df.shape, df.columns):
    print(summary)

(48595, 58)
Index(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH',
       'END_DAY', 'END_TIME', 'EPISODE_ID', 'EVENT_ID', 'STATE', 'STATE_FIPS',
       'YEAR', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_FIPS', 'CZ_NAME',
       'WFO', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
       'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
       'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'SOURCE',
       'MAGNITUDE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE',
       'TOR_LENGTH', 'TOR_WIDTH', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE',
       'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'BEGIN_RANGE',
       'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH',
       'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
       'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'LAST_MOD_DATE',
       'LAST_MOD_TIME', 'LAST_CERT_DATE', 'LAST_CERT_TIME', 'LAST_MOD',
       'LAST_CERT', 'ADDCORR_FLG', 'ADDCORR_DATE'],
      dtype='obje

In [30]:
# Convert start and end dates to datetime so they can be compared and aggregated
df['END_DATE_TIME'] = pd.to_datetime(df['END_DATE_TIME'])
df['BEGIN_DATE_TIME'] = pd.to_datetime(df['BEGIN_DATE_TIME'])

print(df.shape)
# Series.min()/.max() are vectorized and skip NaT; the Python builtins iterate
# element by element and give order-dependent results when NaT is present.
print("min:", df['BEGIN_DATE_TIME'].min(), "\nmax:", df['END_DATE_TIME'].max())

# Remove those ending after our AOL database - Those before could be ok?
# The cutoff is inclusive: an event ending exactly at 2006-06-01 00:00:00 is kept.
# Rows whose END_DATE_TIME is NaT compare False and are therefore dropped too.
df = df[df['END_DATE_TIME'] <= '2006-06-01 00:00:00']

print(df.shape)
print("min:", df['BEGIN_DATE_TIME'].min(), "\nmax:", df['END_DATE_TIME'].max())

(48595, 58)
min: 2006-01-01 00:00:00 
max: 2006-10-27 21:00:00
(25734, 58)
min: 2006-01-01 00:00:00 
max: 2006-05-31 23:59:00


#### ROLAP Query possibilities:

# Count the number of queries on each day, month
```
-- COUNT(*) counts the joined rows in each group (SUM(*) is not valid SQL).
-- ROLLUP adds per-month subtotals and a grand total to the per-day counts.
-- NOTE(review): the join matches on day-of-month only, and each query row is
-- counted once per storm event beginning on that day - confirm this is intended.
SELECT T.BEGIN_MONTH, T.BEGIN_DAY, COUNT(*)
FROM Storm_Table T, QUERY_TIME_TABLE Q
WHERE Q.Day = T.BEGIN_DAY AND Q.Query IS NOT NULL
GROUP BY ROLLUP (T.BEGIN_MONTH, T.BEGIN_DAY)
```

# Count the number of storm events beginning on each day, month
```
-- COUNT(*) tallies the storm events in each group (SUM(*) is not valid SQL).
-- ROLLUP adds per-month subtotals and a grand total to the per-day counts.
SELECT T.BEGIN_MONTH, T.BEGIN_DAY, COUNT(*)
FROM Storm_Table T
GROUP BY ROLLUP (T.BEGIN_MONTH, T.BEGIN_DAY)
```


# Obtain the Queries that occur during a natural disaster
```
-- Keep each (query, timestamp) pair that falls inside at least one storm event.
-- The time-window test is a row-level join predicate, so it belongs in WHERE:
-- HAVING may only reference grouped or aggregated columns, and T.* is neither.
-- GROUP BY collapses duplicates when a query overlaps several events.
SELECT Q.Query, Q.DATE_TIME
FROM Storm_Table T, QUERY_TIME_TABLE Q
WHERE
    Q.Query IS NOT NULL
    AND Q.DATE_TIME BETWEEN T.BEGIN_DATE_TIME AND T.END_DATE_TIME
GROUP BY
    Q.Query,
    Q.DATE_TIME
```

# Obtain the Queries that occur during a Tornado
```
-- Keep each (query, timestamp) pair that falls inside at least one tornado event.
-- The time-window test is a row-level join predicate, so it belongs in WHERE:
-- HAVING may only reference grouped or aggregated columns, and T.* is neither.
-- GROUP BY collapses duplicates when a query overlaps several events.
SELECT Q.Query, Q.DATE_TIME
FROM 
    QUERY_TIME_TABLE Q,
    (
        SELECT *
        FROM Storm_Table N
        WHERE N.EVENT_TYPE = 'Tornado'  -- Can change this line to look at those with damage high cost or injuries too
    ) T
WHERE 
    Q.Query IS NOT NULL
    AND Q.DATE_TIME BETWEEN T.BEGIN_DATE_TIME AND T.END_DATE_TIME
GROUP BY
    Q.Query,
    Q.DATE_TIME
```




#### Specific Question:
During the event (and for 2 weeks afterwards), are people searching for things related to the event? (How do we find the keywords associated with it? For example, any queries containing the 'event type' string?)

#### To find the similarity between queries and events:
We could try to embed all of the tornado event descriptions and then look at the similarity of the query embeddings to those (either to the average embedding or via pairwise distances), examining the English plain text of the queries whose embeddings are most similar to the description embeddings.

Also, we could look at the URLs clicked and see if any of them are related to the weather reporting agency listed in the `SOURCE` column of the events.

In [50]:
# More Exploration

# Column names and dimensions of the current frame
print(df.columns)
print(df.shape)

# Distinct event types: first how many there are, then which ones
event_types = df['EVENT_TYPE']
print(event_types.nunique())  # 36 unique event types
print(event_types.unique())

# Most seem to be boring, look at wildfires & tornadoes.
# Build each subset once and reuse it for the counts printed below.
tornado_df = df[event_types == 'Tornado']
wildfire_df = df[event_types == 'Wildfire']

print("Tornadoes: ", len(tornado_df))  # number of tornado events (705)
print("Wildfires: ", len(wildfire_df))  # number of wildfire events (191)



Index(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH',
       'END_DAY', 'END_TIME', 'EPISODE_ID', 'EVENT_ID', 'STATE', 'STATE_FIPS',
       'YEAR', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_FIPS', 'CZ_NAME',
       'WFO', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
       'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
       'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'SOURCE',
       'MAGNITUDE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE',
       'TOR_LENGTH', 'TOR_WIDTH', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE',
       'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'BEGIN_RANGE',
       'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH',
       'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
       'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'LAST_MOD_DATE',
       'LAST_MOD_TIME', 'LAST_CERT_DATE', 'LAST_CERT_TIME', 'LAST_MOD',
       'LAST_CERT', 'ADDCORR_FLG', 'ADDCORR_DATE'],
      dtype='object')
(25734,

In [None]:
# Remove every column that is entirely NA, separately for each event-type frame

pre_report = f"Pre shapes:\n\nTornado: {tornado_df.shape}\nWildfire: {wildfire_df.shape}\n"
print(pre_report)

# how='all' only drops a column when *every* value in it is missing
tornado_df, wildfire_df = [
    event_df.dropna(axis='columns', how='all')
    for event_df in (tornado_df, wildfire_df)
]

post_report = f"Post shapes:\n\nTornado: {tornado_df.shape}\nWildfire: {wildfire_df.shape}"
print(post_report)

Pre shapes:

Tornado: (705, 58)
Wildfire: (191, 58)
Post shapes:

Tornado: (705, 44)
Wildfire: (191, 29)


In [53]:
# Show the columns each event-type frame still has
for event_df in (tornado_df, wildfire_df):
    print(event_df.columns)

# Write each frame to its own CSV in the data directory, without the row index
for event_df, out_path in (
    (tornado_df, './../Data/TornadoEventData.csv'),
    (wildfire_df, './../Data/WildfireEventData.csv'),
):
    event_df.to_csv(out_path, index=False)


Index(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH',
       'END_DAY', 'END_TIME', 'EPISODE_ID', 'EVENT_ID', 'STATE', 'STATE_FIPS',
       'YEAR', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_FIPS', 'CZ_NAME',
       'WFO', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
       'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
       'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'SOURCE',
       'MAGNITUDE', 'MAGNITUDE_TYPE', 'TOR_F_SCALE', 'TOR_LENGTH', 'TOR_WIDTH',
       'BEGIN_RANGE', 'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE',
       'END_AZIMUTH', 'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT',
       'END_LON', 'EPISODE_NARRATIVE', 'EVENT_NARRATIVE'],
      dtype='object')
Index(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH',
       'END_DAY', 'END_TIME', 'EPISODE_ID', 'EVENT_ID', 'STATE', 'STATE_FIPS',
       'YEAR', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_FIPS', 'CZ_NAME',
       'WFO', 'BEGIN_DATE_TIME', 'CZ_TIMEZON