# Write the necessary weather data from the given .csv file

In [147]:
# imports
import pandas as pd
import pyexasol

host = '192.168.56.101/8A3F422F336963EBB2E452E08A6B6E7060A554D85223032A44E9C60F0EFB8544:8563'  # Don't forget to change the fingerprint
user = 'sys'
password = 'exasol'

# Connect to Exasol
conn = pyexasol.connect(dsn=host, 
                        user=user, 
                        password=password, 
                        debug=False, 
                        protocol_version=pyexasol.PROTOCOL_V1)

#### Read in CSV

In [148]:
df = pd.read_csv('./../Data/Stormdata_2006.csv', encoding='iso-8859-1')
print(df.shape)
print(df.columns)

# Convert start and end dates to date time
df['END_DATE_TIME'] = pd.to_datetime(df['END_DATE_TIME'])
df['BEGIN_DATE_TIME'] = pd.to_datetime(df['BEGIN_DATE_TIME'])

print(df.shape)
print("min:", min(df['BEGIN_DATE_TIME']), "\nmax:", max(df['END_DATE_TIME']))

# Remove those ending after our AOL database - Those before could be ok?
df = df[df['END_DATE_TIME'] <= '2006-06-01 00:00:00']

print(df.shape)
print("min:", min(df['BEGIN_DATE_TIME']), "\nmax:", max(df['END_DATE_TIME']))

(48595, 58)
Index(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH',
       'END_DAY', 'END_TIME', 'EPISODE_ID', 'EVENT_ID', 'STATE', 'STATE_FIPS',
       'YEAR', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_FIPS', 'CZ_NAME',
       'WFO', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
       'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
       'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'SOURCE',
       'MAGNITUDE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE',
       'TOR_LENGTH', 'TOR_WIDTH', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE',
       'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'BEGIN_RANGE',
       'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH',
       'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
       'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'LAST_MOD_DATE',
       'LAST_MOD_TIME', 'LAST_CERT_DATE', 'LAST_CERT_TIME', 'LAST_MOD',
       'LAST_CERT', 'ADDCORR_FLG', 'ADDCORR_DATE'],
      dtype='obje

#### Drop irrelevant columns

In [149]:
# Keep: 'BEGIN_DATE_TIME', 'END_DATE_TIME', 'EVENT_TYPE', 'BEGIN_DAY', 'END_DAY', 'BEGIN_YEARMONTH', 'END_YEARMONTH', 'STATE', 'STATE_FIPS', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS' 
keep = ['BEGIN_DATE_TIME', 'END_DATE_TIME', 'EVENT_TYPE', 'BEGIN_DAY', 'END_DAY', 'BEGIN_YEARMONTH', 'END_YEARMONTH', 'STATE', 'STATE_FIPS', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS']

df = df.loc[:, keep]

print(df.columns)
print(df.head)

# Fix year/month and naming oddness
df['BEGIN_MONTH'] = df['BEGIN_YEARMONTH'] - 200600
df['END_MONTH'] = df['END_YEARMONTH'] - 200600
df['FIPS_ST'] = df['STATE_FIPS']  # This contains all the spatial information we need- writing the state name was buggy
df['EVENT_TYPE'] = df['EVENT_TYPE'].astype(str)

df = df.drop(columns= ['BEGIN_YEARMONTH', 'END_YEARMONTH', 'STATE_FIPS', 'STATE'])

print(df.columns)
print(df.head)


Index(['BEGIN_DATE_TIME', 'END_DATE_TIME', 'EVENT_TYPE', 'BEGIN_DAY',
       'END_DAY', 'BEGIN_YEARMONTH', 'END_YEARMONTH', 'STATE', 'STATE_FIPS',
       'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
       'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS'],
      dtype='object')
<bound method NDFrame.head of           BEGIN_DATE_TIME       END_DATE_TIME                EVENT_TYPE  \
0     2006-01-01 00:00:00 2006-01-31 23:59:00                   Drought   
1     2006-01-01 00:00:00 2006-01-31 23:59:00                   Drought   
2     2006-01-01 00:00:00 2006-01-31 23:59:00                   Drought   
3     2006-01-01 00:00:00 2006-01-31 23:59:00                   Drought   
4     2006-01-01 00:00:00 2006-01-31 23:59:00                   Drought   
...                   ...                 ...                       ...   
25729 2006-05-31 21:10:00 2006-05-31 22:15:00  Marine Thunderstorm Wind   
25730 2006-05-31 22:00:00 2006-05-31 22:00:00         Thunderstorm Wind   


#### Clean up the damage cols

In [150]:
def convert_abbreviated_string(value):
    # Check for 'k' (thousand), 'm' (million), 'b' (billion)
    if isinstance(value, str):
        print(value)
        if 'k' in value.lower():
            return float(value.replace('k', '').replace('K', '')) * 1000
        elif 'm' in value.lower():
            return float(value.replace('m', '').replace('M', '')) * 1000000
        elif 'b' in value.lower():
            return float(value.replace('b', '').replace('B', '')) * 1000000000
        else:
            # If no abbreviation, just return the float version of the number
            try:
                return float(value)
            except ValueError:
                return None  # or handle invalid strings as needed
    return value

In [151]:

# Print the unique columns and the one anomaly
print(df[df['DAMAGE_PROPERTY'].notna()]['DAMAGE_PROPERTY'].unique())
print(sum(df[df['DAMAGE_PROPERTY'].notna()]['DAMAGE_PROPERTY']=='K'))

print(df[df['DAMAGE_CROPS'].notna()]['DAMAGE_CROPS'].unique())

['900K' '4.9M' '2M' '3.2M' '4.5M' '700K' '100K' '1M' '15M' '165K' '243K'
 '524K' '623K' '1.2M' '5M' '115B' '104M' '0' '22M' '2.5M' '8.8M' '108M'
 '70K' '20K' '50K' '35K' '36K' '87K' '15K' '175K' '0K' '500K' '200K'
 '800K' '1K' '10K' '2K' '25K' '71K' '1.4M' '5.5M' '.1K' '75K' '5K' '45K'
 '55K' '.5K' '3K' '750K' '4M' '250K' '150K' '350K' '8K' '300K' '400K' '7K'
 '12K' '80K' '2.9M' '600K' '130K' '30K' '90K' '450K' '4K' '1.5K' '65K'
 '60K' '40K' '.25M' '120K' '.1M' '.5M' '160K' '16K' '2.5K' '.05K' '81K'
 '9K' '7.5M' '230K' '125K' '94.5K' '380K' '14K' '1.6M' '6K' '4.5K' '.01K'
 '1.5M' '85K' '11K' '168K' '357K' '.2K' '.25K' '10M' '.21K' '.17K' '8M'
 '19.9M' '30M' '3M' '850K' '50M' '650K' '3.5K' '126K' '265K' '.86K' '27K'
 '34K' '.75K' '190K' '240K' '43K' '210K' '21K' '.3K' '590K' '164K' '1.1M'
 '.04M' '.01M' '.15K' '.85K' '294K' '632K' '245K' '146K' '1.37M' '1.59M'
 '932K' '237K' '975K' '887K' '113K' '1.34M' '1.14M' '1.32M' '109K' '177K'
 '996K' '552K' '100M' '7M' '14.4M' '11.5M' '18K' '25M'

#### Modify the original DF

In [152]:
# Clean Up the damage property/crops data
df['DAMAGE_PROPERTY'] = df[df['DAMAGE_PROPERTY'] == 'K']['DAMAGE_PROPERTY'] = 1000  # Reporting error

# Get the string abbr. out
df['DAMAGE_CROPS'] = df['DAMAGE_CROPS'].replace("", float('nan')).apply(convert_abbreviated_string)
df['DAMAGE_PROPERTY'] = df['DAMAGE_PROPERTY'].replace("", float('nan')).apply(convert_abbreviated_string)


1B
32.5M
3M
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
750K
0
0
0
0
0
0
0
0
10K
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1.3M
0
0K
0
0
0
0
0
0
0
0
1M
0
0
0
0
0
0
0
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1M
0
0
0
0
0
0
0
0
0
10K
10K
0
0
30K
0
0
0
0
0
0
0
0
0
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
50K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0
0
0
0
0
3K
0K
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
15K
0
0
0
0
0
0
500K
300M
0
10K
2K
5K
0
0
0
30K
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0K
0K
0K
0K
0K
0K
0K
0
0
0
0
0K
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
3.9M
1.52M
4.02M
2.5M
0K
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0K
0
0K
0K
0K
0K
0K
1K
112.5K
0
0
0
0
0
0
200M
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
500K
0
0
0
250K
0
0
0
0
5K
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
0K
100K
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [153]:
# Data Types:
df.dtypes

BEGIN_DATE_TIME      datetime64[ns]
END_DATE_TIME        datetime64[ns]
EVENT_TYPE                   object
BEGIN_DAY                     int64
END_DAY                       int64
INJURIES_DIRECT               int64
INJURIES_INDIRECT             int64
DEATHS_DIRECT                 int64
DEATHS_INDIRECT               int64
DAMAGE_PROPERTY               int64
DAMAGE_CROPS                float64
BEGIN_MONTH                   int64
END_MONTH                     int64
FIPS_ST                       int64
dtype: object

#### Create The DB Schema on exasol

In [None]:
create_table_query = '''
CREATE TABLE AOL_SCHEMA.WEATHER_EVENTS (
    BEGIN_DATE_TIME TIMESTAMP,
    END_DATE_TIME TIMESTAMP,
    EVENT_TYPE VARCHAR(100),
    BEGIN_DAY INTEGER,
    END_DAY INTEGER,
    BEGIN_MONTH INTEGER,
    END_MONTH INTEGER,
    FIPS_ST INTEGER,
    INJURIES_DIRECT INTEGER,
    INJURIES_INDIRECT INTEGER,
    DEATHS_DIRECT INTEGER,
    DEATHS_INDIRECT INTEGER,
    DAMAGE_PROPERTY INTEGER,
    DAMAGE_CROPS FLOAT
)
'''
# conn.execute("DROP TABLE AOL_SCHEMA.WEATHER_EVENTS")
conn.execute(create_table_query)



<ExaStatement session_id=1814604986601759713 stmt_idx=2>

In [155]:
query = "DESCRIBE AOL_SCHEMA.WEATHER_EVENTS"  # Replace with your schema and table name
result = conn.execute(query)

# Fetch and print the results
for row in result:
    print(row)

('BEGIN_DATE_TIME', 'TIMESTAMP', 'TRUE', 'FALSE')
('END_DATE_TIME', 'TIMESTAMP', 'TRUE', 'FALSE')
('EVENT_TYPE', 'VARCHAR(100) UTF8', 'TRUE', 'FALSE')
('BEGIN_DAY', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('END_DAY', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('BEGIN_MONTH', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('END_MONTH', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('FIPS_ST', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('INJURIES_DIRECT', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('INJURIES_INDIRECT', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('DEATHS_DIRECT', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('DEATHS_INDIRECT', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('DAMAGE_PROPERTY', 'DECIMAL(18,0)', 'TRUE', 'FALSE')
('DAMAGE_CROPS', 'DOUBLE', 'TRUE', 'FALSE')


#### Write the data to the database

In [156]:
df = df.where(pd.notnull(df), None)
#df = df.applymap(lambda x: x.encode('utf-8').decode('utf-8') if isinstance(x, str) else x)

conn.import_from_pandas(df, table=('AOL_SCHEMA', 'WEATHER_EVENTS'))