# Processing

# EXPLORATORY DATA ANALYSIS
1) Describe the Data
2) Trends
3) Summary tables
4) Drop unneeded columns
5) Drop duplicates
6) Drop outliers

# Prepare the EDA Environment

In [1]:
import os
import warnings
import netCDF4 as nc
import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import holoviews as hv
import xarray as xr
import rasterio
import ipywidgets as widgets
import regex as re
import chardet
import holoviews as hv
from geopandas import GeoDataFrame
from shapely.geometry import Point
from shapely.geometry import LineString
from rasterio.transform import from_origin
from rasterstats import zonal_stats
from matplotlib.path import Path
from matplotlib.colors import Normalize
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from netCDF4 import Dataset
from pyproj import CRS
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from bokeh.io import output_notebook, show
from bokeh.resources import INLINE
from windbreaks_helpers import *

ImportError: cannot import name 'Image' from 'PIL' (unknown location)

In [4]:
warnings.filterwarnings('ignore')

In [5]:
# Set up Bokeh to display plots inline in the notebook.
# INLINE is the resources object imported from bokeh.resources.
output_notebook(INLINE)
# Select 'bokeh' as the HoloViews plotting extension.
hv.extension('bokeh')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [6]:
# Helper not defined in this notebook (presumably supplied by the
# windbreaks_helpers star import — confirm); called here for its printed output.
print_modules()

In [7]:
# # Diagnostic only!! Very long print output.
#print_all_imported_modules()

In [8]:
# Helper not defined in this notebook (presumably supplied by the
# windbreaks_helpers star import — confirm); called here for its printed output.
print_functions()

In [9]:
# # Recover previously stored src_dir
# %store -r src_dir
# print('The Data source directory is:', src_dir)

# If src_dir is not set, then comment out above, uncomment below and run, again
# Path to the directory
directory = 'Data'

# Check if the directory exists
if os.path.isdir(directory):
    src_dir = directory    
else:
    src_dir = '/data/workspace_files/'

%store src_dir
print(src_dir)

In [10]:
# %store -r extent_coords

# # Project extents
extent_coords = {'min_lat': 36.998665, 'max_lat': 37.734463, 'min_lon': -95.964735, 'max_lon': -94.616789}
%store extent_coords
print('The project extents are: ', extent_coords)

# County Boundaries

In [12]:
# Read the six-county boundary shapefile and reproject it to EPSG:6469.
sixco_fn = os.path.join(src_dir, 'GIS_files/KS_six_co_bo.shp')
sixco_data = gpd.read_file(sixco_fn).to_crs('EPSG:6469')

# Report the CRS through the get_crs helper, then print the GeoDataFrame itself.
get_crs(sixco_data)
print(sixco_data)

# Storm Event Data

In [14]:
# Load the raw storm-event table from CSV.
se_df = pd.read_csv(os.path.join(src_dir, 'Storm_event/StormEvents_all.csv'))
output_shp_file = os.path.join(src_dir, 'Storm_event/StormEvents_all.shp')
# gdf_to_shp is an external helper; presumably it writes se_df to
# output_shp_file restricted to extent_coords — TODO confirm against its definition.
gdf_to_shp(se_df, output_shp_file, extent_coords)
# Read the shapefile back and label it EPSG:4269. set_crs only assigns the
# CRS (allow_override replaces any CRS already attached); it does not reproject.
se_gdf = gpd.read_file(output_shp_file)
se_gdf = se_gdf.set_crs("EPSG:4269", allow_override=True)
print('Set the crs based on original shapefile')
get_crs(se_gdf)
print('Project to Kansas South EPSG:6469')
# Reproject the coordinates from EPSG:4269 to EPSG:6469.
se_gdf = se_gdf.to_crs('EPSG:6469')
print('Write the projected shapefile')
# Write the projected shapefile
proj_shp_file = os.path.join(src_dir, 'Storm_event/StormEvents_all_proj.shp')
se_gdf.to_file(proj_shp_file)
# Read the projected shapefile (se_gdf now reflects what is stored on disk)
se_gdf = gpd.read_file(proj_shp_file)
get_crs(se_gdf)
# print(se_gdf)
# Show the first rows as the cell output.
se_gdf.head()

In [15]:
# Create a new map instance with the 'tmerc' projection
# NOTE(review): no corner coordinates or width/height are supplied; confirm that
# Basemap accepts a 'tmerc' map defined only by lat_0/lon_0/k_0.
m = Basemap(projection='tmerc', lat_0=0, lon_0=0, k_0=0.9996)

# Draw coastlines, countries, states, and counties
# NOTE(review): these calls run before the figure below is created, so they
# presumably draw on a different figure than the shapes — verify the output.
m.drawcoastlines()
m.drawcountries()
m.drawstates()
m.drawcounties()

# Load the shapes: reproject to EPSG:6469 and scale both axes by 0.001 about (0, 0)
# NOTE(review): these scaled EPSG:6469 coordinates are not passed through the
# Basemap instance's own projection — confirm both layers share a coordinate system.
storm_events = se_gdf.to_crs(6469).geometry.scale(xfact=0.001, yfact=0.001, origin=(0,0))
ks_six_counties = sixco_data.to_crs(6469).geometry.scale(xfact=0.001, yfact=0.001, origin=(0,0))

# Create a new figure and axis
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# Plot the storm events in red, with an arrow along the final segment
for line in storm_events:
    x, y = line.xy
    x = list(x)
    y = list(y)
    m.plot(x, y, color='red')
    # An arrow needs two vertices; skip it for single-coordinate geometries
    # instead of failing on x[-2].
    if len(x) >= 2:
        ax.quiver(x[-2], y[-2], [x[-1]-x[-2]], [y[-1]-y[-2]], scale_units='xy', angles='xy', scale=1, color='red')

# Plot the county polygons as beige patches with a brown outline.
# Line artists do not accept an 'edgecolor' keyword, so a filled patch is used.
for polygon in ks_six_counties:
    # Multi-part geometries expose their parts via .geoms; a plain polygon is
    # treated as a single part.
    for part in getattr(polygon, 'geoms', [polygon]):
        x, y = part.exterior.xy
        ax.fill(list(x), list(y), facecolor='beige', edgecolor='brown')

plt.show()

# Crop Loss Data

In [None]:
# Load the crop-loss table from CSV.
crop_data = pd.read_csv(os.path.join(src_dir, 'crop_loss_COL/CropData_all.csv'))
# Preview only the first rows; requesting thousands of rows for display adds
# nothing to the rendered output.
crop_data.head(10)

In [None]:
# Both tables are summarised per (YEAR, FIPS) and then joined on those keys.
# NOTE(review): se_gdf was read back from a shapefile; confirm that its
# YEAR/FIPS dtypes still match those of crop_data before relying on the join.
merge_keys = ['YEAR', 'FIPS']

# Number of non-null EVENT_ID values per year and county.
storm_grouped = (
    se_gdf.groupby(merge_keys)['EVENT_ID']
    .count()
    .rename('STORM_COUNT')
    .reset_index()
)

# Summed indemnity per year and county.
crop_grouped = (
    crop_data.groupby(merge_keys)['indemnity']
    .sum()
    .rename('TOTAL_INDEMNITY')
    .reset_index()
)

# Keep only the year/county pairs present in both summaries.
aligned_data = storm_grouped.merge(crop_grouped, on=merge_keys, how='inner')
print(aligned_data)

In [None]:


# Pairwise correlation matrix of storm counts and indemnity totals
indicator_cols = ['STORM_COUNT', 'TOTAL_INDEMNITY']
correlation = aligned_data.loc[:, indicator_cols].corr()
correlation.head()

In [None]:
# Diagnostic: list the variables currently defined in the interactive namespace.
%whos

# Visualization

In [None]:
# Scatter of storm counts against indemnity totals, with a fitted trend line
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
xy_cols = dict(x='STORM_COUNT', y='TOTAL_INDEMNITY')

# Raw points first, then the regression line only (no second scatter layer)
sns.scatterplot(data=aligned_data, ax=scatter_ax, **xy_cols)
sns.regplot(data=aligned_data, scatter=False, color='red', ax=scatter_ax, **xy_cols)

# Title and axis labels
scatter_ax.set_title('Correlation between Storm Count and Total Indemnity')
scatter_ax.set_xlabel('Storm Count')
scatter_ax.set_ylabel('Total Indemnity')

plt.show()

In [None]:
# One figure with two side-by-side panels, one box plot per variable
box_fig, box_axes = plt.subplots(1, 2, figsize=(12, 6))
panels = [
    ('STORM_COUNT', 'Box Plot of Storm Count'),
    ('TOTAL_INDEMNITY', 'Box Plot of Total Indemnity'),
]

for panel_ax, (column, panel_title) in zip(box_axes, panels):
    sns.boxplot(data=aligned_data, y=column, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Tighten the spacing and render
box_fig.tight_layout()
plt.show()

In [None]:
# Works on the aligned_data frame built by the earlier merge cell.

# Standardize both columns with StandardScaler.
# The scaled values are written back into aligned_data itself, so every cell
# that reads aligned_data after this one sees the standardized values.
scaled_cols = ['STORM_COUNT', 'TOTAL_INDEMNITY']
scaler = StandardScaler()
aligned_data[scaled_cols] = scaler.fit_transform(aligned_data[scaled_cols])

# Long format (one row per variable/value pair) for a grouped seaborn box plot
melted_data = pd.melt(
    aligned_data,
    value_vars=scaled_cols,
    var_name='Variable',
    value_name='Value',
)

# Box plot of both standardized variables on a shared axis
std_fig, std_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=melted_data, x='Variable', y='Value', ax=std_ax)
std_ax.set_title('Box Plot of Standardized Storm Count and Total Indemnity')

plt.show()

In [None]:
# Line plot of both columns against YEAR.
# NOTE(review): aligned_data comes from a per-(YEAR, FIPS) grouping, so YEAR
# may repeat in the index and one line may run through several FIPS rows per
# year — confirm this is the intended view.
yearly_view = aligned_data.set_index('YEAR')[['STORM_COUNT', 'TOTAL_INDEMNITY']]
trend_ax = yearly_view.plot(marker='o')
trend_ax.set_ylabel('Value')
trend_ax.set_title('Storm Count and Total Indemnity over Years')
plt.show()

In [None]:
# Bar charts of both columns against YEAR, one subplot per column.
# With subplots=True, DataFrame.plot returns one Axes per column.
bar_axes = aligned_data.set_index('YEAR')[['STORM_COUNT', 'TOTAL_INDEMNITY']].plot(kind='bar', subplots=True)

# plt.ylabel / plt.title act on the current axes only, which would label just
# one panel and replace its per-column title. Label every panel explicitly and
# put the overall title on the figure instead.
for bar_ax in bar_axes:
    bar_ax.set_ylabel('Value')
bar_axes[0].figure.suptitle('Storm Count and Total Indemnity per Year')
plt.show()