In [1]:
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt
# import seaborn as sns
import os
import datetime as dt
from shapely import wkt
from shapely.geometry import Point, Polygon
import geopandas as gpd
import rtree, pygeos, fiona
# import netCDF4
# import xarray as xr
# import dask

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'



In [2]:
# import re
# import pyproj

In [3]:
from import_functions import readin_and_subset_modis, get_folder_contents, get_modis_filedata
from modis_cleaning import clean_modis, add_consec_months_by_year, aggregate_by_county
from utils import REPO_PATH, DATA_PATH, DATA_CLEAN_PATH
from utils import GEO_CRS, PROJ_CRS
from utils import set_plt_settings, split_dataframe

# Auto-reload the imported project modules before each cell runs, so edits to them are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [4]:
# Apply the plotting settings from utils.set_plt_settings before any figures are drawn.
set_plt_settings()

# Read in county shapefiles

In [5]:
# Load the county shapefile, reproject it to the project's geographic CRS,
# and lowercase every column name.
c_filepath = str(DATA_PATH / 'county_shapefiles/cb_2018_us_county_5m.shp')
counties = (
    gpd.read_file(c_filepath)
    .to_crs(GEO_CRS)
    .rename(columns=str.lower)
)

# Read and clean MODIS

In [6]:
# Each MODIS product is a (data sub-folder, property string passed to the reader) pair.
# Declaring the pairs together keeps the two tuples aligned.
folders, prop_strs = zip(
    ('lst', 'LST_Day_CMG'),
    ('ndvi', 'CMG 0.05 Deg Monthly NDVI'),
)
# To debug a single product, pick one pair by position, e.g.:
#   folder, prop_str = folders[1], prop_strs[1]

In [7]:
# Loop through the MODIS datasets: read + clean each product, collapse it to
# annual lat/lon statistics chunk by chunk, aggregate to counties, and stack the
# county-level results of all products into `modis`.
#
# Frames are collected in lists and concatenated once, instead of growing a
# DataFrame with pd.concat inside the loops (which re-copies all accumulated
# rows on every iteration and concatenates against an empty seed frame).
dfs = {}            # folder -> (cleaned, annual lat/lon, annual county) frames
county_frames = []  # one annual county-level frame per product
for folder, prop_str in zip(folders, prop_strs):
    print(f'====== {folder} =====')
    folderpath = DATA_PATH / folder

    # read and clean
    print('reading and cleaning...')
    df_raw, failed_loads = readin_and_subset_modis(folderpath=folderpath, prop_str=prop_str)
    print(f'failed to load {len(failed_loads)} files: {failed_loads}')
    df = clean_modis(df_raw)

    # Work in chunks to bound memory use.
    # NOTE(review): presumably split_dataframe splits on 'lat' so that every
    # (lat, lon) group stays inside a single chunk -- confirm in utils.
    chunks = split_dataframe(df, 'lat', chunk_pct=0.05)
    print('calculating consecutive trends and collapsing to annual data (by chunk)...')
    annual_chunks = []
    for chunk in tqdm(chunks):
        # calculate consecutive trends
        df_chunk = add_consec_months_by_year(df=chunk, prop_col='prop', mean_col='mean', group_cols=['lat', 'lon'], year_col='year')

        # collapse to annual: one row per (lat, lon, year)
        df_chunk_a = (df_chunk
             .groupby(['lat', 'lon', 'year'])[['prop', 'mbm', 'dbm', 'consec_mbm']]
             .agg({'prop':['mean','std', 'min', 'max'], 'mbm':'sum', 'dbm':'sum', 'consec_mbm':'max'}))
        # flatten the (column, statistic) MultiIndex into names like 'prop_mean'
        df_chunk_a.columns = ['_'.join(col).strip() for col in df_chunk_a.columns.values]
        df_chunk_a = df_chunk_a.reset_index()
        annual_chunks.append(df_chunk_a)

    # Reversed so the row order matches the previous "prepend the newest chunk" behaviour.
    df_a = pd.concat(annual_chunks[::-1]) if annual_chunks else pd.DataFrame()

    # collapse by county
    print('collapsing to county-level data...')
    df_ac = aggregate_by_county(df_a, counties)
    df_ac['value'] = folder  # tag rows with the product they came from

    # stack
    dfs[folder] = (df, df_a, df_ac) # keep every stage in case anything happens
    county_frames.append(df_ac)
    print(f'done with {folder}!')

# annual-county stacked; reversed to keep the previous ordering (last product first)
modis = pd.concat(county_frames[::-1]) if county_frames else pd.DataFrame()

reading and cleaning...
files to read: 263


100%|██████████████████████████████████████████████| 263/263 [2:34:31<00:00, 35.25s/it]


failed to load 0 files: []
Missing values (ocean) dropped: 49134904 (68.86%)
Outlier threshold: [45345.]
Outliers dropped: 0 (0.00%)
calculating consecutive trends and collapsing to annual data (by chunk)...


100%|█████████████████████████████████████████████████| 21/21 [50:56<00:00, 145.54s/it]


collapsing to county-level data...
done with lst!
reading and cleaning...
files to read: 263


100%|██████████████████████████████████████████████| 263/263 [2:31:55<00:00, 34.66s/it]


failed to load 2 files: ['MOD13C2.A2000183.006.2015147120151.hdf', 'MOD13C2.A2000245.006.2015147121321.hdf']
Missing values (ocean) dropped: 52714406 (66.34%)
Outlier threshold: [18801.]
Outliers dropped: 0 (0.00%)
calculating consecutive trends and collapsing to annual data (by chunk)...


100%|█████████████████████████████████████████████████| 21/21 [55:56<00:00, 159.86s/it]


collapsing to county-level data...
done with ndvi!


# Write data to computer

In [8]:
# Persist the stacked annual county-level dataset as CSV (row index omitted),
# creating the clean-data directory first if it does not exist yet.
filename = 'modis_cln.csv'
filepath = DATA_CLEAN_PATH / filename
DATA_CLEAN_PATH.mkdir(parents=True, exist_ok=True)
modis.to_csv(filepath, index=False)

In [9]:
# Persist the annual lat/lon-level frame of every product (element 1 of each
# tuple stored in `dfs`) as its own CSV, without the row index.
for k, stages in dfs.items():
    filename = f'modis_annual_{k}.csv'
    filepath = DATA_CLEAN_PATH / filename
    DATA_CLEAN_PATH.mkdir(parents=True, exist_ok=True)
    stages[1].to_csv(filepath, index=False)

# Visualize

In [17]:
# Plot parameters: product, year, and the annual statistic used for colouring
prop, year = 'lst', 2020
val = 'prop_mean'
title = f'MODIS: Land surface temperature, {year}'

# Annual lat/lon-level frame of the chosen product, restricted to the chosen year
vdf = dfs[prop][1].loc[lambda frame: frame['year'] == year]

In [None]:
# Visualize: one point per (lat, lon) grid cell, coloured by the `val` column.
# points_from_xy builds the geometries in one vectorised call instead of
# creating a Point object per row in a Python-level loop.
geometry = gpd.points_from_xy(vdf['lon'], vdf['lat'])
gp = gpd.GeoDataFrame(vdf, crs=GEO_CRS, geometry=geometry)
fig, ax = plt.subplots(figsize=(20,20))
gp.plot(ax=ax, column=val, markersize=0.1, legend=True)
# Set the title on the explicit axes rather than on pyplot's "current" axes.
ax.set_title(title)
plt.show()

  uniques = Index(uniques)


In [19]:
# Plot parameters: product, year, and the annual statistic used for colouring
prop, year = 'ndvi', 2020
val = 'prop_mean'
title = f'MODIS: NDVI, {year}'

# Annual lat/lon-level frame of the chosen product, restricted to the chosen year
vdf = dfs[prop][1].loc[lambda frame: frame['year'] == year]

In [20]:
# Visualize: one point per (lat, lon) grid cell, coloured by the `val` column.
# points_from_xy builds the geometries in one vectorised call instead of
# creating a Point object per row in a Python-level loop.
# NOTE(review): the recorded output of this cell ends in
# "ValueError: Image size of 1078x1654183 pixels is too large"; the cause is not
# evident from this code -- investigate before relying on this figure.
geometry = gpd.points_from_xy(vdf['lon'], vdf['lat'])
gp = gpd.GeoDataFrame(vdf, crs=GEO_CRS, geometry=geometry)
fig, ax = plt.subplots(figsize=(20,10))
gp.plot(ax=ax, column=val, markersize=0.1, legend=True)
# Set the title on the explicit axes rather than on pyplot's "current" axes.
ax.set_title(title)
plt.show()

  uniques = Index(uniques)


ValueError: Image size of 1078x1654183 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 1440x720 with 1 Axes>