In [1]:
import os
from datetime import date
from urllib.parse import quote_plus

import pandas as pd
import pymysql
from sqlalchemy import create_engine, inspect

pymysql.install_as_MySQLdb()

### Extract CSV of Northern Hemisphere Ice Extents to pandas Dataframe

In [2]:
# Northern-hemisphere daily file. header=1 makes the file's second line the
# header row (the line above it is skipped); names= then replaces that
# header with the column names used throughout this notebook.
csv_file1 = "./Resources/noaa/G02135/north/daily/data/N_seaice_extent_daily_v3.0.csv"
north_columns = ['year', 'month', 'day', 'north_icearea_sq_km', 'missing', 'source_data']
n_seaice_df = pd.read_csv(csv_file1, header=1, names=north_columns)
n_seaice_df.head()

Unnamed: 0,year,month,day,north_icearea_sq_km,missing,source_data
0,1978,10,26,10.231,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
1,1978,10,28,10.42,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
2,1978,10,30,10.557,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
3,1978,11,1,10.67,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
4,1978,11,3,10.777,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...


### Extract CSV of Southern Hemisphere Ice Extents to pandas Dataframe

In [3]:
# Southern-hemisphere daily file. header=1 makes the file's second line the
# header row (the line above it is skipped); names= then replaces that
# header with the column names used throughout this notebook.
csv_file2 = './Resources/noaa/G02135/south/daily/data/S_seaice_extent_daily_v3.0.csv'
south_columns = ['year', 'month', 'day', 'south_icearea_sq_km', 'missing', 'source_data']
s_seaice_df = pd.read_csv(csv_file2, header=1, names=south_columns)
s_seaice_df.head()

Unnamed: 0,year,month,day,south_icearea_sq_km,missing,source_data
0,1978,10,26,17.624,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
1,1978,10,28,17.803,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
2,1978,10,30,17.67,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
3,1978,11,1,17.527,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
4,1978,11,3,17.486,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...


### Confirm Data types

In [4]:
# Display the dtype pandas inferred for every column of the southern frame.
# year / month / day are passed to datetime.date() in the next step, so they
# should show up as integer columns here.
s_seaice_df.dtypes

year                     int64
month                    int64
day                      int64
south_icearea_sq_km    float64
missing                float64
source_data             object
dtype: object

### Create inline function to combine first three columns to one "date_time" column

In [5]:
# Build one datetime.date per row from the year / month / day columns and
# store the result as a new "date_time" column on the northern frame.
north_dates = [
    date(yr, mo, dy)
    for yr, mo, dy in zip(n_seaice_df.year, n_seaice_df.month, n_seaice_df.day)
]
n_seaice_df["date_time"] = north_dates

n_seaice_df.head()

Unnamed: 0,year,month,day,north_icearea_sq_km,missing,source_data,date_time
0,1978,10,26,10.231,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-26
1,1978,10,28,10.42,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-28
2,1978,10,30,10.557,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-30
3,1978,11,1,10.67,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-01
4,1978,11,3,10.777,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-03


In [6]:
# Build the southern "date_time" column from the southern frame's OWN
# year / month / day values. Deriving it from n_seaice_df (as this cell
# previously did) aligns by row index, so any row where the two files differ
# would be stamped with the northern date, or NaN, and then join against
# the wrong observation in the merge further down.
s_seaice_df["date_time"] = s_seaice_df.apply(lambda row :
                          date(row.year, row.month, row.day),
                          axis=1
                          )

s_seaice_df.head()

Unnamed: 0,year,month,day,south_icearea_sq_km,missing,source_data,date_time
0,1978,10,26,17.624,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-26
1,1978,10,28,17.803,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-28
2,1978,10,30,17.67,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-30
3,1978,11,1,17.527,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-01
4,1978,11,3,17.486,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-03


### Convert the "date_time" column to pandas datetime64 dtype, which the yearly grouping below requires

In [7]:
# Re-type "date_time" from Python date objects to pandas datetime64 so the
# time-based grouping later in the notebook can operate on it.
north_timestamps = pd.to_datetime(n_seaice_df["date_time"])
n_seaice_df = n_seaice_df.assign(date_time=north_timestamps)
n_seaice_df.head()

Unnamed: 0,year,month,day,north_icearea_sq_km,missing,source_data,date_time
0,1978,10,26,10.231,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-26
1,1978,10,28,10.42,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-28
2,1978,10,30,10.557,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-30
3,1978,11,1,10.67,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-01
4,1978,11,3,10.777,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-03


In [8]:
# Re-type "date_time" from Python date objects to pandas datetime64 so the
# time-based grouping later in the notebook can operate on it.
south_timestamps = pd.to_datetime(s_seaice_df["date_time"])
s_seaice_df = s_seaice_df.assign(date_time=south_timestamps)
s_seaice_df.head()

Unnamed: 0,year,month,day,south_icearea_sq_km,missing,source_data,date_time
0,1978,10,26,17.624,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-26
1,1978,10,28,17.803,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-28
2,1978,10,30,17.67,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-30
3,1978,11,1,17.527,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-01
4,1978,11,3,17.486,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-03


### Merge the northern and southern dataframes

In [9]:
# Join the two hemispheres on the observation date. An outer join keeps dates
# that appear in only one of the files; a left join would silently discard
# southern observations on days the northern series is missing, which could
# lower the southern yearly maximum computed below. Columns present in both
# frames (year, month, day, missing, source_data) get the _x (north) and
# _y (south) suffixes.
merged_df = n_seaice_df.merge(s_seaice_df, on="date_time", how="outer")
merged_df.head()


Unnamed: 0,year_x,month_x,day_x,north_icearea_sq_km,missing_x,source_data_x,date_time,year_y,month_y,day_y,south_icearea_sq_km,missing_y,source_data_y
0,1978,10,26,10.231,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-26,1978,10,26,17.624,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
1,1978,10,28,10.42,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-28,1978,10,28,17.803,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
2,1978,10,30,10.557,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-10-30,1978,10,30,17.67,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
3,1978,11,1,10.67,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-01,1978,11,1,17.527,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...
4,1978,11,3,10.777,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...,1978-11-03,1978,11,3,17.486,0.0,['ftp://sidads.colorado.edu/pub/DATASETS/nsid...


### Extract only the required columns from the merged dataframe

In [10]:
# Keep only the date and the two per-hemisphere measurement columns; the
# suffixed year/month/day/missing/source_data columns are not needed further.
wanted_columns = ['date_time', 'north_icearea_sq_km', 'south_icearea_sq_km']
final_merged_df = merged_df.loc[:, wanted_columns]
final_merged_df.head()

Unnamed: 0,date_time,north_icearea_sq_km,south_icearea_sq_km
0,1978-10-26,10.231,17.624
1,1978-10-28,10.42,17.803
2,1978-10-30,10.557,17.67
3,1978-11-01,10.67,17.527
4,1978-11-03,10.777,17.486


### Group the rows by the calendar year of "date_time" and aggregate the north and south columns by the max value

In [11]:
# Bucket rows into calendar years and take the largest north / south value in
# each bucket. The resulting "date_time" is the year-end label of the bucket
# (e.g. 1978-12-31), not the day on which the maximum occurred.
yearly_bins = pd.Grouper(key='date_time', freq='Y')
column_maxima = {'north_icearea_sq_km' : 'max', 'south_icearea_sq_km' : 'max'}
sorted_df = (
    final_merged_df
    .groupby(yearly_bins)
    .agg(column_maxima)
    .reset_index()
)
sorted_df.head()

Unnamed: 0,date_time,north_icearea_sq_km,south_icearea_sq_km
0,1978-12-31,14.585,17.803
1,1979-12-31,16.635,18.361
2,1980-12-31,16.302,19.092
3,1981-12-31,15.801,18.856
4,1982-12-31,16.325,18.55


### Connect to local database

In [12]:
# Connection settings are read from the environment so real credentials never
# have to be committed with the notebook. The fallbacks are the original
# local-development values, so the cell behaves as before when nothing is set.
# NOTE(review): the fallback password is still visible here; remove it once
# every environment that runs this notebook sets SEA_ICE_DB_PASSWORD.
db_user = os.environ.get("SEA_ICE_DB_USER", "pythonuser")
db_password = os.environ.get("SEA_ICE_DB_PASSWORD", "password")
db_host = os.environ.get("SEA_ICE_DB_HOST", "127.0.0.1")
db_name = os.environ.get("SEA_ICE_DB_NAME", "global_temp_sea_ice_db")

# quote_plus escapes characters such as "@", ":" or "/" that would otherwise
# corrupt the URL when they appear in a user name or password.
rds_connection_string = f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}/{db_name}"
engine = create_engine(f'mysql://{rds_connection_string}')

### Check for tables

In [13]:
# List the tables in the connected database. Engine.table_names() was
# deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector API is
# available in both the 1.x and 2.x series.
inspect(engine).get_table_names()

['global_temp', 'sea_ice']

### Use pandas to load csv converted DataFrame into database

In [14]:
# Write the yearly maxima into the existing sea_ice table. if_exists='append'
# inserts rows without clearing the table first, so running this cell again
# inserts the same rows a second time. index=False leaves the DataFrame's
# integer index out of the insert.
sorted_df.to_sql(
    name='sea_ice',
    con=engine,
    index=False,
    if_exists='append',
)

### Confirm data has been added by querying the sea_ice table

In [15]:
# Read the table back through the same engine and show the first rows as a
# quick check that the load above reached the database.
sea_ice_query = 'select * from sea_ice'
loaded_rows = pd.read_sql_query(sea_ice_query, con=engine)
loaded_rows.head()

Unnamed: 0,date_time,north_icearea_sq_km,south_icearea_sq_km
0,1978-12-31,14.585,17.803
1,1979-12-31,16.635,18.361
2,1980-12-31,16.302,19.092
3,1981-12-31,15.801,18.856
4,1982-12-31,16.325,18.55
