## Create small GHCNM dataframe

https://www.ncdc.noaa.gov/ghcnm/v3.php<br />
ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v3/README<br />
To look up a station ID, see the WMO station list: https://www.wmo.int/cpdb/volume_a_observing_stations/list_stations

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the temperature frame that was pickled by ghcnm.ipynb.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at a pickle you generated yourself.
with open('./ghcnm.tavg.v3.3.0.20170710.qca.dat.pkl', mode='rb') as pickled_temps:
    qca_temps = pickle.load(pickled_temps)

In [3]:
# Clean the raw frame without mutating it in place, so the cell can be re-run safely:
#   1. turn the -9999 sentinel into NaN (see the README regarding missing values),
#   2. drop every row that then contains a NaN, i.e. observations without all 12 months,
#   3. strip surrounding whitespace from the string (object-dtype) columns.
# np.nan is used instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
# NOTE(review): dropna() without `subset` drops a row when ANY column is NaN, not
# only the monthly value columns -- confirm that is intended.
qca_temps = (
    qca_temps
    .replace(-9999, np.nan)
    .dropna()
    .apply(lambda col: col.str.strip() if col.dtype == "object" else col)
)

# sort by Country
sorted_temps = qca_temps.sort_values('Country', ascending=True)

In [4]:
# Names of the twelve monthly value columns: VALUE1 ... VALUE12.
month_columns = ['VALUE{}'.format(month) for month in range(1, 13)]

# Mean of every monthly column, grouped by CountryCode and YEAR.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names was deprecated and raises an error in pandas >= 2.0.
mean_temps = sorted_temps.groupby(['CountryCode', 'YEAR'])[month_columns].mean()

In [5]:
# Show the first five (CountryCode, YEAR) rows of the monthly means as plain text.
print(mean_temps.head(n=5))

                       VALUE1       VALUE2       VALUE3       VALUE4  \
CountryCode YEAR                                                       
101         1853  1193.000000  1023.000000  1193.000000  1653.000000   
            1856  1264.333333  1124.333333  1327.666667  1444.333333   
            1857   823.500000  1083.500000  1208.500000  1438.500000   
            1858   864.500000  1124.500000  1254.500000  1624.500000   
            1859   874.666667   954.666667  1141.333333  1648.000000   

                       VALUE5       VALUE6       VALUE7       VALUE8  \
CountryCode YEAR                                                       
101         1853  1843.000000  2073.000000  2403.000000  2563.000000   
            1856  1831.000000  1981.000000  2264.333333  2301.000000   
            1857  1598.500000  1953.500000  2283.500000  2338.500000   
            1858  1734.500000  2184.500000  2294.500000  2464.500000   
            1859  1804.666667  2016.333333  2403.000000  2439.6

In [6]:
# Persist the grouped means next to the notebook.
# Pickle protocol 2 is requested explicitly -- presumably so the file stays
# readable from Python 2; confirm before raising it.
with open('./ghcnm_means.pkl', mode='wb') as means_file:
    pickle.dump(mean_temps, means_file, 2)