## The Global Historical Climatology Network-Monthly (GHCN-M) temperature dataset

https://www.ncdc.noaa.gov/ghcnm/v3.php<br />
For the ID format, see the README: ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v3/README<br />
https://www.wmo.int/cpdb/volume_a_observing_stations/list_stations

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the adjusted data (the "qca" file). The file name embeds the dataset
# version and date, so check the README above for the current version.
# header=None: no row of the file is used as column names.
df = pd.read_csv(
    'ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v3/csv/ghcnm.tavg.v3.3.0.20170710.qca.dat.csv',
    header=None,
)

In [3]:
# Column names (see README): three identifying fields, then for each month
# 1-12 a VALUE column followed by its DMFLAG, QCFLAG and DSFLAG columns.
df.columns = ['ID', 'YEAR', 'ELEMENT'] + [
    field + str(month)
    for month in range(1, 13)
    for field in ('VALUE', 'DMFLAG', 'QCFLAG', 'DSFLAG')
]

In [4]:
# Keep the identifying fields and the twelve monthly values; the flag columns
# are not needed.
temps = df[['ID', 'YEAR', 'ELEMENT'] + ['VALUE' + str(month) for month in range(1, 13)]]

In [5]:
# The first three digits of the ID are the country code; the list of codes is at
# ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v3/country-codes
# The code is stored as int64 and placed directly after the ID column.
temps.insert(
    loc=1,
    column='CountryCode',
    value=df['ID'].astype(str).str.slice(stop=3).astype(np.int64),
)

In [6]:
# Get country names from code
# Each line of the country-codes file is read into a single 'codes' column,
# which is then split into the numeric code and the country name.
countries = pd.read_csv('ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v3/country-codes', names=['codes'], header=None)
# Split on the first space only, so multi-word country names stay in one piece.
# `n` is passed by keyword: since pandas 2.0 every argument of Series.str.split
# except `pat` is keyword-only, so the positional form split(" ", 1, ...) raises
# TypeError there (and emits a FutureWarning on pandas 1.4/1.5).
countries[['CountryCode', 'Country']] = countries['codes'].str.split(' ', n=1, expand=True)
countries.drop('codes', axis='columns', inplace=True)
# Cast to int64 so the key has the same dtype as temps['CountryCode'] for the merge
countries['CountryCode'] = countries['CountryCode'].astype('int64')
# print(countries.shape)

# Save the country-code table to disk as a pickle.
# Protocol 2 is the newest pickle protocol that Python 2 can read
# (presumably chosen for that reason -- confirm before raising it).
with open('./countrycodes.pkl', 'wb') as out_file:
    pickle.dump(countries, out_file, 2)

In [7]:
# Optional: attach the country name to every temperature row.
# A left join keeps all rows of temps, including any whose code has no match
# in the country table.
temps_merged = temps.merge(countries, how='left', on='CountryCode')

In [8]:
# Save the merged temperature table to disk as a pickle.
# Protocol 2 is the newest pickle protocol that Python 2 can read
# (presumably chosen for that reason -- confirm before raising it).
with open('./ghcnm.tavg.v3.3.0.20170710.qca.dat.pkl', 'wb') as out_file:
    pickle.dump(temps_merged, out_file, 2)