# Clean up the data and save it to CSV files

Make a version with a better header and UTF-8 encoding.

In [1]:
from pathlib import Path
import shutil
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Raw Berkeley Earth downloads are read from data_dir; cleaned CSVs go to out_dir.
data_dir = Path("../data/raw/")
out_dir = Path("../data/processed")
# Create the output folder (and any missing parents); a no-op if it already exists.
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Collect every raw text file in a deterministic (sorted) order and report the count.
data_files = list(data_dir.glob("*.txt"))
data_files.sort()
print(len(data_files))

229


Load all the files with Pandas and calculate the decimal year and the absolute temperature.

In [4]:
# Parse every raw file into a DataFrame keyed by the file name without its extension.
data_raw = {}
for data_file in data_files:
    with open(data_file, encoding="latin1") as file_pointer:
        # Scan the header for the 1951-1980 baseline, which is needed to turn
        # the anomalies into absolute temperatures.
        for line in file_pointer:
            if "Estimated Jan 1951-Dec 1980 absolute temperature (C):" in line:
                # The text after the colon has the form "<value> +/- <uncertainty>".
                components = line.split(":")[1].split("+/-")
                if len(components) != 2:
                    raise ValueError(
                        f"Unexpected absolute temperature format in {data_file}: {line!r}"
                    )
                absolute_temperature = float(components[0])
                break
        else:
            # The loop ran to the end of the file without finding the baseline line.
            raise ValueError(f"Absolute temperature not found for {data_file}")
        # Hand the same open handle to pandas to parse the rest of the file.
        # Lines starting with "%" are comments and only the first four
        # whitespace-separated columns are kept.
        datum = pd.read_csv(
            file_pointer,
            sep=r"\s+",
            comment="%",
            usecols=[0, 1, 2, 3],
            names="year month anomaly uncertainty_C".split(),
        )
        # 0.5 so that the data falls on the middle of the month
        datum["year_decimal"] = datum.year + 1 / 12 * (datum.month - 0.5)
        datum["temperature_C"] = datum.anomaly + absolute_temperature
        data_raw[data_file.stem] = datum

Keep only the files that have no data gaps from 1940 onward. This will make it easier for students to calculate things like the "mean of the past 10 years" without having to handle missing values.

In [5]:
# Keep a country only if its temperature series has no NaN from start_year onward.
start_year = 1940
data_no_gaps = {}
for country, table in data_raw.items():
    since_start = table.year_decimal >= start_year
    has_gap = np.any(np.isnan(table.temperature_C[since_start]))
    if has_gap:
        continue
    data_no_gaps[country] = table
print(len(data_no_gaps))

225


Remove any NaNs from the data.

In [6]:
# Drop every row containing a NaN from each country's table.
data_no_nans = {
    country: table.dropna() for country, table in data_no_gaps.items()
}

Save the data to CSVs.

In [7]:
# Write one CSV per country: a "#"-prefixed comment header followed by the data table.
today = datetime.date.today().isoformat()
for country in data_no_nans:
    out_fname = out_dir / f"{country}.csv"
    # Turn the file-name key (e.g. "sri-lanka") into a display name ("Sri Lanka").
    country_name = country.replace("-", " ").title()
    # The header advertises UTF-8, so request it explicitly instead of relying
    # on the platform's default encoding.
    with open(out_fname, "w", encoding="utf-8") as out_file:
        out_file.writelines([
            f"# Monthly average temperature data for {country_name}.\n",
            "# Original data from Berkeley Earth (https://www.berkeleyearth.org) licensed CC-BY-NC.\n",
            "# Processed to remove NaNs, include decimal year, convert temperature anomaly to absolute temperature, CSV format with UTF-8 encoding.\n",
            "# The year_decimal assumes every month is 1/12 of a year and assigns the datum to the middle of the month.\n",
            f"# Downloaded and processed: {today}\n",
        ])
        # Append the table to the same handle, right below the comment header.
        data_no_nans[country].to_csv(
            out_file,
            float_format="%.3f",
            columns="year month year_decimal temperature_C uncertainty_C".split(),
            index=False,
        )

Check that the files look OK.

In [8]:
# Print the first 10 lines of one output file (comment header plus the first rows).
# The processed files are documented as UTF-8, so read them with that encoding.
with open(out_dir / "sri-lanka.csv", encoding="utf-8") as f:
    # zip with range(10) stops the loop after 10 lines.
    for _, line in zip(range(10), f):
        print(line, end="")

# Monthly average temperature data for Sri Lanka.
# Original data from Berkeley Earth (https://www.berkeleyearth.org) licensed CC-BY-NC.
# Processed to remove NaNs, include decimal year, convert temperature anomaly to absolute temperature, CVS format with UTF-8 encoding.
# The year_decimal assumes every month is 1/12 of a year and assigns the datum to the middle of the month.
# Downloaded and processed: 2025-02-11
year,month,year_decimal,temperature_C,uncertainty_C
1796,1,1796.042,27.581,3.411
1796,2,1796.125,27.389,2.813
1796,3,1796.208,25.979,1.908
1796,4,1796.292,26.581,1.574


Add a README to the processed data folder.

In [9]:
# Build a Markdown bullet list of display names from the file-name keys
# (e.g. "sri-lanka" -> "* Sri Lanka").
country_list = "\n".join([f"* {name.replace('-', ' ').title()}" for name in data_no_nans])
# README text; `today` is the processing date computed when the CSVs were saved.
readme = f"""
# Monthly average temperature data for the world

These data are a subset of the original data files from 
[Berkeley Earth](https://www.berkeleyearth.org) licensed CC-BY-NC.

This archive contains monthly average temperature records for several countries
(see list below). The data for each country is contained in a CSV file with the
year, month, decimal year, temperature (C), and uncertainty (C). The temperature 
is the country-wide average for each month of the year. The decimal year assumes 
every month is 1/12 of a year and assigns the datum to the middle of the month.

The original data were processed to remove NaNs, include the decimal year, 
convert temperature anomaly to absolute temperature, and format the data as CSV
with UTF-8 encoding.

Downloaded and processed: {today}

## License

The data distributed here are made available under the
[Creative Commons Attribution-NonCommercial 4.0 International license](https://creativecommons.org/licenses/by-nc/4.0/)
(CC-BY-NC).
Please credit the original authors of the data (Berkeley Earth) as well as 
Leonardo Uieda when using this work. 
Please include links to https://www.berkeleyearth.org and https://github.com/compgeolab/temperature-data.

## List of countries included

{country_list}
""".strip()
# Write with an explicit encoding so the README matches the advertised UTF-8
# on every platform.
(out_dir / "README.md").write_text(readme, encoding="utf-8")
print(readme)

# Monthly average temperature data for the world

These data are a subset of the original data files from 
[Berkeley Earth](https://www.berkeleyearth.org) licensed CC-BY-NC.

This archive contains monthly average temperature records for several countries
(see list below). The data for each country is contained in a CSV file with the
year, month, decimal year, temperature (C), and uncertainty (C). The temperature 
is the country-wide average for each month of the year. The decimal year assumes 
every month is 1/12 of a year and assigns the datum to the middle of the month.

The original data were processed to remove NaNs, include the decimal year, 
convert temperature anomaly to absolute temperature, and format the data as CVS
with UTF-8 encoding.

Downloaded and processed: 2025-02-11

## License

The data distributed here are made available under the
[Creative Commons Attribution-NonCommercial 4.0 International license](https://creativecommons.org/licenses/by-nc/4.0/)
(CC-BY-NC).
Pleas

Create a zip archive of the data.

In [10]:
# Zip the contents of the processed folder into "temperature-data.zip", placed
# next to that folder. make_archive appends the ".zip" extension itself.
archive_base = out_dir.parent / "temperature-data"
_ = shutil.make_archive(str(archive_base), "zip", root_dir=out_dir)