In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

## Get Datasets

Use urls for each and then read the csv

In [6]:
interval = "2011-2020"
temps_url = f"https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/noaa-ghcn/decades/{interval}.csv"
temperatures = pd.read_csv(temps_url)
temperatures.head(3)

Unnamed: 0,ID,Year,VALUE1,VALUE2,VALUE3,VALUE4,VALUE5,VALUE6,VALUE7,VALUE8,VALUE9,VALUE10,VALUE11,VALUE12
0,ACW00011604,2011,-83.0,-132.0,278.0,1040.0,1213.0,1663.0,1875.0,1723.0,1466.0,987.0,721.0,428.0
1,ACW00011604,2012,121.0,-98.0,592.0,646.0,1365.0,1426.0,1771.0,1748.0,1362.0,826.0,620.0,-234.0
2,ACW00011604,2013,-104.0,-93.0,-48.0,595.0,,1612.0,1855.0,1802.0,1359.0,1042.0,601.0,


In [95]:
stations_url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/noaa-ghcn/station-metadata.csv"
stations = pd.read_csv(stations_url)
stations["FIPS"] = stations["ID"].str[0:2]
stations

Unnamed: 0,ID,LATITUDE,LONGITUDE,STNELEV,NAME,FIPS
0,ACW00011604,57.7667,11.8667,18.0,SAVE,AC
1,AE000041196,25.3330,55.5170,34.0,SHARJAH_INTER_AIRP,AE
2,AEM00041184,25.6170,55.9330,31.0,RAS_AL_KHAIMAH_INTE,AE
3,AEM00041194,25.2550,55.3640,10.4,DUBAI_INTL,AE
4,AEM00041216,24.4300,54.4700,3.0,ABU_DHABI_BATEEN_AIR,AE
...,...,...,...,...,...,...
27580,ZI000067983,-20.2000,32.6160,1132.0,CHIPINGE,ZI
27581,ZI000067991,-22.2170,30.0000,457.0,BEITBRIDGE,ZI
27582,ZIXLT371333,-17.8300,31.0200,1471.0,HARARE_BELVEDERE,ZI
27583,ZIXLT443557,-18.9800,32.4500,1018.0,GRAND_REEF,ZI


In [44]:
countries_url = "https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv"
countries = pd.read_csv(countries_url)
countries["FIPS"] = countries["FIPS 10-4"]
countries["ISO"] = countries["ISO 3166"]
countries["Country"] = countries["Name"]
countries = countries.drop(["FIPS 10-4", "ISO 3166", "Name"], axis = 1)
countries.head(3)

Unnamed: 0,FIPS,ISO,Country
0,AF,AF,Afghanistan
1,AX,-,Akrotiri
2,AL,AL,Albania


###  Make them into a database

Open up a SQL connection

In [10]:
import sqlite3
conn = sqlite3.connect("bp1.db")

In [11]:
def prep_temp_df(df):
    df = df.set_index(keys=["ID", "Year"])
    df = df.stack()
    df = df.reset_index()
    df = df.rename(columns = {"level_2"  : "Month" , 0 : "Temp"})
    df["Month"] = df["Month"].str[5:].astype(int)
    df["Temp"]  = df["Temp"] / 100
    return(df)

In [13]:
df_iter = pd.read_csv(temps_url, chunksize = 100000)
for df in df_iter:
    cleaned = prep_temp_df(df)
    cleaned.to_sql("temperatures", conn, if_exists = "append", index = False)

In [45]:
stations.to_sql("stations", conn, if_exists = "replace", index = False)

In [46]:
countries.to_sql("countries", conn, if_exists = "replace", index = False)

In [48]:
conn.close()

## 2 Write a Query Function

In [56]:
def query_climate_database(country, year_begin, year_end, month):
    cmd = "SELECT S.name, S.latitude, S.longitude, C.country, T.year, T.month, T.temp "
    cmd += "FROM temperatures T "
    cmd += "LEFT JOIN stations S ON T.id = S.id "
    cmd += "LEFT JOIN countries C ON C.FIPS = S.FIPS "
    cmd += "WHERE C.country= \"" + str(country)
    cmd += "\" AND T.year >=" + str(year_begin)
    cmd += " AND T.year <=" + str(year_end)
    cmd += " AND T.month =" + str(month)
    
    conn = sqlite3.connect("bp1.db")
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return df

In [61]:
query_climate_database("Germany", 2017, 2019, 8)

Unnamed: 0,NAME,LATITUDE,LONGITUDE,Country,Year,Month,Temp
0,BREMEN,53.0464,8.7992,Germany,2017,8,16.90
1,BREMEN,53.0464,8.7992,Germany,2018,8,19.35
2,BREMEN,53.0464,8.7992,Germany,2019,8,19.50
3,KAISERSLAUTERN,49.4253,7.7367,Germany,2017,8,19.06
4,KAISERSLAUTERN,49.4253,7.7367,Germany,2018,8,20.52
...,...,...,...,...,...,...,...
493,COLEMAN,49.5667,8.4667,Germany,2018,8,22.28
494,COLEMAN,49.5667,8.4667,Germany,2019,8,21.22
495,RHEIN_MAIN,50.0333,8.5833,Germany,2017,8,19.65
496,RHEIN_MAIN,50.0333,8.5833,Germany,2018,8,22.00


In [100]:
conn = sqlite3.connect("bp1.db")
country = "United States"
year_begin = 2014
year_end = 2017
month = 4

cmd = "SELECT S.name, S.latitude, S.longitude, S.STNELEV, C.country, T.year, T.month, T.temp "
cmd += "FROM temperatures T "
cmd += "LEFT JOIN stations S ON T.id = S.id "
cmd += "LEFT JOIN countries C ON C.FIPS = S.FIPS "
cmd += "WHERE C.country=\"" + str(country)
cmd += "\" AND T.year >=" + str(year_begin)
cmd += " AND T.year <=" + str(year_end)
cmd += " AND T.month =" + str(month)

df = pd.read_sql_query(cmd, conn)
df


Unnamed: 0,NAME,LATITUDE,LONGITUDE,STNELEV,Country,Year,Month,Temp
0,ADDISON,34.2553,-87.1814,249.3,United States,2014,4,14.55
1,ADDISON,34.2553,-87.1814,249.3,United States,2015,4,16.77
2,ADDISON,34.2553,-87.1814,249.3,United States,2016,4,16.12
3,ADDISON,34.2553,-87.1814,249.3,United States,2017,4,18.15
4,ALEXANDER_CITY,32.9453,-85.9481,195.1,United States,2014,4,16.46
...,...,...,...,...,...,...,...,...
27006,LINCOLN_8_ENE,40.8483,-96.5650,362.4,United States,2017,4,11.64
27007,LINCOLN_11_SW,40.6953,-96.8542,418.2,United States,2014,4,10.94
27008,LINCOLN_11_SW,40.6953,-96.8542,418.2,United States,2015,4,12.03
27009,LINCOLN_11_SW,40.6953,-96.8542,418.2,United States,2016,4,12.42


In [101]:
obs = df.groupby(["NAME"])["Month"].transform(np.sum) / month
df = df[obs >=3]

## Write a Geographic Scatter Function for Yearly Temperature Increases

In [76]:
from plotly import express as px

In [102]:
df2 = df.groupby(["NAME", "LATITUDE", "LONGITUDE", "STNELEV"])["Temp"].aggregate(np.mean).reset_index()
df2

Unnamed: 0,NAME,LATITUDE,LONGITUDE,STNELEV,Temp
0,ABERDEEN,33.8300,-88.5214,60.4,17.565000
1,ABERDEEN,45.4558,-98.4131,396.8,8.136667
2,ABERDEEN,46.9658,-123.8292,3.0,9.995000
3,ABERDEEN_35_WNW,45.7114,-99.1297,596.5,6.272500
4,ABERDEEN_EXP_STN,42.9536,-112.8253,1342.6,7.575000
...,...,...,...,...,...
6694,ZIRKEL,40.7833,-106.5833,2846.8,1.792500
6695,ZORTMAN,47.9186,-108.5244,1229.9,5.412500
6696,ZORTMAN_MINE_MONTANA,47.9225,-108.5528,1420.4,5.826667
6697,ZUMBROTA,44.2992,-92.6661,300.2,7.577500


In [107]:
df2[df2["STNELEV"] > 4000]
df2 = df2[df2["STNELEV"] < 9999]


In [108]:
fig = px.scatter_mapbox(df2,
                        lat = "LATITUDE",
                        lon = "LONGITUDE",
                        hover_name = "NAME",
                        color = "STNELEV",
                        mapbox_style = "carto-positron")

fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [74]:
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
    df = query_climate_database(country, year_begin, year_end, month)
    obs = df.groupby(["NAME"])["Month"].transform(np.sum) / month
    df = df[obs >= min_obs]
    return df

In [78]:
from sklearn.linear_model import LinearRegression

def coef(df):
    x = df[["Year"]]
    y = df["Temp"]
    LR = LinearRegression()
    LR.fit(x,y)
    return LR.coef_[0]

ModuleNotFoundError: No module named 'sklearn'

In [75]:
temperature_coefficient_plot("France", 2011, 2019, 8, 5)

Unnamed: 0,NAME,LATITUDE,LONGITUDE,Country,Year,Month,Temp
3,RENNES,48.0667,-1.7331,France,2011,8,17.80
4,RENNES,48.0667,-1.7331,France,2012,8,19.34
5,RENNES,48.0667,-1.7331,France,2013,8,19.05
6,RENNES,48.0667,-1.7331,France,2014,8,17.30
7,RENNES,48.0667,-1.7331,France,2015,8,19.05
...,...,...,...,...,...,...,...
539,PORT_AUX_FRANCAIS_ILES_KERGU,-49.3500,70.2500,France,2015,8,3.50
540,PORT_AUX_FRANCAIS_ILES_KERGU,-49.3500,70.2500,France,2016,8,2.45
541,PORT_AUX_FRANCAIS_ILES_KERGU,-49.3500,70.2500,France,2017,8,4.55
542,PORT_AUX_FRANCAIS_ILES_KERGU,-49.3500,70.2500,France,2018,8,2.90


## 4 Seasonal Difference by Latitude

In [127]:
def query_climate_database2(year_begin, year_end, month1, month2):
    cmd = "SELECT S.name, S.latitude, S.longitude, S.stnelev, C.country, T.year, T.month, T.temp "
    cmd += "FROM temperatures T "
    cmd += "LEFT JOIN stations S ON T.id = S.id "
    cmd += "LEFT JOIN countries C ON C.FIPS = S.FIPS "
    cmd += "WHERE T.year >=" + str(year_begin)
    cmd += " AND T.year <=" + str(year_end)
    cmd += " AND T.month IN (" + str(month1) + ", " + str(month2)+")"
    print(cmd)
    conn = sqlite3.connect("bp1.db")
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return df
df4 = query_climate_database2(2011, 2019, 1, 7)
df4

SELECT S.name, S.latitude, S.longitude, S.stnelev, C.country, T.year, T.month, T.temp FROM temperatures T LEFT JOIN stations S ON T.id = S.id LEFT JOIN countries C ON C.FIPS = S.FIPS WHERE T.year >=2011 AND T.year <=2019 AND T.month IN (1, 7)


Unnamed: 0,NAME,LATITUDE,LONGITUDE,STNELEV,Country,Year,Month,Temp
0,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2011,1,-0.83
1,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2011,7,18.75
2,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2012,1,1.21
3,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2012,7,17.71
4,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2013,1,-1.04
...,...,...,...,...,...,...,...,...
229071,CHIPINGE,-20.2000,32.6160,1132.0,Zimbabwe,2012,1,21.86
229072,CHIPINGE,-20.2000,32.6160,1132.0,Zimbabwe,2012,7,15.63
229073,CHIPINGE,-20.2000,32.6160,1132.0,Zimbabwe,2013,7,15.28
229074,CHIPINGE,-20.2000,32.6160,1132.0,Zimbabwe,2015,1,21.75


In [125]:
df4.groupby(["NAME", "Month"])["Temp"].apply(np.mean).reset_index()
df4[df4["Month"]== 7] - df4[df4["Month" == 1]]

KeyError: False

In [166]:
def query_monthly_temps(year_begin, year_end, month1, month2):
    cmd = """
    DROP table month1;"""
    """
    
    CREATE TABLE month1 AS
    SELECT S.name, S.latitude, S.longitude, S.stnelev, C.country, 
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=""" + str(year_begin) + """
    AND T.year <=""" + str(year_end) + """
    AND T.month = """+ str(month1) + """
    GROUP BY S.name
    HAVING COUNT(T.temp) = ("""+str(year_end)+""" - """+str(year_begin)+""" + 1)
    """
    """
    CREATE TABLE month2 AS
    SELECT S.name, S.latitude, S.longitude, S.stnelev, C.country, 
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=""" + str(year_begin) + """
    AND T.year <=""" + str(year_end) + """
    AND T.month = """+ str(month2) + """
    GROUP BY S.name
    HAVING COUNT(T.temp) = ("""+str(year_end)+""" - """+str(year_begin)+""" + 1);
    
    """

    print(cmd)
    conn = sqlite3.connect("bp1.db")
    pd.read_sql_query(cmd, conn)
    conn.close()
    return
df2 = query_monthly_temps(2011, 2019, 1, 7)
df2


    DROP table month1;


DatabaseError: Execution failed on sql '
    DROP table month1;': no such table: month1

In [None]:
df.groupby(["NAME", "LATITUDE", "LONGITUDE", "STNELEV"])

In [154]:
df2["Month" == "7"] - df2["Month" == "1"]

KeyError: False

In [155]:
df2["Month"]

0         1
1         3
2         5
3         6
4         8
         ..
93904     8
93905     9
93906    10
93907    11
93908    12
Name: Month, Length: 93909, dtype: int64

In [None]:
fig = px.scatter(df2,
                 x = "Latitude")

In [None]:
fig = px.scatter(data_frame = penguins,
                 x = "Culmen Length (mm)",
                 y = "Culmen Depth (mm)",
                 color = "Species",
                 width = 500,
                 height = 300,
                 facet_col = "Sex"
                )

### Questions

Number of years of temperature data (decades)
Can't import sklearn