In [1]:
import pandas as pd
import numpy as np

## Get Datasets

Use urls for each and then read the csv

In [6]:

interval = "2011-2020"
temps_url = f"https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/noaa-ghcn/decades/{interval}.csv"
temperatures = pd.read_csv(temps_url)
temperatures.head(3)

Unnamed: 0,ID,Year,VALUE1,VALUE2,VALUE3,VALUE4,VALUE5,VALUE6,VALUE7,VALUE8,VALUE9,VALUE10,VALUE11,VALUE12
0,ACW00011604,2011,-83.0,-132.0,278.0,1040.0,1213.0,1663.0,1875.0,1723.0,1466.0,987.0,721.0,428.0
1,ACW00011604,2012,121.0,-98.0,592.0,646.0,1365.0,1426.0,1771.0,1748.0,1362.0,826.0,620.0,-234.0
2,ACW00011604,2013,-104.0,-93.0,-48.0,595.0,,1612.0,1855.0,1802.0,1359.0,1042.0,601.0,


###  Make them into a database

Open up a SQL connection

In [4]:
def prep_temp_df(df):
    df = df.set_index(keys=["ID", "Year"])
    df = df.stack()
    df = df.reset_index()
    df = df.rename(columns = {"level_2"  : "Month" , 0 : "Temp"})
    df["Month"] = df["Month"].str[5:].astype(int)
    df["Temp"]  = df["Temp"] / 100
    return(df)

In [5]:
import sqlite3
conn = sqlite3.connect("bp1.db")

In [6]:
decades = np.arange(1901, 2011, 10)
for start in decades:
    interval = str(start) + "-" + str(start+9)
    temps_url = f"https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/noaa-ghcn/decades/{interval}.csv"
    df_iter = pd.read_csv(temps_url, chunksize = 100000)
    for df in df_iter:
        cleaned = prep_temp_df(df)
        cleaned.to_sql("temperatures", conn, if_exists = "append", index = False)

Next make for stations

In [7]:
stations_url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/noaa-ghcn/station-metadata.csv"
stations = pd.read_csv(stations_url)
stations["FIPS"] = stations["ID"].str[0:2]
stations.head(3)

Unnamed: 0,ID,LATITUDE,LONGITUDE,STNELEV,NAME,FIPS
0,ACW00011604,57.7667,11.8667,18.0,SAVE,AC
1,AE000041196,25.333,55.517,34.0,SHARJAH_INTER_AIRP,AE
2,AEM00041184,25.617,55.933,31.0,RAS_AL_KHAIMAH_INTE,AE


In [8]:
stations.to_sql("stations", conn, if_exists = "replace", index = False)

In [9]:
countries_url = "https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv"
countries = pd.read_csv(countries_url)
countries["FIPS"] = countries["FIPS 10-4"]
countries["ISO"] = countries["ISO 3166"]
countries["Country"] = countries["Name"]
countries = countries.drop(["FIPS 10-4", "ISO 3166", "Name"], axis = 1)
countries.head(3)

Unnamed: 0,FIPS,ISO,Country
0,AF,AF,Afghanistan
1,AX,-,Akrotiri
2,AL,AL,Albania


In [10]:
countries.to_sql("countries", conn, if_exists = "replace", index = False)

In [11]:
conn.close()

## 2 Write a Query Function

In [38]:
def query_climate_database(country, year_begin, year_end, month):
    cmd = """
    SELECT S.name, S.latitude, S.longitude, C.country, T.year, T.month, T.temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS
    WHERE C.country= \"""" + str(country) + """\"
    AND T.year >=""" + str(year_begin) + """
    AND T.year <=""" + str(year_end) + """
    AND T.month =""" + str(month)
    conn = sqlite3.connect("bp1.db")
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return df

In [39]:
query_climate_database("Germany", 2009, 2012, 8)

## Write a Geographic Scatter Function for Yearly Temperature Increases

In [108]:
from sklearn.linear_model import LinearRegression

def coef(df):
    x = df[["Year"]]
    y = df["Temp"]
    LR = LinearRegression()
    LR.fit(x,y)
    return round(LR.coef_[0],3)

#coefs = df.groupby(["NAME", "LATITUDE", "LONGITUDE"]).apply(coef).reset_index()

fig = px.scatter_mapbox(coefs,
                            lat = "LATITUDE",
                            lon = "LONGITUDE",
                            color = 0,
                            color_continuous_midpoint = 0,
                            hover_name = "NAME",
                            mapbox_style = "carto-positron")

fig.show()

In [110]:
fig.update_layout

<bound method BaseFigure.update_layout of Figure({
    'data': [{'hovertemplate': ('<b>%{hovertext}</b><br><br>LAT' ... '%{marker.color}<extra></extra>'),
              'hovertext': [AACHEN, AACHEN_ORSBACH, AHAUS, ...,
                            ZINNWALD_GEORGENFE, ZUGSPITZE, ZWIESEL_AUT],
              'lat': array([50.7839, 50.7992, 52.083 , ..., 50.733 , 47.4219, 49.033 ]),
              'legendgroup': '',
              'lon': array([ 6.095 ,  6.025 ,  6.95  , ..., 13.75  , 10.9867, 13.233 ]),
              'marker': {'color': array([-1.87 ,  1.75 ,  0.23 , ...,  0.172,  0.36 ,  1.   ]),
                         'coloraxis': 'coloraxis'},
              'mode': 'markers',
              'name': '',
              'showlegend': False,
              'subplot': 'mapbox',
              'type': 'scattermapbox'}],
    'layout': {'coloraxis': {'cmid': 0,
                             'colorbar': {'title': {'text': '0'}},
                             'colorscale': [[0.0, '#0d0887'], [0.1111111

In [29]:
from plotly import express as px

In [112]:
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
    df = query_climate_database(country, year_begin, year_end, month)
    obs = df.groupby(["NAME"])["Month"].transform(np.sum) / month
    df = df[obs >= min_obs]
    df = df.reset_index()
    coefs = df.groupby(["NAME", "LATITUDE", "LONGITUDE"]).apply(coef).reset_index()
    coefs["Yearly\nIncrease"] = coefs[0]
    title = "Yearly Temperature Increase in Month " + str(month)
    title += "for stations in " + country +" "+ str(year_begin)
    title += "-" + str(year_end)
    fig = px.scatter_mapbox(coefs,
                            lat = "LATITUDE",
                            lon = "LONGITUDE",
                            color = "Yearly\nIncrease",
                            color_continuous_midpoint = 0,
                            hover_name = "NAME",
                            title = title,
                            mapbox_style = "carto-positron",
                            **kwargs)
    return fig


In [114]:
color_map = px.colors.diverging.RdGy_r # choose a colormap

fig = temperature_coefficient_plot("India", 1980, 2020, 1, 
                                   min_obs = 10,
                                   zoom = 2,
                                   #mapbox_style="carto-positron",
                                   color_continuous_scale=color_map)

fig.show()

## 4 Seasonal Difference by Latitude

In [127]:
def query_climate_database2(year_begin, year_end, month1, month2):
    cmd = "SELECT S.name, S.latitude, S.longitude, S.stnelev, C.country, T.year, T.month, T.temp "
    cmd += "FROM temperatures T "
    cmd += "LEFT JOIN stations S ON T.id = S.id "
    cmd += "LEFT JOIN countries C ON C.FIPS = S.FIPS "
    cmd += "WHERE T.year >=" + str(year_begin)
    cmd += " AND T.year <=" + str(year_end)
    cmd += " AND T.month IN (" + str(month1) + ", " + str(month2)+")"
    print(cmd)
    conn = sqlite3.connect("bp1.db")
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return df
df4 = query_climate_database2(2011, 2019, 1, 7)
df4

SELECT S.name, S.latitude, S.longitude, S.stnelev, C.country, T.year, T.month, T.temp FROM temperatures T LEFT JOIN stations S ON T.id = S.id LEFT JOIN countries C ON C.FIPS = S.FIPS WHERE T.year >=2011 AND T.year <=2019 AND T.month IN (1, 7)


Unnamed: 0,NAME,LATITUDE,LONGITUDE,STNELEV,Country,Year,Month,Temp
0,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2011,1,-0.83
1,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2011,7,18.75
2,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2012,1,1.21
3,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2012,7,17.71
4,SAVE,57.7667,11.8667,18.0,Antigua and Barbuda,2013,1,-1.04
...,...,...,...,...,...,...,...,...
229071,CHIPINGE,-20.2000,32.6160,1132.0,Zimbabwe,2012,1,21.86
229072,CHIPINGE,-20.2000,32.6160,1132.0,Zimbabwe,2012,7,15.63
229073,CHIPINGE,-20.2000,32.6160,1132.0,Zimbabwe,2013,7,15.28
229074,CHIPINGE,-20.2000,32.6160,1132.0,Zimbabwe,2015,1,21.75


In [125]:
df4.groupby(["NAME", "Month"])["Temp"].apply(np.mean).reset_index()
df4[df4["Month"]== 7] - df4[df4["Month" == 1]]

KeyError: False

In [146]:
def query_monthly_temps(year_begin, year_end, min_obs):
    cmd = """
    SELECT S.name, S.latitude, S.longitude, S.stnelev, C.country, 
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=""" + str(year_begin) + """
    AND T.year <=""" + str(year_end) + """
    GROUP BY S.name, T.month
    HAVING COUNT(T.temp) >= """ + str(min_obs)

    print(cmd)
    conn = sqlite3.connect("bp1.db")
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return df
df4 = query_monthly_temps(2011, 2019, 5)
df4[0:24]


    SELECT S.name, S.latitude, S.longitude, S.stnelev, C.country, 
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=2011
    AND T.year <=2019
    GROUP BY S.name, T.month
    HAVING COUNT(T.temp) >= 5


Unnamed: 0,NAME,LATITUDE,LONGITUDE,STNELEV,Country,Month,Temp
0,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,1,-6.178889
1,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,2,-6.655
2,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,3,-0.22
3,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,4,4.4975
4,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,5,10.205556
5,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,6,12.953333
6,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,7,16.18
7,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,8,16.068889
8,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,9,10.693333
9,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,Canada,10,4.754444


In [140]:
def max_diff(data_group):
    mask = data_group["Temp"][data_group["Month"] == 1]
    mask2 = data_group["Temp"][data_group["Month"] == 7]
    print(mask[1:10])
    return mask2 - mask


In [161]:
df4_1 = df4.groupby(["NAME", "LATITUDE", "LONGITUDE", "STNELEV"])["Temp"].aggregate([np.min, np.max]).reset_index()
df4_1.head()

Unnamed: 0,NAME,LATITUDE,LONGITUDE,STNELEV,amin,amax
0,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,-6.655,16.18
1,108_MILE_HOUSE_ABEL_LAKE,51.7,-121.4,994.0,-5.837143,16.9675
2,3_MILE_IDAHO,44.3958,-112.1081,2019.3,-6.411667,18.98625
3,A12_CPP,55.3992,3.8103,48.0,5.0325,16.15
4,AACHEN_ORSBACH,50.7992,6.025,231.0,2.8675,18.75875


In [162]:
df4_1["Difference"] = df4_1["amax"] - df4_1["amin"]

In [None]:
df4.groupby(["NAME"])["Temp"].aggregate([np.mean])

In [138]:
df4_1.head(4)

Unnamed: 0,NAME,LATITUDE,LONGITUDE,STNELEV,Month,Temp
0,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,1,-6.178889
1,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,2,-6.655
2,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,3,-0.22
3,100_MILE_HOUSE_6NE,51.6833,-121.2167,928.0,4,4.4975


In [154]:
df2["Month" == "7"] - df2["Month" == "1"]

KeyError: False

In [166]:
df4_1 = df4_1[df4_1["STNELEV"] != 9999]

In [167]:
fig = px.scatter(df4_1,
                 x = "LATITUDE",
                 y = "Difference",
                 color = "STNELEV"
                )
fig.show()

In [None]:
fig = px.scatter(data_frame = penguins,
                 x = "Culmen Length (mm)",
                 y = "Culmen Depth (mm)",
                 color = "Species",
                 width = 500,
                 height = 300,
                 facet_col = "Sex"
                )

### Grouping lats and longs

Number of years of temperature data (decades)
Can't import sklearn

In [170]:
def query_monthly_temps(year_begin, year_end, min_obs):
    cmd = """
    SELECT C.country, AVG(S.latitude), AVG(S.longitude),
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=""" + str(year_begin) + """
    AND T.year <=""" + str(year_end) + """
    GROUP BY C.country, ROUND(latitude/5, 0), ROUND(longitude/5, 0), T.month
    HAVING COUNT(T.temp) >= """ + str(min_obs)

    print(cmd)
    conn = sqlite3.connect("bp1.db")
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return df
df_5 = query_monthly_temps(2011, 2019, 5)


    SELECT C.country, AVG(S.latitude), AVG(S.longitude),
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=2011
    AND T.year <=2019
    GROUP BY C.country, ROUND(latitude/5, 0), ROUND(longitude/5, 0), T.month
    HAVING COUNT(T.temp) >= 5


In [173]:
df_5[df_5["Country"] == "None"]

Unnamed: 0,Country,AVG(S.latitude),AVG(S.longitude),Month,Temp


In [177]:
df_5["Country"][0]

In [180]:
df4[df4["STNELEV"] > 1000]

Unnamed: 0,NAME,LATITUDE,LONGITUDE,STNELEV,Country,Month,Temp
24,3_MILE_IDAHO,44.3958,-112.1081,2019.3,United States,1,-5.718571
25,3_MILE_IDAHO,44.3958,-112.1081,2019.3,United States,2,-5.668571
26,3_MILE_IDAHO,44.3958,-112.1081,2019.3,United States,3,-0.278571
27,3_MILE_IDAHO,44.3958,-112.1081,2019.3,United States,4,3.605714
28,3_MILE_IDAHO,44.3958,-112.1081,2019.3,United States,5,8.848571
...,...,...,...,...,...,...,...
148045,ZUNI,35.0706,-108.8389,1923.6,United States,8,20.513333
148046,ZUNI,35.0706,-108.8389,1923.6,United States,9,17.762857
148047,ZUNI,35.0706,-108.8389,1923.6,United States,10,10.897500
148048,ZUNI,35.0706,-108.8389,1923.6,United States,11,4.471667


In [205]:
def query_monthly_temps(year_begin, year_end, min_obs):
    cmd = """
    SELECT C.country, FLOOR(S.stnelev/500)*500 as elevation, stnelev, 
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=""" + str(year_begin) + """
    AND T.year <=""" + str(year_end) + """
    AND S.stnelev NOT IN (9999)
    GROUP BY C.country, FLOOR(S.stnelev/500)*500, T.month
    HAVING COUNT(T.temp) >= """ + str(min_obs)

    print(cmd)
    conn = sqlite3.connect("bp1.db")
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return df
df6=query_monthly_temps(2011, 2019, 5)
df6


    SELECT C.country, FLOOR(S.stnelev/500)*500 as elevation, stnelev, 
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=2011
    AND T.year <=2019
    AND S.stnelev NOT IN (9999)
    GROUP BY C.country, FLOOR(S.stnelev/500)*500, T.month
    HAVING COUNT(T.temp) >= 5


Unnamed: 0,Country,elevation,STNELEV,Month,Temp
0,,0.0,4.3,1,26.882000
1,,0.0,4.3,2,26.903333
2,,0.0,4.3,3,27.119333
3,,0.0,4.3,4,27.980667
4,,0.0,4.3,5,28.361176
...,...,...,...,...,...
5061,Zimbabwe,1000.0,1480.0,8,17.241818
5062,Zimbabwe,1000.0,1480.0,9,20.633333
5063,Zimbabwe,1000.0,1480.0,10,22.512500
5064,Zimbabwe,1000.0,1480.0,11,22.996957


In [223]:
df61 = df6.groupby(["Country", "elevation", "STNELEV"])["Temp"].aggregate([np.min, np.max, np.mean]).reset_index()

In [224]:
df61.head()
df61["Difference"] = df61["amax"]-df61["amin"]
df61

Unnamed: 0,Country,elevation,STNELEV,amin,amax,mean,Difference
0,Afghanistan,500.0,977.2,6.108889,31.378889,18.178426,25.270000
1,Afghanistan,1000.0,1010.0,7.681429,33.794286,20.852044,26.112857
2,Afghanistan,1500.0,1791.3,1.583333,27.632222,14.996389,26.048889
3,Albania,0.0,89.0,6.440833,26.152857,16.086980,19.712024
4,Algeria,0.0,24.0,11.989241,30.484744,20.918371,18.495503
...,...,...,...,...,...,...,...
437,Wallis and Futuna,0.0,27.0,26.579412,27.915882,27.392217,1.336471
438,Western Sahara,0.0,64.0,18.084286,23.937778,20.902222,5.853492
439,Zambia,500.0,986.0,26.840000,26.840000,26.840000,0.000000
440,Zambia,1000.0,1384.0,21.810000,23.860000,23.006979,2.050000


In [227]:
fig = px.scatter(df61,
                 x = "STNELEV",
                 y = "mean",
                 color = "Difference",
                 hover_name = "Country"
                )
fig.show()

In [240]:
def query_monthly_temps(year_begin, year_end, min_obs):
    cmd = """
    SELECT C.country, ROUND(S.latitude/5,1)*5 as bracket, ROUND(AVG(S.latitude),1) as avg_lat,
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=""" + str(year_begin) + """
    AND T.year <=""" + str(year_end) + """
    AND S.stnelev NOT IN (9999)
    GROUP BY C.country, ROUND(S.latitude/5,1)*5, T.month
    HAVING COUNT(T.temp) >= """ + str(min_obs)

    print(cmd)
    conn = sqlite3.connect("bp1.db")
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return df
df7=query_monthly_temps(2011, 2019, 5)
df7


    SELECT C.country, ROUND(S.latitude/5,1)*5 as bracket, ROUND(AVG(S.latitude),1) as avg_lat,
        T.month, AVG(T.temp) as Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON C.FIPS = S.FIPS 
    WHERE T.year >=2011
    AND T.year <=2019
    AND S.stnelev NOT IN (9999)
    GROUP BY C.country, ROUND(S.latitude/5,1)*5, T.month
    HAVING COUNT(T.temp) >= 5


Unnamed: 0,Country,bracket,avg_lat,Month,Temp
0,,12.0,12.2,1,27.308750
1,,12.0,12.2,2,27.452500
2,,12.0,12.2,3,27.712500
3,,12.0,12.2,4,28.500000
4,,12.0,12.2,5,28.851111
...,...,...,...,...,...
20723,Zimbabwe,-18.0,-17.9,8,16.967500
20724,Zimbabwe,-18.0,-17.9,9,20.241250
20725,Zimbabwe,-18.0,-17.9,10,22.367500
20726,Zimbabwe,-18.0,-17.9,11,22.367778


In [244]:
avg = df7.groupby(["Country", "bracket", "avg_lat"])["Temp"].aggregate([np.mean]).reset_index()
month1 = df7[df7["Month"] == 1]
month2 = df7[df7["Month"] == 7]
df71 = pd.merge(month1, month2, on = ["Country", "bracket", "avg_lat"])
df71 = pd.merge(df71, avg, on = ["Country", "bracket", "avg_lat"])
df71["Difference"] = df71["Temp_y"] - df71["Temp_x"]

In [242]:
avg

Unnamed: 0,Country,bracket,avg_lat,mean
0,Afghanistan,31.5,31.5,20.852044
1,Afghanistan,34.0,34.2,18.178426
2,Afghanistan,34.5,34.6,14.996389
3,Albania,41.5,41.4,16.094621
4,Algeria,23.0,22.8,22.722766
...,...,...,...,...
1920,Wallis and Futuna,-13.0,-13.2,27.589167
1921,Western Sahara,23.5,23.7,20.863900
1922,Zambia,-18.0,-17.8,26.840000
1923,Zimbabwe,-20.0,-20.1,20.046824


In [250]:
fig = px.scatter(df71,
                 x = "avg_lat",
                 y = "mean",#"Difference",
                 color = "Difference",#"mean",
                 hover_name = "Country",
                 color_continuous_midpoint = 0,
                 color_continuous_scale = px.colors.diverging.RdGy_r
                )
fig.show()