In [2]:
# to do:
# add a repeatable block to get > 500 records if the initial pull did not return that many
# enhance exceptions handling
# export printing of results into written log txt
# migrate unfound cities and status codes into separate df
# ISO Code Lookup on countries
# How to integrate state/province where appropriate?
# Visible low humidity in deserts

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from citipy import citipy
import time
import requests
from datetime import datetime
from config import weather_api_key
import string
from scipy.stats import linregress

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Draw 1500 random coordinates covering the whole globe.
# Latitude spans [-90, 90) and longitude spans [-180, 180); the previous upper
# bound of 90 on longitude meant no point east of 90 degrees E was ever sampled.
lats = np.random.uniform(low=-90, high=90, size=1500)
longs = np.random.uniform(low=-180, high=180, size=1500)

# Materialize the pairs as a list: a bare zip object is exhausted after one pass,
# so re-running a later cell that iterates it would silently see no coordinates.
lat_longs = list(zip(lats, longs))

# Preview only the first few pairs instead of dumping all 1500.
lat_longs[:5]

In [None]:
# OpenWeatherMap current-weather endpoint, requesting Imperial units.
# HTTPS is used so the API key embedded in the query string is not sent in clear text.
base_url = "https://api.openweathermap.org/data/2.5/weather?units=Imperial&appid=" + weather_api_key

In [None]:
# Snapshot the coordinate pairs so they can be walked through below.
coordinates = list(lat_longs)

# Look up the nearest city for every coordinate pair and title-case its name
# (citipy returns names in lowercase; string.capwords idea from
# https://favtutor.com/blogs/capitalize-first-letter-python).
# dict.fromkeys keeps the first occurrence of each name in encounter order,
# which drops repeated cities without changing the ordering.
cities = list(
    dict.fromkeys(
        string.capwords(
            citipy.nearest_city(latitude=pair[0], longitude=pair[1]).city_name
        )
        for pair in coordinates
    )
)

len(cities)

In [None]:
# Collects one dict per city whose weather record was retrieved and parsed.
city_data = list()


print("Beginning Data Retrieval")
print("-------------------------------------")

record_count = 1
set_count = 1

for i, city in enumerate(cities):
    # group cities in sets of 50 for logging purposes
    if (i % 50 == 0 and i >= 50):
        set_count+=1
        record_count = 1
        # NOTE(review): a 60-second pause per set was sketched here at one point,
        # presumably for API rate limiting -- confirm whether it is needed.

    print(f"Processing Record {record_count} of set {set_count} | {city}")

    record_count += 1

    # Issue exactly one request per city. Passing the name through `params`
    # lets requests URL-encode it (spaces, '&', '#', accents), and the timeout
    # keeps one stalled connection from hanging the whole run.
    try:
        response = requests.get(base_url, params={"q": city}, timeout=30)
    except requests.exceptions.RequestException as err:
        # Only the exception type is printed: its message can embed the
        # request URL, which contains the API key.
        print(f"ERROR: City {city} not processed. Request failed: {type(err).__name__}")
        continue

    status_code = response.status_code

    if status_code == 404:
        print(f"ERROR: City {city} not found.")
        continue

    if status_code != 200:
        print(f"ERROR: City {city} not processed. StatusCode: {status_code}")
        continue

    try:
        # extract the JSON data from the response
        city_weather = response.json()

        # parse out data points
        city_lat = city_weather["coord"]["lat"]
        city_long = city_weather["coord"]["lon"]
        city_max_temp = city_weather["main"]["temp_max"]
        city_humidity = city_weather["main"]["humidity"]
        city_clouds = city_weather["clouds"]["all"]
        city_wind = city_weather["wind"]["speed"]
        city_country = city_weather["sys"]["country"]

        # convert the "dt" timestamp to a 'YYYY-MM-DD HH:MM:SS' string
        city_date = datetime.utcfromtimestamp(city_weather["dt"]).strftime('%Y-%m-%d %H:%M:%S')
    except (KeyError, TypeError, ValueError):
        # Body was not valid JSON or lacked an expected field.
        print(f"ERROR: City {city} not processed. StatusCode: {status_code}")
        continue

    # add to list
    city_data.append({"City": city,
                      "Lat": city_lat,
                      "Long": city_long,
                      "Max Temp": city_max_temp,
                      "Humidity": city_humidity,
                      "Cloudiness": city_clouds,
                      "Wind Speed": city_wind,
                      "Country": city_country,
                      "Date": city_date
                      })

    print(f"{city} processed successfully.")

print("-------------------------------------")
print("Data Retrieval Complete!")
print("-------------------------------------")

In [None]:
# Turn the list of per-city records into a table (one row per retrieved city).
city_data_df = pd.DataFrame(data=city_data)
city_data_df

In [None]:
# Desired left-to-right column layout for display and export.
column_order = ["City", "Country", "Date", "Lat", "Long", "Max Temp", "Humidity", "Cloudiness", "Wind Speed"]

# reindex yields the same frame as plain column selection when every column is
# present, but it also works when no city was retrieved (empty frame without
# columns), where selecting by label would raise a KeyError.
city_data_df = city_data_df.reindex(columns=column_order)
city_data_df

In [None]:
import os.path

# Destination of the exported city table.
output_data_file = os.path.join("weather_data", "cities.csv")

# Create the output folder first; to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_data_file), exist_ok=True)

# The frame's index is written as the first column under the header "City_ID".
city_data_df.to_csv(output_data_file, index_label="City_ID")

## Plotting climate charts

In [None]:
# Pull out the columns used by the scatter plots below, one Series per measure.
lats, max_temps, humidity, cloudiness, wind_speed = (
    city_data_df[column_name]
    for column_name in ("Lat", "Max Temp", "Humidity", "Cloudiness", "Wind Speed")
)

# Current UTC date in the locale's date format, used in the plot titles.
today = format(datetime.utcfromtimestamp(time.time()), "%x")

In [None]:
# Fig. 1: maximum temperature against latitude, one marker per city.
plt.scatter(
    lats,
    max_temps,
    marker="o",
    alpha=0.8,
    linewidths=1,
    edgecolor="black",
    label="Cities",
)

# Axis labels and title.
plt.xlabel("Latitude")
plt.ylabel("Maximum Temperature")
plt.title(f"Maximum Temperature by City Latitude on {today}")

# Save to disk before showing the figure.
plt.savefig("weather_data/Fig1.png")
plt.show()

In [None]:
# Fig. 2: humidity against latitude, one marker per city.
plt.scatter(
    lats,
    humidity,
    marker="o",
    alpha=0.8,
    linewidths=1,
    edgecolor="black",
    label="Cities",
)

# Axis labels and title.
plt.xlabel("Latitude")
plt.ylabel("% Humidity")
plt.title(f"Humidity by City Latitude on {today}")

# Save to disk before showing the figure.
plt.savefig("weather_data/Fig2.png")
plt.show()

In [None]:
# Fig. 3: cloudiness against latitude, one marker per city.
plt.scatter(
    lats,
    cloudiness,
    marker="o",
    alpha=0.8,
    linewidths=1,
    edgecolor="black",
    label="Cities",
)

# Axis labels and title.
plt.xlabel("Latitude")
plt.ylabel("% Cloudiness")
plt.title(f"Cloudiness by City Latitude on {today}")

# Save to disk before showing the figure.
plt.savefig("weather_data/Fig3.png")
plt.show()

In [None]:
# Fig. 4: wind speed against latitude, one marker per city.
plt.scatter(
    lats,
    wind_speed,
    marker="o",
    alpha=0.8,
    linewidths=1,
    edgecolor="black",
    label="Cities",
)

# Axis labels and title.
plt.xlabel("Latitude")
plt.ylabel("Wind Speed (mph)")
plt.title(f"Wind Speed by City Latitude on {today}")

# Save to disk before showing the figure.
plt.savefig("weather_data/Fig4.png")
plt.show()

## Linear regression of collected data

Start by creating a function that runs the linear regression and plots the result.

In [None]:
def plot_linear_regression(x_values, y_values, title, y_label, text_coordinates):
    """Scatter-plot y against x, overlay the fitted regression line, and print fit statistics.

    Parameters
    ----------
    x_values : array-like of numbers
        Values for the x-axis (the axis is always labelled "Latitude").
    y_values : array-like of numbers
        Values for the y-axis, same length as ``x_values``.
    title : str
        Plot title.
    y_label : str
        Label for the y-axis.
    text_coordinates : tuple
        (x, y) position passed to ``plt.annotate`` for the line-equation text.

    Returns
    -------
    None
        The figure is shown and r, p and the standard error are printed.
    """
    # Coerce to float arrays so plain lists/tuples work as well as pandas
    # Series; a list multiplied by a float slope would raise a TypeError.
    x_values = np.asarray(x_values, dtype=float)
    y_values = np.asarray(y_values, dtype=float)

    # Run linear regression against value inputs
    (slope, intercept, r_value, p_value, std_err) = linregress(x_values, y_values)

    # Calculate regression line
    regress_values = x_values * slope + intercept

    # Create equation text shown on the plot
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

    # create scatter plot and reg line
    plt.scatter(x=x_values,y=y_values)
    plt.plot(x_values,regress_values,"r")
    plt.ylabel(y_label)
    plt.annotate(line_eq, text_coordinates, fontsize=15, color="red")
    plt.title(title)
    plt.xlabel("Latitude")

    plt.show()

    print(f"r = {r_value}\np = {p_value}\nstd error = {std_err}")

### Separate latitudes by hemisphere

In [None]:
# Look at a single record: the row whose index label is 13, returned as a Series.
index13 = city_data_df.loc[13, :]
index13

In [None]:
# Boolean mask (True where latitude is zero or above); used inside .loc it
# filters rows much like a WHERE clause.
city_data_df["Lat"].ge(0)

In [None]:
# Split the cities by hemisphere: latitude >= 0 is northern, latitude < 0 is southern.
northern_hemi_df = city_data_df[city_data_df["Lat"].ge(0)]
southern_hemi_df = city_data_df[city_data_df["Lat"].lt(0)]
southern_hemi_df.head()

In [None]:
# Northern Hemisphere: maximum temperature against latitude.
x_values = northern_hemi_df["Lat"]
y_values = northern_hemi_df["Max Temp"]

# Title spelling corrected ("regression").
plot_linear_regression(x_values,
                       y_values,
                       'Linear regression on the Northern Hemisphere \n for Maximum Temperature',
                       'Max Temp',
                       (0,0))

In [None]:
# Southern Hemisphere: maximum temperature against latitude.
x_values = southern_hemi_df["Lat"]
y_values = southern_hemi_df["Max Temp"]

# Title spelling corrected ("regression").
plot_linear_regression(x_values,
                       y_values,
                       'Linear regression on the Southern Hemisphere \n for Maximum Temperature',
                       'Max Temp',
                       (-50,80))

In [None]:
# Northern Hemisphere: humidity against latitude.
x_values = northern_hemi_df["Lat"]
y_values = northern_hemi_df["Humidity"]

# Title spelling corrected ("regression").
plot_linear_regression(x_values,
                       y_values,
                       'Linear regression on the Northern Hemisphere \n for % Humidity',
                       '% Humidity',
                       (40,20))

In [None]:
# Southern Hemisphere: humidity against latitude.
x_values = southern_hemi_df["Lat"]
y_values = southern_hemi_df["Humidity"]

# Title spelling corrected ("regression").
plot_linear_regression(x_values,
                       y_values,
                       'Linear regression on the Southern Hemisphere \n for % Humidity',
                       '% Humidity',
                       (-30,50))

In [None]:
# Northern Hemisphere: cloudiness against latitude.
x_values = northern_hemi_df["Lat"]
y_values = northern_hemi_df["Cloudiness"]

# The y-axis shows cloudiness, so it is labelled as such (it previously read '% Humidity').
plot_linear_regression(x_values,
                       y_values,
                       'Linear regression on the Northern Hemisphere \n for % Cloudiness',
                       '% Cloudiness',
                       (40,50))

In [None]:
# Southern Hemisphere: cloudiness against latitude.
x_values = southern_hemi_df["Lat"]
y_values = southern_hemi_df["Cloudiness"]

# The y-axis shows cloudiness, so it is labelled as such (it previously read '% Humidity').
plot_linear_regression(x_values,
                       y_values,
                       'Linear regression on the Southern Hemisphere \n for % Cloudiness',
                       '% Cloudiness',
                       (-55,50))

In [None]:
# Northern Hemisphere: wind speed against latitude.
x_values = northern_hemi_df["Lat"]
y_values = northern_hemi_df["Wind Speed"]

plot_linear_regression(
    x_values=x_values,
    y_values=y_values,
    title='Linear regression on the Northern Hemisphere \n for Wind Speed',
    y_label='Wind Speed',
    text_coordinates=(0,35),
)

In [None]:
# Southern Hemisphere: wind speed against latitude.
x_values = southern_hemi_df["Lat"]
y_values = southern_hemi_df["Wind Speed"]

plot_linear_regression(
    x_values=x_values,
    y_values=y_values,
    title='Linear regression on the Southern Hemisphere \n for Wind Speed',
    y_label='Wind Speed',
    text_coordinates=(-40,20),
)