In [1]:
# Dependencies and Setup
import datetime
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from scipy.stats import linregress

# Import API key
from config import weather_api_key

# Incorporated citipy to determine city based on latitude and longitude
from citipy import citipy

# Output File (CSV): path the retrieved weather table is exported to
output_data_file = "output_data/cities.csv"

# Range of latitudes and longitudes, as (low, high) bounds;
# city_set() draws its random coordinates uniformly between these bounds
lat_range = (-90, 90)
lng_range = (-180, 180)

## Generate Cities List

In [2]:
# Define a function that creates a set of "n" random lat and lng combinations
def city_set(n):
    """Return an iterator of ``n`` random (latitude, longitude) pairs.

    Latitudes are drawn uniformly between the bounds in the module-level
    ``lat_range`` and longitudes between the bounds in ``lng_range``.
    The result is a ``zip`` object, so it can only be iterated once.
    """
    random_lats = np.random.uniform(low=lat_range[0], high=lat_range[1], size=n)
    random_lngs = np.random.uniform(low=lng_range[0], high=lng_range[1], size=n)
    return zip(random_lats, random_lngs)

In [7]:
# Request weather data (temperature, humidity, cloudiness, windspeed) from OpenWeather
#
# Random coordinates are drawn in batches, mapped to their nearest city with citipy,
# and each city not seen before is looked up once. The results are collected in
# parallel lists (one entry per successfully retrieved city) that feed the DataFrame.

TARGET_CITY_COUNT = 50   # stop once this many cities have complete data
SAMPLE_SIZE = 10         # random coordinates drawn per batch
REQUEST_TIMEOUT = 10     # seconds before a stalled request is abandoned
# https so the API key in the query string is not sent in clear text
WEATHER_URL = "https://api.openweathermap.org/data/2.5/weather"

attempted = []
cities = []
lat = []
lng = []
temp = []
humidity = []
cloudiness = []
windspeed = []
country = []
date = []

print("-----------------------------------")
print("Retrieving Data – Please be patient")
print("-----------------------------------")


while len(cities) < TARGET_CITY_COUNT:

    for coord in city_set(SAMPLE_SIZE):

        # The while-condition is only re-checked between batches, so stop here
        # to avoid collecting more cities than requested.
        if len(cities) >= TARGET_CITY_COUNT:
            break

        city = citipy.nearest_city(coord[0], coord[1]).city_name

        # Skip (rather than abandon the whole batch) when the city was already
        # queried; `attempted` holds every queried city, found or not.
        if city in attempted:
            continue
        attempted.append(city)

        # Avoid exceeding 60 calls per minute from API request
        time.sleep(1)

        # Request data from API; `params` lets requests URL-encode the city name
        try:
            response = requests.get(
                WEATHER_URL,
                params={"q": city, "appid": weather_api_key, "units": "imperial"},
                timeout=REQUEST_TIMEOUT,
            ).json()
        except (requests.RequestException, ValueError) as err:
            # Only the exception type is printed: the full message can contain
            # the request URL, which includes the API key.
            print(f"Request for city '{city}' failed ({type(err).__name__}). Proceeding to next city.")
            continue

        # Read every field BEFORE appending anything, so a response that is
        # missing a key cannot leave the parallel lists with unequal lengths.
        try:
            record = (
                response["coord"]["lat"],
                response["coord"]["lon"],
                response["main"]["temp_max"],
                response["main"]["humidity"],
                response["clouds"]["all"],
                response["wind"]["speed"],
                response["sys"]["country"],
                response["dt"],
            )
        except (KeyError, TypeError):
            print(f"City '{city}' not found. Proceeding to next city.")
            continue

        columns = (lat, lng, temp, humidity, cloudiness, windspeed, country, date)
        for column, value in zip(columns, record):
            column.append(value)
        cities.append(city)
        print(f"Found data for city #{len(cities)}: {city}")

print("-------------------------------------------")
print(f"Successfully retrieved data for {len(cities)} cities.")
print("-------------------------------------------")

-----------------------------------
Retrieving Data – Please be patient
-----------------------------------
Found data for city #1: uyuni
Found data for city #2: ushuaia
Found data for city #3: batesville
Found data for city #4: arkansas city
City 'taolanaro' not found. Proceeding to next city.
Found data for city #5: san quintin
Found data for city #6: bluff
Found data for city #7: vila do maio
Found data for city #8: road town
Found data for city #9: vaini
Found data for city #10: tuy hoa
Found data for city #11: marienburg
Found data for city #12: jamestown
Found data for city #13: port elizabeth
Found data for city #14: vanderhoof
Found data for city #15: severo-kurilsk
Found data for city #16: ilulissat
Found data for city #17: hobart
Found data for city #18: shelburne
Found data for city #19: bendigo
Found data for city #20: albany
Found data for city #21: birganj
Found data for city #22: punta arenas
Found data for city #23: ixtapa
Found data for city #24: tessalit
City 'barents

In [None]:
# Create a dataframe: one row per retrieved city, built from the parallel lists
weather_df = pd.DataFrame({
    "City":cities,
    "Latitude":lat,
    "Longitude":lng,
    "Max Temperature (°F)":temp,
    "Humidity (%)":humidity,
    "Cloudiness (%)":cloudiness,
    "Wind Speed (mph)":windspeed,
    "Country Code":country,
    "Date & Time":date
})

# Extract average timestamp & store for later (used in the plot titles).
# fromtimestamp() interprets the value as a POSIX timestamp and converts it
# to local time, so the date is in this machine's time zone.
mean_date = weather_df["Date & Time"].mean()
retrieval_date = datetime.datetime.fromtimestamp(mean_date).date()

# Export data into a csv, creating the output folder first so the export
# does not fail on a fresh checkout where the folder does not exist yet
output_dir = os.path.dirname(output_data_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
weather_df.to_csv(output_data_file,index = False)


weather_df.head()

## Scatterplots

This scatterplot shows the relationship between Temperature and Latitude of each city.

In [None]:
# Max temperature against latitude for every retrieved city
y_str = "Max Temperature (°F)"
x_values = weather_df["Latitude"]
y_values = weather_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date})',
    xlabel="Latitude",
    ylabel=y_str,
)

# Write the figure to disk, then display it
fig.savefig("output_data/Temperature_vs_Latitude.png")
plt.show()

This scatterplot shows the relationship between Humidity and Latitude of each city.

In [None]:
# Humidity against latitude for every retrieved city
y_str = "Humidity (%)"
x_values = weather_df["Latitude"]
y_values = weather_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date})',
    xlabel="Latitude",
    ylabel=y_str,
)

# Write the figure to disk, then display it
fig.savefig("output_data/Humidity_vs_Latitude.png")
plt.show()

This scatterplot shows the relationship between Cloudiness and Latitude of each city.

In [None]:
# Cloudiness against latitude for every retrieved city
y_str = "Cloudiness (%)"
x_values = weather_df["Latitude"]
y_values = weather_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date})',
    xlabel="Latitude",
    ylabel=y_str,
)

# Write the figure to disk, then display it
fig.savefig("output_data/Cloudiness_vs_Latitude.png")
plt.show()

This scatterplot shows the relationship between Wind Speed and Latitude of each city.

In [None]:
# Wind speed against latitude for every retrieved city
y_str = "Wind Speed (mph)"
x_values = weather_df["Latitude"]
y_values = weather_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date})',
    xlabel="Latitude",
    ylabel=y_str,
)

# Write the figure to disk, then display it
fig.savefig("output_data/Wind_Speed_vs_Latitude.png")
plt.show()

## Linear Regression

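To analyze each relationship with linear regression, we split the data by hemisphere. The question being explored is whether distance from the equator (where latitude is zero) is correlated with each weather variable, and latitude moves away from zero in opposite directions in the two hemispheres, so each hemisphere is fitted separately.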

In [None]:
# Create a DataFrame for each hemisphere.
# Both comparisons are strict, so a row with latitude exactly 0 lands in neither frame.
latitude = weather_df["Latitude"]

north_df = weather_df[latitude.gt(0)]

south_df = weather_df[latitude.lt(0)]

#### Northern Hemisphere: Temperature vs. Latitude

In [None]:
# Northern hemisphere: max temperature against latitude, with a fitted line
y_str = "Max Temperature (°F)"
x_values = north_df["Latitude"]
y_values = north_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date}, N. Hemi.)',
    xlabel="Latitude",
    ylabel=y_str,
)

# Linear regression of the weather variable on latitude, drawn over the scatter
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept
ax.plot(x_values, regress_values, color="brown")

# Annotation anchor: 40% of the latitude span below the largest latitude,
# with the three lines stacked downward from the largest y value
x_hi, x_lo = max(x_values), min(x_values)
x = x_hi - .4*(x_hi - x_lo)
y_range = (max(y_values) - min(y_values))
y_top = max(y_values)

for drop, label in (
    (.05, f"y = {round(slope,4)} * x + {round(intercept,4)}"),
    (.13, f"r-value: {round(rvalue,6)}"),
    (.21, f"r^2-value: {round(rvalue**2,6)}"),
):
    ax.text(x, y_top - y_range*drop, label, c="brown", size=12)

fig.savefig("output_data/Temperature_vs_Latitude_North.png")
plt.show()

#### Southern Hemisphere: Temperature vs. Latitude

In [None]:
# Southern hemisphere: max temperature against latitude, with a fitted line
y_str = "Max Temperature (°F)"
x_values = south_df["Latitude"]
y_values = south_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date}, S. Hemi.)',
    xlabel="Latitude",
    ylabel=y_str,
)

# Linear regression of the weather variable on latitude, drawn over the scatter
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept
ax.plot(x_values, regress_values, color="purple")

# Annotation anchor: the smallest latitude, with the three lines stacked
# downward from the largest y value
x = min(x_values)
y_range = (max(y_values) - min(y_values))
y_top = max(y_values)

for drop, label in (
    (.05, f"y = {round(slope,4)} * x + {round(intercept,4)}"),
    (.13, f"r-value: {round(rvalue,6)}"),
    (.21, f"r^2-value: {round(rvalue**2,6)}"),
):
    ax.text(x, y_top - y_range*drop, label, c="purple", size=12)

fig.savefig("output_data/Temperature_vs_Latitude_South.png")
plt.show()

In [None]:
# Analysis here after testing

#### Northern Hemisphere: Humidity vs. Latitude

In [None]:
# Northern hemisphere: humidity against latitude, with a fitted line
y_str = "Humidity (%)"
x_values = north_df["Latitude"]
y_values = north_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date}, N. Hemi.)',
    xlabel="Latitude",
    ylabel=y_str,
)

# Linear regression of the weather variable on latitude, drawn over the scatter
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept
ax.plot(x_values, regress_values, color="brown")

# Annotation anchor: 40% of the latitude span below the largest latitude,
# with the three lines stacked downward from the largest y value
x_hi, x_lo = max(x_values), min(x_values)
x = x_hi - .4*(x_hi - x_lo)
y_range = (max(y_values) - min(y_values))
y_top = max(y_values)

for drop, label in (
    (.05, f"y = {round(slope,4)} * x + {round(intercept,4)}"),
    (.13, f"r-value: {round(rvalue,6)}"),
    (.21, f"r^2-value: {round(rvalue**2,6)}"),
):
    ax.text(x, y_top - y_range*drop, label, c="brown", size=12)

fig.savefig("output_data/Humidity_vs_Latitude_North.png")
plt.show()

#### Southern Hemisphere: Humidity vs. Latitude

In [None]:
# Southern hemisphere: humidity against latitude, with a fitted line
y_str = "Humidity (%)"
x_values = south_df["Latitude"]
y_values = south_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date}, S. Hemi.)',
    xlabel="Latitude",
    ylabel=y_str,
)

# Linear regression of the weather variable on latitude, drawn over the scatter
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept
ax.plot(x_values, regress_values, color="purple")

# Annotation anchor: the smallest latitude, with the three lines stacked
# downward from the largest y value
x = min(x_values)
y_range = (max(y_values) - min(y_values))
y_top = max(y_values)

for drop, label in (
    (.05, f"y = {round(slope,4)} * x + {round(intercept,4)}"),
    (.13, f"r-value: {round(rvalue,6)}"),
    (.21, f"r^2-value: {round(rvalue**2,6)}"),
):
    ax.text(x, y_top - y_range*drop, label, c="purple", size=12)

fig.savefig("output_data/Humidity_vs_Latitude_South.png")
plt.show()

In [None]:
# Analysis Later

#### Northern Hemisphere: Cloudiness vs. Latitude

In [None]:
# Northern hemisphere: cloudiness against latitude, with a fitted line
y_str = "Cloudiness (%)"
x_values = north_df["Latitude"]
y_values = north_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date}, N. Hemi.)',
    xlabel="Latitude",
    ylabel=y_str,
)

# Linear regression of the weather variable on latitude, drawn over the scatter
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept
ax.plot(x_values, regress_values, color="brown")

# Annotation anchor: 40% of the latitude span below the largest latitude,
# with the three lines stacked downward from the largest y value
x_hi, x_lo = max(x_values), min(x_values)
x = x_hi - .4*(x_hi - x_lo)
y_range = (max(y_values) - min(y_values))
y_top = max(y_values)

for drop, label in (
    (.05, f"y = {round(slope,4)} * x + {round(intercept,4)}"),
    (.13, f"r-value: {round(rvalue,6)}"),
    (.21, f"r^2-value: {round(rvalue**2,6)}"),
):
    ax.text(x, y_top - y_range*drop, label, c="brown", size=12)

fig.savefig("output_data/Cloudiness_vs_Latitude_North.png")
plt.show()

#### Southern Hemisphere: Cloudiness vs. Latitude

In [None]:
# Southern hemisphere: cloudiness against latitude, with a fitted line
y_str = "Cloudiness (%)"
x_values = south_df["Latitude"]
y_values = south_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date}, S. Hemi.)',
    xlabel="Latitude",
    ylabel=y_str,
)

# Linear regression of the weather variable on latitude, drawn over the scatter
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept
ax.plot(x_values, regress_values, color="purple")

# Annotation anchor: the smallest latitude, with the three lines stacked
# downward from the largest y value
x = min(x_values)
y_range = (max(y_values) - min(y_values))
y_top = max(y_values)

for drop, label in (
    (.05, f"y = {round(slope,4)} * x + {round(intercept,4)}"),
    (.13, f"r-value: {round(rvalue,6)}"),
    (.21, f"r^2-value: {round(rvalue**2,6)}"),
):
    ax.text(x, y_top - y_range*drop, label, c="purple", size=12)

fig.savefig("output_data/Cloudiness_vs_Latitude_South.png")
plt.show()

In [None]:
# Analysis Later

#### Northern Hemisphere: Wind Speed vs. Latitude

In [None]:
# Northern hemisphere: wind speed against latitude, with a fitted line
y_str = "Wind Speed (mph)"
x_values = north_df["Latitude"]
y_values = north_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date}, N. Hemi.)',
    xlabel="Latitude",
    ylabel=y_str,
)

# Linear regression of the weather variable on latitude, drawn over the scatter
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept
ax.plot(x_values, regress_values, color="brown")

# Annotation anchor: 40% of the latitude span below the largest latitude,
# with the three lines stacked downward from the largest y value
x_hi, x_lo = max(x_values), min(x_values)
x = x_hi - .4*(x_hi - x_lo)
y_range = (max(y_values) - min(y_values))
y_top = max(y_values)

for drop, label in (
    (.05, f"y = {round(slope,4)} * x + {round(intercept,4)}"),
    (.13, f"r-value: {round(rvalue,6)}"),
    (.21, f"r^2-value: {round(rvalue**2,6)}"),
):
    ax.text(x, y_top - y_range*drop, label, c="brown", size=12)

fig.savefig("output_data/Wind_Speed_vs_Latitude_North.png")
plt.show()

#### Southern Hemisphere: Wind Speed vs. Latitude

In [None]:
# Southern hemisphere: wind speed against latitude, with a fitted line
y_str = "Wind Speed (mph)"
x_values = south_df["Latitude"]
y_values = south_df[y_str]

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_values, y_values)
ax.set(
    title=f'{y_str} vs. Latitude ({retrieval_date}, S. Hemi.)',
    xlabel="Latitude",
    ylabel=y_str,
)

# Linear regression of the weather variable on latitude, drawn over the scatter
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = slope * x_values + intercept
ax.plot(x_values, regress_values, color="purple")

# Annotation anchor: the smallest latitude, with the three lines stacked
# downward from the largest y value
x = min(x_values)
y_range = (max(y_values) - min(y_values))
y_top = max(y_values)

for drop, label in (
    (.05, f"y = {round(slope,4)} * x + {round(intercept,4)}"),
    (.13, f"r-value: {round(rvalue,6)}"),
    (.21, f"r^2-value: {round(rvalue**2,6)}"),
):
    ax.text(x, y_top - y_range*drop, label, c="purple", size=12)

fig.savefig("output_data/Wind_Speed_vs_Latitude_South.png")
plt.show()

In [None]:
# Analysis Later