# Exploratory Data Analysis
## Load Dependencies

In [39]:
import csv
import pandas as pd
import numpy as np
from functools import reduce
import requests
import time
from datetime import datetime
import requests.exceptions
import os
import json
import random
import re
from collections import defaultdict

## Load Dataframes

In [6]:
# Forward slashes resolve on every OS. The original backslash paths only worked
# on Windows and relied on '\c' being an unrecognised escape sequence.
wine_scores = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')
weather_data = pd.read_csv('data/clean/combined_weather_data_clean.csv')

In [7]:
# Sanity check: preview the first rows of the wine data
wine_scores.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Calem 1961 Colheita Tawny Port (Port),,Port,,Portugal,95,320.0,Calem,Port Blend,1961
1,Calem 1961 Colheita Tawny (Port),,Port,,Portugal,95,320.0,Calem,Port,1961
2,Warre's 1961 Reserve Tawny Port (Port),,Port,,Portugal,89,111.0,Warre's,Port Blend,1961
3,Wiese & Krohn 1961 Colheita Port (Port),,Port,,Portugal,92,200.0,Wiese & Krohn,Port Blend,1961
4,Cossart Gordon 1962 Bual (Madeira),,Madeira,,Portugal,96,355.0,Cossart Gordon,Madeira,1962


In [5]:
# Sanity check: preview the first rows of the weather data
weather_data.head()

Unnamed: 0,Station ID,Country,City,Data Type,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,5046,INDONESIA,KIJANG TANJUNG PINANG,4,1993,25.6,25.6,25.7,26.0,25.9,26.8,25.8,26.3,25.8,25.4,25.4,25.9
1,5046,INDONESIA,KIJANG TANJUNG PINANG,4,1995,26.0,25.6,25.9,26.0,26.6,26.6,26.3,26.2,26.7,26.1,25.9,25.6
2,5046,INDONESIA,KIJANG TANJUNG PINANG,5,1993,246.6,61.9,285.8,324.2,424.1,169.6,261.7,157.9,240.3,398.5,473.6,683.9
3,5046,INDONESIA,KIJANG TANJUNG PINANG,5,1995,388.0,325.1,190.7,357.9,299.5,334.3,250.9,213.4,265.6,496.3,630.8,277.1
4,5046,INDONESIA,KIJANG TANJUNG PINANG,6,1993,29.6,30.7,30.6,31.3,29.8,31.2,30.7,31.3,30.7,30.3,29.9,29.9


## Working Dataframes
We need to create some temporary working dataframes for preliminary data exploration. First, let's reduce the amount of weather data we have by keeping only the data from countries and years that we have wines for.

In [10]:
# Reload the cleaned inputs (forward slashes keep the paths portable across OSes)
wine_data_df = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')
weather_data_df = pd.read_csv('data/clean/combined_weather_data_clean.csv')

# Rename the 'Vintage' column in wine_data_df to 'Year' for consistency
wine_data_df = wine_data_df.rename(columns={'Vintage': 'Year'})

# Standardize country names in wine data to uppercase for matching
wine_data_df['Country'] = wine_data_df['Country'].str.upper()

# Filter the weather data to only include rows that match a (Country, Year) pair
# in the wine data. An inner merge against the de-duplicated key pairs keeps the
# weather rows in their original order, cannot duplicate them (the right-hand
# keys are unique), and avoids building a Python tuple for every row.
wine_country_years = wine_data_df[['Country', 'Year']].drop_duplicates()
matched_weather_data = weather_data_df.merge(wine_country_years, on=['Country', 'Year'], how='inner')

# The merge already yields a fresh 0..n-1 index; reset defensively without inplace
matched_weather_data = matched_weather_data.reset_index(drop=True)

# Display the first few rows of the filtered weather data
matched_weather_data.head()

Unnamed: 0,Station ID,Country,City,Data Type,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,0,AUSTRALIA,SCONE SOIL CONS 0,4,1985,24.6,22.4,21.5,17.7,14.2,10.2,10.6,11.3,13.2,17.2,19.1,22.8
1,0,AUSTRALIA,SCONE SOIL CONS 0,4,1987,25.7,24.7,19.6,17.9,14.2,12.0,10.1,13.1,14.8,17.1,20.0,22.6
2,0,AUSTRALIA,SCONE SOIL CONS 0,5,1985,4.6,88.8,48.6,53.8,36.2,55.2,17.0,45.2,76.4,143.2,33.4,121.6
3,0,AUSTRALIA,SCONE SOIL CONS 0,5,1987,90.6,4.8,82.8,5.0,73.4,20.8,9.8,110.2,19.2,42.6,41.0,113.8
4,6600,SWITZERLAND,ST. CHRISCHONA,4,2004,0.9,2.3,4.8,9.5,11.7,15.7,17.2,17.7,14.4,10.7,4.0,0.2


Now we want to get rid of the Data Type column, so that it is easier to read all relevant weather data in one dataframe. Atmospheric pressure isn't relevant to what we are trying to calculate, and not all locations measure atmospheric pressure anyway, so we will be getting rid of data types 2 and 3 if they ever pop up.

In [16]:
# Function to rename columns based on the data type
def rename_columns(df, data_type):
    """Return a copy of ``df`` whose month columns carry a data-type prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain the month columns 'Jan' ... 'Dec'. Columns that
        are absent are simply not renamed.
    data_type : int
        Data-type code. Codes 4-8 map to the labels below; any other code uses
        the prefix 'unknown'.

    Returns
    -------
    pandas.DataFrame
        A new frame with e.g. 'Jan' renamed to '<prefix>_Jan'. The input frame
        is left untouched, so calling this twice or on a shared frame is safe.
    """
    prefix = {
        4: 'daily_temp',
        5: 'precipitation',
        6: 'daily_temp_MAX',
        7: 'daily_temp_MIN',
        8: 'humidity'
    }.get(data_type, 'unknown')

    month_cols = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    rename_dict = {month: f'{prefix}_{month}' for month in month_cols}
    # rename() without inplace=True returns a new frame instead of mutating the caller's
    return df.rename(columns=rename_dict)

# Split the weather data by Data Type. For each type, drop the now-redundant
# 'Data Type' column and prefix the month columns, in a single pass (the original
# ran the rename a second time, which was a no-op).
dfs = {}
for dtype in [4, 5, 6, 7, 8]:
    typed_rows = matched_weather_data[matched_weather_data['Data Type'] == dtype]
    # drop() returns a new frame, so the rename never touches matched_weather_data
    dfs[dtype] = rename_columns(typed_rows.drop(columns=['Data Type']), dtype)

# Merge the DataFrames back together with explicit suffixes for overlapping columns
merged_weather_df = reduce(
    lambda left, right: pd.merge(
        left,
        right,
        on=['Station ID', 'Country', 'City', 'Year'],
        how='outer',
        suffixes=('', '_duplicate'),
    ),
    dfs.values(),
)

# Remove any columns that were duplicated during the merge, then rebuild the index
kept_columns = [col for col in merged_weather_df.columns if not col.endswith('_duplicate')]
merged_weather_df = merged_weather_df[kept_columns].reset_index(drop=True)

# Check results
merged_weather_df.head()

Unnamed: 0,Station ID,Country,City,Year,daily_temp_Jan,daily_temp_Feb,daily_temp_Mar,daily_temp_Apr,daily_temp_May,daily_temp_Jun,...,humidity_Mar,humidity_Apr,humidity_May,humidity_Jun,humidity_Jul,humidity_Aug,humidity_Sep,humidity_Oct,humidity_Nov,humidity_Dec
0,0,AUSTRALIA,SCONE SOIL CONS 0,1985,24.6,22.4,21.5,17.7,14.2,10.2,...,,,,,,,,,,
1,0,AUSTRALIA,SCONE SOIL CONS 0,1987,25.7,24.7,19.6,17.9,14.2,12.0,...,,,,,,,,,,
2,6600,SWITZERLAND,ST. CHRISCHONA,2004,0.9,2.3,4.8,9.5,11.7,15.7,...,,,,,,,,,,
3,6601,SWITZERLAND,BASEL / BINNINGEN,2004,2.3,3.0,5.7,10.6,13.2,17.7,...,,,,,,,,,,
4,6601,SWITZERLAND,BASEL / BINNINGEN,2011,2.4,3.9,7.5,13.4,16.6,18.1,...,,,,,,,,,,


In [17]:
# Inspect column names, non-null counts and dtypes of the merged weather frame
merged_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40321 entries, 0 to 40320
Data columns (total 64 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Station ID          40321 non-null  int64  
 1   Country             40321 non-null  object 
 2   City                40321 non-null  object 
 3   Year                40321 non-null  int64  
 4   daily_temp_Jan      39238 non-null  float64
 5   daily_temp_Feb      39238 non-null  float64
 6   daily_temp_Mar      39238 non-null  float64
 7   daily_temp_Apr      39238 non-null  float64
 8   daily_temp_May      39238 non-null  float64
 9   daily_temp_Jun      39238 non-null  float64
 10  daily_temp_Jul      39238 non-null  float64
 11  daily_temp_Aug      39238 non-null  float64
 12  daily_temp_Sep      39238 non-null  float64
 13  daily_temp_Oct      39238 non-null  float64
 14  daily_temp_Nov      39238 non-null  float64
 15  daily_temp_Dec      39238 non-null  float64
 16  prec

In [18]:
# Save dataframe to CSV (forward slashes keep the path portable; the original
# backslash path depended on '\c' and '\m' being unrecognised escape sequences)
merged_weather_df.to_csv('data/clean/merged_weather.csv', index=False)

Next, we will filter the wine data down to the wines that have a matching location and vintage in the weather data.

In [21]:
# Load Dataframes (forward slashes so both paths resolve on any OS)
wine_data = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')
merged_weather_df = pd.read_csv('data/clean/merged_weather.csv')

# Create a temporary DataFrame with unique city-country pairs
unique_city_country = merged_weather_df[['City', 'Country']].drop_duplicates().copy()
unique_city_country['City'] = unique_city_country['City'].str.upper()
# Whitespace-split each city name into a list of words (missing values stay missing)
unique_city_country['City Words'] = unique_city_country['City'].str.split()

# Function to check for any matching word and country match
def any_matching_word_and_country(row, city_country_df):
    """Tell whether any region word of a wine row matches a city word in the same country.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'Region 1', 'Region 2', 'Region 3' and 'Country'.
    city_country_df : pandas.DataFrame
        Must provide a 'Country' column and a 'City Words' column holding a
        list of upper-case words per city.

    Returns
    -------
    bool
        True if at least one whitespace-separated word of any region (compared
        in upper case) equals a city word from a row whose country matches
        case-insensitively; otherwise False.
    """
    country = str(row['Country']).upper()

    # Narrow to the wine's country once instead of walking every city per region
    same_country = city_country_df[city_country_df['Country'].str.upper() == country]
    if same_country.empty:
        return False

    # Pool every city word for that country; one set intersection replaces the row loop
    city_words = set()
    for words in same_country['City Words']:
        if isinstance(words, (list, tuple, set)):
            city_words.update(words)

    for region_key in ['Region 1', 'Region 2', 'Region 3']:
        region = row[region_key]
        # Skip missing and non-text regions (NaN, None) rather than crash on .upper()
        if isinstance(region, str) and not city_words.isdisjoint(region.upper().split()):
            return True
    return False

# Normalise the text columns: missing values become '' and everything is a str
region_columns = ['Region 1', 'Region 2', 'Region 3']
wine_data[region_columns] = wine_data[region_columns].fillna('').astype(str)
wine_data['Country'] = wine_data['Country'].fillna('').astype(str)

# Build a boolean mask row by row, then keep only the matching wines
has_city_match = wine_data.apply(
    any_matching_word_and_country,
    axis=1,
    args=(unique_city_country,),
)
filtered_wine_data = wine_data[has_city_match]

# Check result
filtered_wine_data.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
43,Bodegas Dios Baco S.L. NV 1970 Oxford Pedro Xi...,Andalucia,,Jerez,Spain,85,40.0,Bodegas Dios Baco S.L.,Sherry,1970
75,Adega de Favaios 1980 Moscatel do Douro,,Moscatel do Douro,,Portugal,93,137.0,Adega de Favaios,Moscatel,1980
77,Adega de Favaios 1980 Moscatel (Moscatel do Do...,,Moscatel do Douro,,Portugal,93,137.0,Adega de Favaios,Muscat,1980
79,Cuva Vella 1980 Vintage Muscat (Valencia),Levante,,Valencia,Spain,90,65.0,Cuva Vella,Muscat,1980
87,Moulin Touchais 1982 Chenin Blanc (Coteaux du ...,Loire Valley,,Coteaux du Layon,France,95,64.0,Moulin Touchais,Chenin Blanc,1982


In [22]:
# Reset the index of the DataFrame. Rebinding the name (rather than
# inplace=True) avoids mutating a frame that came from a filtered selection and
# keeps the cell idempotent on re-run.
filtered_wine_data = filtered_wine_data.reset_index(drop=True)

# Check result
filtered_wine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16760 entries, 0 to 16759
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Wine Name  16760 non-null  object 
 1   Region 1   16760 non-null  object 
 2   Region 2   16760 non-null  object 
 3   Region 3   16760 non-null  object 
 4   Country    16760 non-null  object 
 5   Score      16760 non-null  int64  
 6   Price      15406 non-null  float64
 7   Winery     16760 non-null  object 
 8   Variety    16705 non-null  object 
 9   Vintage    16760 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 1.3+ MB


In [24]:
# Save dataframe to CSV (raw string keeps the backslashes literal; index=False omits the row index)
filtered_wine_data.to_csv(r'data\clean\filtered_wine_data.csv', index=False)

With such a low match rate, we need to approach this differently. We need to use a weather API to obtain the weather information for wines that are missing related weather data. Keep in mind that some wines don't have a specific city associated with them, and some only have a wine region. So it makes more sense to have an API look up the geocoordinates of the wine region and then plug them into a weather API to get the necessary information.

## Implementing an API Solution

The following script uses the Bing Maps API to grab the geocoordinates of regions mentioned in our list of wines, then uses those geocoordinates to grab climate data from the NOAA API. Since we are using the free service level of these APIs, we need the script to stop when we hit the daily request limit.

To start off, let's create a proof-of-concept test script that tests out all the logic we need for our full script. The following script samples a random row from our clean wine CSV file and tries to return weather data based on it.

In [145]:
# Constants for rate limiting
NOAA_RATE_LIMIT = 0.2  # seconds slept before each NOAA request
NOAA_DAILY_LIMIT = 10000  # the NOAA helpers refuse to send once the counter reaches this
noaa_request_count = 0  # module-level counter incremented by the NOAA helpers

# Function to clean and prepare keywords
def clean_keywords(text):
    """Strip parenthesised fragments and surrounding whitespace from ``text``.

    Missing values (anything ``pd.isna`` reports as missing) become "".
    """
    if pd.isna(text):
        return ""
    # Non-greedy match so each "( ... )" group is removed on its own
    without_parentheses = re.sub(r'\(.*?\)', '', text)
    return without_parentheses.strip()

# Function to create variations of the winery name
def create_winery_variations(winery):
    """Return the winery name with each common suffix appended, then the bare name last."""
    suffixes = ("Wines", "Wine", "Vineyard", "Estate", "Winery", "Vineyards")
    names = []
    for suffix in suffixes:
        names.append(f"{winery} {suffix}")
    names.append(winery)
    return names

# Function to construct search query
def construct_query(row, winery_variations):
    """Build the geocoding search strings for one wine row.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'Region 1', 'Region 2', 'Region 3' and 'Country'.
    winery_variations : list[str]
        Candidate winery names to prefix each query with.

    Returns
    -------
    tuple
        ``(queries, regions, country)``: the unique query strings in first-seen
        order, the three cleaned region strings, and the cleaned country.
    """
    regions = [clean_keywords(row['Region 1']), clean_keywords(row['Region 2']), clean_keywords(row['Region 3'])]
    country = clean_keywords(row['Country'])

    # Every contiguous run of regions (i..j) is combined with each winery variation
    combinations = []
    for winery_variation in winery_variations:
        for i in range(len(regions)):
            for j in range(i, len(regions)):
                query_parts = [winery_variation] + regions[i:j+1] + [country]
                # filter(None, ...) drops empty parts so blanks add no stray spaces
                combinations.append(' '.join(filter(None, query_parts)))

    # Empty regions make several runs collapse to the same text. Each duplicate
    # would cost another geocoding request, so keep only the first occurrence
    # (dict preserves insertion order).
    unique_combinations = list(dict.fromkeys(combinations))

    return unique_combinations, regions, country

# Function to calculate the bounding box
def get_gps_bounding_box(latitude, longitude, deg_lat=1.0, deg_lon=1.0):
    """Return ``(north, west, south, east)`` around a point, clamped to valid lat/lon limits.

    The box extends ``deg_lat`` degrees above and below ``latitude`` and
    ``deg_lon`` degrees either side of ``longitude``. Latitudes are clamped to
    [-90, 90] and longitudes to [-180, 180].
    """
    north = min(90, latitude + deg_lat)
    south = max(-90, latitude - deg_lat)
    east = min(180, longitude + deg_lon)
    west = max(-180, longitude - deg_lon)
    return north, west, south, east

# Function to find weather stations by bounding box with rate limiting
def get_stations_by_bounding_box(lat, lon, api_token):
    """Return station IDs found by the NCEI search service around a coordinate.

    Parameters
    ----------
    lat, lon : float
        Centre of the search; the box comes from ``get_gps_bounding_box``.
    api_token : str
        Sent in the 'token' request header.

    Returns
    -------
    list[str]
        The station part of each result's 'id' (text after the first ':' and
        before the first '.'). Empty when the daily limit has been reached, the
        response is not HTTP 200, the request times out, or any other error occurs.

    Side effects
    ------------
    Sleeps ``NOAA_RATE_LIMIT`` seconds and increments the global
    ``noaa_request_count`` for every request attempted.
    """
    global noaa_request_count
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached.")
        return []

    time.sleep(NOAA_RATE_LIMIT)
    n, w, s, e = get_gps_bounding_box(lat, lon)
    url = f"https://www.ncei.noaa.gov/access/services/search/v1/data?dataset=global-summary-of-the-month&boundingBox={n},{w},{s},{e}&dataTypes=TMIN,TMAX,PRCP,TAVG&limit=10&offset=0"
    headers = {'token': api_token}
    try:
        # Count the attempt up front so a timed-out request is never under-counted
        noaa_request_count += 1
        response = requests.get(url, headers=headers, timeout=10)  # 10 seconds timeout
        if response.status_code != 200:
            # Report the failure instead of returning an unexplained empty list
            print(f"NOAA station search returned HTTP {response.status_code}.")
            return []

        valid_stations = []
        for station in response.json().get('results', []):
            try:
                # Extract the valid part of the station ID
                valid_stations.append(station['id'].split(':')[1].split('.')[0])
            except (KeyError, IndexError, AttributeError, TypeError):
                # One malformed entry should not discard the remaining stations
                continue
        return valid_stations
    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
    except Exception as exc:
        print(f"Error in NOAA request: {exc}")
    return []

# Function to get monthly climate data for a station with rate limiting
def get_mly_climate_data_for_station(station_id, vintage_year, api_token):
    """Fetch one station's monthly summaries for a year and average them per month.

    Parameters
    ----------
    station_id : str
        Station identifier placed in the ``stations`` query parameter.
    vintage_year : int or str
        Year requested, from ``<year>-01-01`` to ``<year>-12-31``.
    api_token : str
        Sent in the 'token' request header.

    Returns
    -------
    dict
        Keys ``'<TYPE>_<month>'`` for TYPE in TMIN/TMAX/PRCP/TAVG and month
        1..12; values are the mean of the readings found, or None when there
        were none. An empty dict is returned when the daily limit has been
        reached, the response is not HTTP 200, or the request fails.

    Side effects
    ------------
    Sleeps ``NOAA_RATE_LIMIT`` seconds and increments the global
    ``noaa_request_count`` for every request attempted.
    """
    global noaa_request_count
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached.")
        return {}

    data_types = ['TMIN', 'TMAX', 'PRCP', 'TAVG']
    start_date = f"{vintage_year}-01-01"
    end_date = f"{vintage_year}-12-31"
    time.sleep(NOAA_RATE_LIMIT)
    # NOTE(review): units=standard presumably returns imperial units, while the
    # station CSV values used earlier in the notebook look metric -- confirm
    # before combining the two sources.
    url = f"https://www.ncei.noaa.gov/access/services/data/v1?dataset=global-summary-of-the-month&dataTypes=TMIN,TMAX,PRCP,TAVG&stations={station_id}&startDate={start_date}&endDate={end_date}&format=json&units=standard&includeAttributes=false"
    headers = {'token': api_token}

    try:
        # Count the attempt up front so a timed-out request is never under-counted
        noaa_request_count += 1
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            # Report the failure instead of returning an unexplained empty dict
            print(f"NOAA data request returned HTTP {response.status_code}.")
            return {}

        monthly_data_raw = response.json()
        monthly_data = {f"{data_type}_{month}": [] for data_type in data_types for month in range(1, 13)}

        for record in monthly_data_raw:
            try:
                month = int(record['DATE'].split('-')[1])
            except (KeyError, IndexError, ValueError, AttributeError, TypeError):
                # A record without a usable date cannot be assigned to a month
                continue
            for data_type in data_types:
                raw_value = record.get(data_type)
                bucket = monthly_data.get(f"{data_type}_{month}")
                if raw_value is None or bucket is None:
                    continue
                try:
                    bucket.append(float(raw_value))
                except (TypeError, ValueError):
                    # Skip one unparseable reading rather than lose the whole station
                    continue

        # Averaging the data
        averaged_data = {key: sum(values) / len(values) if values else None for key, values in monthly_data.items()}
        return averaged_data

    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
    except Exception as exc:
        print(f"Error in NOAA request: {exc}")
    return {}

# Function to check if address matches the country
def is_country_match(country, address):
    """Return True when ``country`` occurs in ``address`` as a case-insensitive substring."""
    needle = country.lower()
    haystack = address.lower()
    return needle in haystack

# Function to check if address matches any of the regions and country
def is_region_country_match(regions, country, address):
    """Return True when ``address`` contains the country and at least one region word.

    Parameters
    ----------
    regions : iterable[str]
        Region strings; empty entries are ignored.
    country : str
        Country name that must occur in ``address`` (case-insensitive substring).
    address : str
        Formatted address to test.

    Notes
    -----
    The original mixed the country's own words into the list of words to look
    for, so the region test passed whenever the country matched and the function
    behaved like a country-only check. Only region words are tested now. When no
    region is supplied at all, the country match alone decides, as before.
    """
    address_lc = address.lower()
    if country.lower() not in address_lc:
        return False

    region_words = " ".join(r for r in regions if r).lower().split()
    if not region_words:
        # Nothing more specific than the country is known for this wine
        return True
    return any(word in address_lc for word in region_words)

# Read CSV file
df = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')

# Select a random row
random_row = df.sample().iloc[0]

# Get winery name and create variations
winery = clean_keywords(random_row['Winery'])
winery_variations = create_winery_variations(winery)

# Construct queries
queries, regions, country = construct_query(random_row, winery_variations)

# Bing Maps API Key: read from the environment so a real key never has to be
# saved in the notebook; the placeholder remains the fallback value.
bing_maps_key = os.environ.get('BING_MAPS_KEY', 'Insert Key')

# NOAA API token: same approach as the Bing key
noaa_api_token = os.environ.get('NOAA_API_TOKEN', "Insert Key")

# Function to search using Bing Maps API
def search_address(query):
    """Look up ``query`` with the Bing Maps Locations REST endpoint.

    Parameters
    ----------
    query : str
        Free-text location query.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200; None on any other status or when the
        request itself fails (timeout, connection error, ...).
    """
    url = "https://dev.virtualearth.net/REST/v1/Locations"
    # Passing the values via params lets requests URL-encode them. With the query
    # interpolated into the URL, an '&' in a winery name (e.g. "Wiese & Krohn")
    # ended the query parameter early.
    params = {'query': query, 'key': bing_maps_key}
    try:
        response = requests.get(url, params=params, timeout=10)
    except requests.exceptions.RequestException as exc:
        # Report and carry on, matching the NOAA helpers' best-effort behaviour
        print(f"Error in Bing Maps request: {exc}")
        return None
    if response.status_code == 200:
        return response.json()
    return None

# Function to extract address, location name, and geocoordinates from the result
def extract_address_info(result):
    """Pull the first resource's name, formatted address and coordinates from a response.

    Parameters
    ----------
    result : dict or None
        Decoded response expected to hold ``resourceSets[0].resources[0]``.

    Returns
    -------
    tuple
        ``(location_name, address, coordinates)``. All three are None when there
        is no usable resource; an individual element is None when that field is
        absent from the resource.
    """
    if not isinstance(result, dict):
        return None, None, None

    # An empty 'resourceSets' list used to raise IndexError
    resource_sets = result.get('resourceSets') or []
    if not resource_sets:
        return None, None, None

    resources = resource_sets[0].get('resources')
    if not resources:
        return None, None, None

    top_hit = resources[0]
    location_name = top_hit.get('name')
    # Missing 'address' / 'point' sections yield None instead of KeyError
    address = (top_hit.get('address') or {}).get('formattedAddress')
    coordinates = (top_hit.get('point') or {}).get('coordinates')
    return location_name, address, coordinates

# Function to check if the winery name is part of the business name in the location
def is_correct_business_match(winery_variations, location_name, address, country):
    """Return True when a winery variation occurs in the location name and the country is in the address.

    Both comparisons are case-insensitive substring tests.
    """
    location_lc = location_name.lower()
    for winery_variation in winery_variations:
        if winery_variation.lower() in location_lc:
            # A name matched; the country check now decides the outcome
            return is_country_match(country, address)
    return False

# Function to find the best match for the winery
def find_best_match(queries, winery_variations, regions, country):
    """Search each query in order and return the best geocoding hit.

    A hit whose location name contains a winery variation (and whose address
    contains the country) is returned immediately. Otherwise the first hit whose
    address matches the region/country test is returned after all queries have
    been tried. Returns ``(query, location_name, address, coordinates)`` or None.
    """
    best_match = None
    for query in queries:
        location_name, address, coordinates = extract_address_info(search_address(query))
        if not (location_name and address and coordinates):
            continue  # incomplete result, try the next query

        candidate = (query, location_name, address, coordinates)
        if is_correct_business_match(winery_variations, location_name, address, country):
            return candidate
        if is_region_country_match(regions, country, address) and best_match is None:
            # Remember only the first region-level hit as the fallback
            best_match = candidate

    return best_match

# Find the best match
best_match = find_best_match(queries, winery_variations, regions, country)
if best_match:
    query, location_name, address, coordinates = best_match
    print(f"Address Found: {query}, {location_name}, {address}, {coordinates}")
    lat, lon = coordinates
    station_results = get_stations_by_bounding_box(lat, lon, noaa_api_token)

    # The sampled wine's vintage selects which year of climate data to fetch.
    # This name was previously never assigned in the cell (NameError on a fresh kernel).
    vintage_year = random_row['Vintage']

    # metric -> month -> list of per-station values
    aggregated_weather_data = {
        "TMAX": defaultdict(list),
        "TMIN": defaultdict(list),
        "PRCP": defaultdict(list),
        "TAVG": defaultdict(list)
    }

    # Collecting data from each station
    for station_id in station_results:
        station_weather_data = get_mly_climate_data_for_station(station_id, vintage_year, noaa_api_token)
        if station_weather_data:
            for key, value in station_weather_data.items():
                if value is not None:
                    # Keys look like "TMAX_7": metric name, then month number
                    metric, month = key.split('_')
                    aggregated_weather_data[metric][int(month)].append(value)

    # Averaging the data across all stations
    averaged_weather_data = {}
    for metric, monthly_values in aggregated_weather_data.items():
        for month in range(1, 13):
            if monthly_values[month]:
                avg_value = sum(monthly_values[month]) / len(monthly_values[month])
                averaged_weather_data[f"{metric}_{month}"] = avg_value
            else:
                averaged_weather_data[f"{metric}_{month}"] = None

    print("Averaged Weather Data:", averaged_weather_data)
else:
    print("No valid address found.")

Address Found: Gadais Pere et Fils Wines Loire Valley France, Loire Valley, Indre-et-Loire, France, Loire Valley, Indre-et-Loire, France, [47.39888763, 0.7027778]
Averaged Weather Data: {'TMAX_1': 30.84444444444444, 'TMAX_2': 37.166666666666664, 'TMAX_3': 46.288888888888884, 'TMAX_4': 59.333333333333336, 'TMAX_5': 68.56, 'TMAX_6': 79.12, 'TMAX_7': 87.59, 'TMAX_8': 83.44, 'TMAX_9': 75.41, 'TMAX_10': 64.24444444444445, 'TMAX_11': 52.57777777777778, 'TMAX_12': 42.1, 'TMIN_1': 13.211111111111114, 'TMIN_2': 16.266666666666666, 'TMIN_3': 26.54444444444444, 'TMIN_4': 37.5875, 'TMIN_5': 48.279999999999994, 'TMIN_6': 57.160000000000004, 'TMIN_7': 63.89, 'TMIN_8': 60.03000000000001, 'TMIN_9': 51.85000000000001, 'TMIN_10': 41.22222222222222, 'TMIN_11': 30.299999999999997, 'TMIN_12': 22.766666666666666, 'PRCP_1': 1.59, 'PRCP_2': 1.9749999999999999, 'PRCP_3': 2.6, 'PRCP_4': 4.241428571428572, 'PRCP_5': 4.363333333333333, 'PRCP_6': 3.397777777777778, 'PRCP_7': 3.9377777777777783, 'PRCP_8': 6.4455555

After running the script a few times, it is obvious that it is not perfect. Not all locations can be pinpointed to the exact city or village where each wine is produced. This is mostly the fault of the wine data we have, since not all records have detailed location information. This could skew our data towards higher-class wines, for which more detailed location information is provided. The good news, however, is that there are lower-class wines from smaller countries where, even without perfect location data, the difference in weather data is within the margin of error (unless some geographic feature gives the area a different weather pattern). There are also wines where only the province or state is listed, so using the averaged weather data for the area would give a good indication, since these wines are meant to be made with grapes from all across the state, county, or province.

Next, we can write a full script that iterates through a CSV. To make sure we can get a result and confirm that the script works, I've created a CSV file that contains a small sample of wines for the script to be tested on.

In [146]:
# Constants for rate limiting
NOAA_RATE_LIMIT = 0.2  # seconds slept before each NOAA request
NOAA_DAILY_LIMIT = 10000  # the NOAA helpers refuse to send once the counter reaches this
noaa_request_count = 0  # Initialize request count

# Bing Maps API Key: read from the environment so a real key never has to be
# saved in the notebook; the placeholder remains the fallback value.
bing_maps_key = os.environ.get('BING_MAPS_KEY', 'Insert Key')

# NOAA API token: same approach as the Bing key
noaa_api_token = os.environ.get('NOAA_API_TOKEN', 'Insert Key')

# Function to clean and prepare keywords
def clean_keywords(text):
    """Strip parenthesised fragments and surrounding whitespace from ``text``.

    Missing values (anything ``pd.isna`` reports as missing) become "".
    """
    if pd.isna(text):
        return ""
    # Non-greedy match so each "( ... )" group is removed on its own
    without_parentheses = re.sub(r'\(.*?\)', '', text)
    return without_parentheses.strip()

# Function to create variations of the winery name
def create_winery_variations(winery):
    """Return the winery name with each common suffix appended, then the bare name last."""
    suffixes = ("Wines", "Wine", "Vineyard", "Estate", "Winery", "Vineyards")
    names = []
    for suffix in suffixes:
        names.append(f"{winery} {suffix}")
    names.append(winery)
    return names

# Function to construct search query
def construct_query(row, winery_variations):
    """Build the geocoding search strings for one wine row.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'Region 1', 'Region 2', 'Region 3' and 'Country'.
    winery_variations : list[str]
        Candidate winery names to prefix each query with.

    Returns
    -------
    tuple
        ``(queries, regions, country)``: the unique query strings in first-seen
        order, the three cleaned region strings, and the cleaned country.
    """
    regions = [clean_keywords(row['Region 1']), clean_keywords(row['Region 2']), clean_keywords(row['Region 3'])]
    country = clean_keywords(row['Country'])

    # Every contiguous run of regions (i..j) is combined with each winery variation
    combinations = []
    for winery_variation in winery_variations:
        for i in range(len(regions)):
            for j in range(i, len(regions)):
                query_parts = [winery_variation] + regions[i:j+1] + [country]
                # filter(None, ...) drops empty parts so blanks add no stray spaces
                combinations.append(' '.join(filter(None, query_parts)))

    # Empty regions make several runs collapse to the same text. Each duplicate
    # would cost another geocoding request, so keep only the first occurrence
    # (dict preserves insertion order).
    unique_combinations = list(dict.fromkeys(combinations))

    return unique_combinations, regions, country

# Function to calculate the bounding box
def get_gps_bounding_box(latitude, longitude, deg_lat=1.0, deg_lon=1.0):
    """Return ``(north, west, south, east)`` around a point, clamped to valid lat/lon limits.

    The box extends ``deg_lat`` degrees above and below ``latitude`` and
    ``deg_lon`` degrees either side of ``longitude``. Latitudes are clamped to
    [-90, 90] and longitudes to [-180, 180].
    """
    north = min(90, latitude + deg_lat)
    south = max(-90, latitude - deg_lat)
    east = min(180, longitude + deg_lon)
    west = max(-180, longitude - deg_lon)
    return north, west, south, east

# Function to find weather stations by bounding box with rate limiting
def get_stations_by_bounding_box(lat, lon, api_token):
    """Return station IDs found by the NCEI search service around a coordinate.

    Parameters
    ----------
    lat, lon : float
        Centre of the search; the box comes from ``get_gps_bounding_box``.
    api_token : str
        Sent in the 'token' request header.

    Returns
    -------
    list[str]
        The station part of each result's 'id' (text after the first ':' and
        before the first '.'). Empty when the daily limit has been reached, the
        response is not HTTP 200, the request times out, or any other error occurs.

    Side effects
    ------------
    Sleeps ``NOAA_RATE_LIMIT`` seconds and increments the global
    ``noaa_request_count`` for every request attempted.
    """
    global noaa_request_count
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached.")
        return []

    time.sleep(NOAA_RATE_LIMIT)
    n, w, s, e = get_gps_bounding_box(lat, lon)
    url = f"https://www.ncei.noaa.gov/access/services/search/v1/data?dataset=global-summary-of-the-month&boundingBox={n},{w},{s},{e}&dataTypes=TMIN,TMAX,PRCP,TAVG&limit=10&offset=0"
    headers = {'token': api_token}
    try:
        # Count the attempt up front so a timed-out request is never under-counted
        noaa_request_count += 1
        response = requests.get(url, headers=headers, timeout=10)  # 10 seconds timeout
        if response.status_code != 200:
            # Report the failure instead of returning an unexplained empty list
            print(f"NOAA station search returned HTTP {response.status_code}.")
            return []

        valid_stations = []
        for station in response.json().get('results', []):
            try:
                # Extract the valid part of the station ID
                valid_stations.append(station['id'].split(':')[1].split('.')[0])
            except (KeyError, IndexError, AttributeError, TypeError):
                # One malformed entry should not discard the remaining stations
                continue
        return valid_stations
    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
    except Exception as exc:
        print(f"Error in NOAA request: {exc}")
    return []

# Function to check if address matches the country
def is_country_match(country, address):
    """Return True when ``country`` occurs in ``address`` as a case-insensitive substring."""
    needle = country.lower()
    haystack = address.lower()
    return needle in haystack

# Function to check if address matches any of the regions and country
def is_region_country_match(regions, country, address):
    """Return True when ``address`` contains the country and at least one region word.

    Parameters
    ----------
    regions : iterable[str]
        Region strings; empty entries are ignored.
    country : str
        Country name that must occur in ``address`` (case-insensitive substring).
    address : str
        Formatted address to test.

    Notes
    -----
    The original mixed the country's own words into the list of words to look
    for, so the region test passed whenever the country matched and the function
    behaved like a country-only check. Only region words are tested now. When no
    region is supplied at all, the country match alone decides, as before.
    """
    address_lc = address.lower()
    if country.lower() not in address_lc:
        return False

    region_words = " ".join(r for r in regions if r).lower().split()
    if not region_words:
        # Nothing more specific than the country is known for this wine
        return True
    return any(word in address_lc for word in region_words)

# Function to search using Bing Maps API
def search_address(query):
    """Look up ``query`` with the Bing Maps Locations REST endpoint.

    Parameters
    ----------
    query : str
        Free-text location query.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200; None on any other status or when the
        request itself fails (timeout, connection error, ...).
    """
    url = "https://dev.virtualearth.net/REST/v1/Locations"
    # Passing the values via params lets requests URL-encode them. With the query
    # interpolated into the URL, an '&' in a winery name (e.g. "Wiese & Krohn")
    # ended the query parameter early.
    params = {'query': query, 'key': bing_maps_key}
    try:
        response = requests.get(url, params=params, timeout=10)
    except requests.exceptions.RequestException as exc:
        # Report and carry on, matching the NOAA helpers' best-effort behaviour
        print(f"Error in Bing Maps request: {exc}")
        return None
    if response.status_code == 200:
        return response.json()
    return None

# Function to get monthly climate data for a station with rate limiting
def get_mly_climate_data_for_station(station_id, vintage_year, api_token):
    """Fetch one station's monthly summaries for a year and average them per month.

    Parameters
    ----------
    station_id : str
        Station identifier placed in the ``stations`` query parameter.
    vintage_year : int or str
        Year requested, from ``<year>-01-01`` to ``<year>-12-31``.
    api_token : str
        Sent in the 'token' request header.

    Returns
    -------
    dict
        Keys ``'<TYPE>_<month>'`` for TYPE in TMIN/TMAX/PRCP/TAVG and month
        1..12; values are the mean of the readings found, or None when there
        were none. An empty dict is returned when the daily limit has been
        reached, the response is not HTTP 200, or the request fails.

    Side effects
    ------------
    Sleeps ``NOAA_RATE_LIMIT`` seconds and increments the global
    ``noaa_request_count`` for every request attempted.
    """
    global noaa_request_count
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached.")
        return {}

    data_types = ['TMIN', 'TMAX', 'PRCP', 'TAVG']
    start_date = f"{vintage_year}-01-01"
    end_date = f"{vintage_year}-12-31"
    time.sleep(NOAA_RATE_LIMIT)
    # NOTE(review): units=standard presumably returns imperial units, while the
    # station CSV values used earlier in the notebook look metric -- confirm
    # before combining the two sources.
    url = f"https://www.ncei.noaa.gov/access/services/data/v1?dataset=global-summary-of-the-month&dataTypes=TMIN,TMAX,PRCP,TAVG&stations={station_id}&startDate={start_date}&endDate={end_date}&format=json&units=standard&includeAttributes=false"
    headers = {'token': api_token}

    try:
        # Count the attempt up front so a timed-out request is never under-counted
        noaa_request_count += 1
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            # Report the failure instead of returning an unexplained empty dict
            print(f"NOAA data request returned HTTP {response.status_code}.")
            return {}

        monthly_data_raw = response.json()
        monthly_data = {f"{data_type}_{month}": [] for data_type in data_types for month in range(1, 13)}

        for record in monthly_data_raw:
            try:
                month = int(record['DATE'].split('-')[1])
            except (KeyError, IndexError, ValueError, AttributeError, TypeError):
                # A record without a usable date cannot be assigned to a month
                continue
            for data_type in data_types:
                raw_value = record.get(data_type)
                bucket = monthly_data.get(f"{data_type}_{month}")
                if raw_value is None or bucket is None:
                    continue
                try:
                    bucket.append(float(raw_value))
                except (TypeError, ValueError):
                    # Skip one unparseable reading rather than lose the whole station
                    continue

        # Averaging the data
        averaged_data = {key: sum(values) / len(values) if values else None for key, values in monthly_data.items()}
        return averaged_data

    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
    except Exception as exc:
        print(f"Error in NOAA request: {exc}")
    return {}

# Function to extract address, location name, and geocoordinates from the result
def extract_address_info(result):
    """Pull the first resource's name, formatted address and coordinates from a response.

    Parameters
    ----------
    result : dict or None
        Decoded response expected to hold ``resourceSets[0].resources[0]``.

    Returns
    -------
    tuple
        ``(location_name, address, coordinates)``. All three are None when there
        is no usable resource; an individual element is None when that field is
        absent from the resource.
    """
    if not isinstance(result, dict):
        return None, None, None

    # An empty 'resourceSets' list used to raise IndexError
    resource_sets = result.get('resourceSets') or []
    if not resource_sets:
        return None, None, None

    resources = resource_sets[0].get('resources')
    if not resources:
        return None, None, None

    top_hit = resources[0]
    location_name = top_hit.get('name')
    # Missing 'address' / 'point' sections yield None instead of KeyError
    address = (top_hit.get('address') or {}).get('formattedAddress')
    coordinates = (top_hit.get('point') or {}).get('coordinates')
    return location_name, address, coordinates

# Function to check if the winery name is part of the business name in the location
def is_correct_business_match(winery_variations, location_name, address, country):
    """Return True when a winery variation occurs in the location name and the country is in the address.

    Both comparisons are case-insensitive substring tests.
    """
    location_lc = location_name.lower()
    for winery_variation in winery_variations:
        if winery_variation.lower() in location_lc:
            # A name matched; the country check now decides the outcome
            return is_country_match(country, address)
    return False

# Function to find the best match for the winery
def find_best_match(queries, winery_variations, regions, country):
    """Search each query in order and return the best geocoding hit.

    A hit whose location name contains a winery variation (and whose address
    contains the country) is returned immediately. Otherwise the first hit whose
    address matches the region/country test is returned after all queries have
    been tried. Returns ``(query, location_name, address, coordinates)`` or None.
    """
    best_match = None
    for query in queries:
        location_name, address, coordinates = extract_address_info(search_address(query))
        if not (location_name and address and coordinates):
            continue  # incomplete result, try the next query

        candidate = (query, location_name, address, coordinates)
        if is_correct_business_match(winery_variations, location_name, address, country):
            return candidate
        if is_region_country_match(regions, country, address) and best_match is None:
            # Remember only the first region-level hit as the fallback
            best_match = candidate

    return best_match

# Function to process each wine entry
def process_wine_entry(row):
    """Geocode one wine row and attach station-averaged monthly climate values.

    Returns a dict with the wine's identifying fields, the matched location,
    its latitude/longitude and one ``'<METRIC>_<month>'`` entry per metric
    (TMAX, TMIN, PRCP, TAVG) and month 1..12 (None where no station reported a
    value). Returns None, after printing a message, when no address is found.
    """
    winery_variations = create_winery_variations(clean_keywords(row['Winery']))
    queries, regions, country = construct_query(row, winery_variations)

    best_match = find_best_match(queries, winery_variations, regions, country)
    if not best_match:
        print(f"No valid address found for {row['Winery']}")
        return None

    query, location_name, address, coordinates = best_match
    lat, lon = coordinates
    station_ids = get_stations_by_bounding_box(lat, lon, noaa_api_token)

    # metric -> month -> values reported by the individual stations
    metrics = ("TMAX", "TMIN", "PRCP", "TAVG")
    readings_by_metric = {metric: defaultdict(list) for metric in metrics}

    for station_id in station_ids:
        station_weather_data = get_mly_climate_data_for_station(station_id, row['Vintage'], noaa_api_token)
        if not station_weather_data:
            continue
        for key, value in station_weather_data.items():
            if value is None:
                continue
            # Keys look like "TMAX_7": metric name, then month number
            metric, month = key.split('_')
            readings_by_metric[metric][int(month)].append(value)

    # Average across stations, keeping metric-then-month column order
    averaged_weather_data = {}
    for metric in metrics:
        for month in range(1, 13):
            readings = readings_by_metric[metric].get(month)
            averaged_weather_data[f"{metric}_{month}"] = (
                sum(readings) / len(readings) if readings else None
            )

    record = {
        "Winery": row['Winery'],
        "Wine Name": row['Wine Name'],
        "Vintage": row['Vintage'],
        "Location Name": location_name,
        "Address": address,
        "Latitude": lat,
        "Longitude": lon,
    }
    record.update(averaged_weather_data)
    return record

# Read CSV file
wine_data = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')

# Process each wine entry
processed_wine_data = []
for _, row in wine_data.iterrows():
    # Stop once the NOAA budget is spent. Otherwise every remaining row would
    # still trigger geocoding requests and be saved with empty weather columns.
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached; stopping early.")
        break
    processed_data = process_wine_entry(row)
    if processed_data:
        processed_wine_data.append(processed_data)

# Convert the list of dictionaries to a DataFrame
weather_enhanced_wine_data = pd.DataFrame(processed_wine_data)

# Save the results
weather_enhanced_wine_data.to_csv('data/intermediate/weather_enhanced_wine_data.csv', index=False)

# Display the results
weather_enhanced_wine_data.head()

NOAA request timed out.
NOAA request timed out.
No valid address found for Wiese & Krohn
NOAA request timed out.
No valid address found for W. & J. Graham's
No valid address found for Wiese & Krohn
No valid address found for Wiese & Krohn
No valid address found for W. & J. Graham's
No valid address found for W. & J. Graham's
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
No valid address found for Wiese & Krohn
No valid address found for Wiese & Krohn
No valid address found for Sebastiani
NOAA request timed out.
No valid address found for Sebastiani
No valid address found for Sebastiani
No valid address found for Heidsieck & Co Monopole
No valid address found for Heidsieck & Co Monopole
No valid address found for Argyle
No valid address found for Heidsieck & Co Monopole
NOAA request timed out.
No valid address found for Wellington
NOAA request timed out.
NOAA request timed out.
No valid add

In [147]:
# Load the saved sample of weather-enhanced wine rows and display it for a
# visual sanity check.
# NOTE(review): this reads the "_sample" file, not the file written by the
# previous cell — presumably a manually saved subset; confirm its provenance.
test_data = pd.read_csv('data/intermediate/weather_enhanced_wine_data_sample.csv')
test_data

Unnamed: 0,Winery,Wine Name,Vintage,Location Name,Address,Latitude,Longitude,TMAX_1,TMAX_2,TMAX_3,...,TAVG_3,TAVG_4,TAVG_5,TAVG_6,TAVG_7,TAVG_8,TAVG_9,TAVG_10,TAVG_11,TAVG_12
0,Calem,Calem 1961 Colheita Tawny Port (Port),1961,Portugal,Portugal,39.682198,-7.968288,33.366667,43.200000,51.133333,...,41.166667,44.566667,55.766667,68.633333,72.833333,73.800000,64.900000,53.433333,39.066667,26.033333
1,Calem,Calem 1961 Colheita Tawny (Port),1961,Portugal,Portugal,39.682198,-7.968288,36.975000,47.466667,54.350000,...,44.050000,46.825000,57.875000,69.650000,74.025000,74.475000,66.975000,54.825000,42.400000,29.850000
2,Warre's,Warre's 1961 Reserve Tawny Port (Port),1961,Portugal,Portugal,39.682198,-7.968288,36.975000,47.466667,54.350000,...,44.050000,46.825000,57.875000,69.650000,74.025000,74.475000,66.975000,54.825000,42.400000,29.850000
3,Cossart Gordon,Cossart Gordon 1962 Bual (Madeira),1962,Portugal,Portugal,39.682198,-7.968288,37.066667,39.875000,47.700000,...,39.200000,51.175000,66.425000,70.025000,72.275000,73.425000,63.825000,57.425000,42.350000,29.350000
4,Van Zellers,Van Zellers 1962 Palmer Colheita White Port (P...,1962,Portugal,Portugal,39.682198,-7.968288,37.066667,39.875000,47.700000,...,39.200000,51.175000,66.425000,70.025000,72.275000,73.425000,63.825000,57.425000,42.350000,29.350000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,Salon,Salon 1990 Le Mesnil Blanc de Blancs Brut Char...,1990,"Salon-de-Provence, Bouches-du-Rhône, France","Salon-de-Provence, Bouches-du-Rhône, France",43.640297,5.086390,44.087500,43.911111,51.377778,...,41.055556,49.644444,57.144444,69.077778,72.777778,71.755556,65.711111,52.866667,43.655556,28.300000
186,Howard's Folly,Howard's Folly 1991 Casa Manoel Boullush White...,1991,Portugal,Portugal,39.682198,-7.968288,32.622222,44.622222,51.066667,...,40.522222,51.766667,63.700000,70.666667,74.200000,73.155556,63.644444,52.166667,35.911111,32.855556
187,Moulin Touchais,Moulin Touchais 1991 Chenin Blanc (Coteaux du ...,1991,"Loire Valley, Indre-et-Loire, France","Loire Valley, Indre-et-Loire, France",47.398888,0.702778,32.622222,44.622222,51.066667,...,40.522222,51.766667,63.700000,70.666667,74.200000,73.155556,63.644444,52.166667,35.911111,32.855556
188,Montecillo,Montecillo 1991 Seleccion Especial Gran Reserv...,1991,"Montecillo, Cantabria, Spain","Montecillo, Cantabria, Spain",42.774986,-3.975144,32.622222,44.622222,51.066667,...,40.522222,51.766667,63.700000,70.666667,74.200000,73.155556,63.644444,52.166667,35.911111,32.855556


From the previous test we can see that the script works to some extent. Next we add some more logic to the script: auto-saving progress whenever an error occurs, skipping wines that were already processed so that running the script multiple times does not create duplicates, and supporting more than one API key so that the whole CSV file can be processed within a reasonable time.

In [None]:
# --- NOAA API configuration -------------------------------------------------
# Constants for NOAA API rate limiting
NOAA_RATE_LIMIT = 0.2  # Seconds slept before each NOAA request
NOAA_DAILY_LIMIT = 10000  # Daily request limit per NOAA API token
noaa_request_count = 0  # Requests counted against the current token; reset to 0 when the token is rotated

# List of NOAA API tokens, used in order.
# NOTE(review): the values below look like placeholders — real tokens should
# not be committed to the notebook.
noaa_api_tokens = [
    'Key 1',
    'Key 2',
    'Key 3'
]
current_noaa_token_index = 0  # Position in noaa_api_tokens of the token currently in use

# --- Bing Maps API configuration --------------------------------------------
# Annual limit for Bing Maps API transactions (applied per key)
BING_MAPS_ANNUAL_LIMIT = 125000

# List of Bing Maps API keys, each with a running count of the transactions
# charged to it during this session (the counts start at 0 on every run).
bing_maps_keys_info = [
    {'key': 'Key 1', 'transactions': 0},
    {'key': 'Key 2', 'transactions': 0},
    {'key': 'Key 3', 'transactions': 0}
]
current_bing_key_index = 0  # Position in bing_maps_keys_info of the key currently in use

def get_current_noaa_api_token():
    """Return the NOAA API token at the currently selected position.

    Reads the module-level ``current_noaa_token_index`` and uses it to index
    ``noaa_api_tokens``.
    """
    active_position = current_noaa_token_index
    return noaa_api_tokens[active_position]

def increment_noaa_token_index():
    """Advance the module-level NOAA token index by one.

    Returns:
        bool: True when the new index still points at a token in
        ``noaa_api_tokens``; False (after printing a notice) when the index
        has moved past the last token.
    """
    global current_noaa_token_index
    current_noaa_token_index = current_noaa_token_index + 1
    tokens_remaining = current_noaa_token_index < len(noaa_api_tokens)
    if not tokens_remaining:
        print("All NOAA API tokens have reached their daily limit.")
    return tokens_remaining

def get_current_bing_maps_key():
    """Return the ``'key'`` field of the currently selected Bing Maps entry.

    The entry is taken from ``bing_maps_keys_info`` at the module-level
    ``current_bing_key_index``.
    """
    active_entry = bing_maps_keys_info[current_bing_key_index]
    return active_entry['key']

def increment_bing_key_index():
    """Advance the module-level Bing Maps key index by one.

    Returns:
        bool: True when the new index still points at an entry in
        ``bing_maps_keys_info``; False (after printing a notice) when the
        index has moved past the last entry.
    """
    global current_bing_key_index
    current_bing_key_index = current_bing_key_index + 1
    keys_remaining = current_bing_key_index < len(bing_maps_keys_info)
    if not keys_remaining:
        print("All Bing Maps API tokens have reached their annual limit.")
    return keys_remaining

def check_and_update_bing_transactions():
    """Reserve one Bing Maps transaction on the current key, rotating keys as needed.

    If the current key has already reached ``BING_MAPS_ANNUAL_LIMIT`` the
    function advances to the next key (via ``increment_bing_key_index``) until
    it finds one with budget left, then increments that key's
    ``'transactions'`` counter.

    Returns:
        bool: True when a transaction was reserved; False when every key is
        exhausted. Once exhausted, every later call also returns False.
    """
    # After the last key is used up, the index sits one past the end of the
    # list. Bail out here instead of raising IndexError on the lookup below.
    if current_bing_key_index >= len(bing_maps_keys_info):
        return False

    # Skip over every key that has no budget left (not just the current one).
    while bing_maps_keys_info[current_bing_key_index]['transactions'] >= BING_MAPS_ANNUAL_LIMIT:
        if not increment_bing_key_index():
            return False

    bing_maps_keys_info[current_bing_key_index]['transactions'] += 1
    return True

def clean_keywords(text):
    """Normalise a keyword for use in a search query.

    Missing values (as judged by ``pd.isna``) become an empty string. For
    anything else, every parenthesised fragment is removed and the result is
    stripped of leading/trailing whitespace.
    """
    if not pd.isna(text):
        # Drop each "(...)" fragment (non-greedy, so every pair is removed separately).
        without_parentheses = re.sub(r'\(.*?\)', '', text)
        return without_parentheses.strip()
    return ""

def create_winery_variations(winery):
    """Return candidate business names for a winery.

    The result lists the winery name followed by each common suffix
    ("Wines", "Wine", ...), in a fixed order, and ends with the bare name.
    """
    suffixes = ("Wines", "Wine", "Vineyard", "Estate", "Winery", "Vineyards")
    names = []
    for suffix in suffixes:
        names.append(f"{winery} {suffix}")
    names.append(winery)
    return names

def construct_query(row, winery_variations):
    """Build the list of search strings to try for one wine row.

    Args:
        row: mapping with 'Region 1', 'Region 2', 'Region 3' and 'Country'
            entries; each is passed through ``clean_keywords``.
        winery_variations: candidate winery names, tried in the given order.

    Returns:
        tuple: ``(combinations, regions, country)`` where ``combinations``
        holds one query per winery name and contiguous slice of the three
        regions (empty parts are dropped before joining with spaces),
        ``regions`` is the cleaned region list and ``country`` the cleaned
        country.
    """
    regions = [clean_keywords(row[column]) for column in ('Region 1', 'Region 2', 'Region 3')]
    country = clean_keywords(row['Country'])

    region_count = len(regions)
    # Every contiguous run regions[start:stop], for each winery name in turn.
    combinations = [
        ' '.join(part for part in [name, *regions[start:stop], country] if part)
        for name in winery_variations
        for start in range(region_count)
        for stop in range(start + 1, region_count + 1)
    ]

    return combinations, regions, country

def get_gps_bounding_box(latitude, longitude, deg_lat=1.0, deg_lon=1.0):
    """Return a box around a point, clamped to valid latitude/longitude limits.

    Args:
        latitude, longitude: centre of the box.
        deg_lat, deg_lon: half-height and half-width of the box in degrees.

    Returns:
        tuple: ``(north, west, south, east)``.
    """
    north = min(90, latitude + deg_lat)
    south = max(-90, latitude - deg_lat)
    east = min(180, longitude + deg_lon)
    west = max(-180, longitude - deg_lon)
    box = (north, west, south, east)
    return box

# Function to find weather stations by bounding box with rate limiting
def get_stations_by_bounding_box(lat, lon):
    """Search the NOAA NCEI service for stations near a coordinate.

    Queries the ``global-summary-of-the-month`` dataset for entries carrying
    TMIN/TMAX/PRCP/TAVG data inside the box returned by
    ``get_gps_bounding_box(lat, lon)`` (default half-size), asking for at most
    10 results.

    Side effects: sleeps ``NOAA_RATE_LIMIT`` seconds before the request,
    increments the module-level ``noaa_request_count`` once a response has
    been received, rotates the NOAA token when the daily limit is reached,
    and calls ``save_progress(processed_wine_data)`` when the request raises.

    Args:
        lat: latitude of the centre point.
        lon: longitude of the centre point.

    Returns:
        list: station identifiers extracted from each result's ``'id'``
        field; an empty list when all tokens are exhausted, the response
        status is not 200, or the request fails.
    """
    global noaa_request_count
    # Daily budget of the current token is used up: move to the next token.
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        if not increment_noaa_token_index():
            return []  # All NOAA tokens exhausted
        noaa_request_count = 0  # Reset request count for new token

    api_token = get_current_noaa_api_token()
    time.sleep(NOAA_RATE_LIMIT)  # Simple client-side rate limiting
    n, w, s, e = get_gps_bounding_box(lat, lon)
    url = f"https://www.ncei.noaa.gov/access/services/search/v1/data?dataset=global-summary-of-the-month&boundingBox={n},{w},{s},{e}&dataTypes=TMIN,TMAX,PRCP,TAVG&limit=10&offset=0"
    headers = {'token': api_token}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Only requests that returned a response are counted; a timeout skips this line.
        noaa_request_count += 1
        if response.status_code == 200:
            stations_data = response.json().get('results', [])
            # Keep the text between the first ':' and the following '.' of each id.
            # NOTE(review): presumably ids look like "<dataset>:<station>.<ext>" — confirm against the API.
            valid_stations = [station['id'].split(':')[1].split('.')[0] for station in stations_data]
            return valid_stations
        # Any other status code falls through to the empty-list return below.
    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
        save_progress(processed_wine_data)  # Save progress on error
    except Exception as e:
        print(f"Error in NOAA request: {e}")
        save_progress(processed_wine_data)  # Save progress on error
    return []

# Function to get monthly climate data for a station with rate limiting
def get_mly_climate_data_for_station(station_id, vintage_year):
    """Fetch one station's monthly climate summary for a single calendar year.

    Requests TMIN/TMAX/PRCP/TAVG from the NOAA NCEI
    ``global-summary-of-the-month`` dataset for 1 January to 31 December of
    ``vintage_year`` (JSON format, ``units=standard``).

    Side effects: sleeps ``NOAA_RATE_LIMIT`` seconds before the request,
    increments the module-level ``noaa_request_count`` once a response has
    been received, rotates the NOAA token when the daily limit is reached,
    and calls ``save_progress(processed_wine_data)`` when the request raises.

    Args:
        station_id: station identifier inserted into the ``stations`` query parameter.
        vintage_year: year used for the start and end dates.

    Returns:
        dict: keys ``"<TYPE>_<month>"`` for each of the four data types and
        months 1-12; each value is the mean of the readings collected for
        that key, or None when there were none. An empty dict is returned
        when all tokens are exhausted, the response status is not 200, or the
        request/parsing fails.
    """
    global noaa_request_count
    # Daily budget of the current token is used up: move to the next token.
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        if not increment_noaa_token_index():
            return {}  # All NOAA tokens exhausted
        noaa_request_count = 0  # Reset request count for new token

    api_token = get_current_noaa_api_token()
    time.sleep(NOAA_RATE_LIMIT)  # Simple client-side rate limiting
    url = f"https://www.ncei.noaa.gov/access/services/data/v1?dataset=global-summary-of-the-month&dataTypes=TMIN,TMAX,PRCP,TAVG&stations={station_id}&startDate={vintage_year}-01-01&endDate={vintage_year}-12-31&format=json&units=standard&includeAttributes=false"
    headers = {'token': api_token}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Only requests that returned a response are counted; a timeout skips this line.
        noaa_request_count += 1
        if response.status_code == 200:
            monthly_data_raw = response.json()
            # Pre-create an empty bucket for every (data type, month) pair so the
            # result always has the same 48 keys in the same order.
            monthly_data = {f"{data_type}_{month}": [] for data_type in ['TMIN', 'TMAX', 'PRCP', 'TAVG'] for month in range(1, 13)}

            for record in monthly_data_raw:
                # The month is the second '-'-separated field of the record's DATE.
                month = int(record['DATE'].split('-')[1])
                for data_type in ['TMIN', 'TMAX', 'PRCP', 'TAVG']:
                    if record.get(data_type) is not None:
                        monthly_data[f"{data_type}_{month}"].append(float(record[data_type]))
            # Averaging the data (None marks a bucket that received no reading)
            processed_data = {key: sum(values) / len(values) if values else None for key, values in monthly_data.items()}
            return processed_data
        # Any other status code falls through to the empty-dict return below.
    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
        save_progress(processed_wine_data)  # Save progress on error
    except Exception as e:
        print(f"Error in NOAA request: {e}")
        save_progress(processed_wine_data)  # Save progress on error
    return {}

def search_address(query):
    """Look up a free-text query with the Bing Maps Locations REST API.

    One transaction is reserved on the current key before the request. On a
    401 response the next key is selected and the query is retried.

    Args:
        query: free-text search string (may contain characters such as ``&``
            that are special in URLs).

    Returns:
        The decoded JSON response on HTTP 200; None when every key is
        exhausted, the status is anything else, or the request fails.
    """
    if not check_and_update_bing_transactions():
        return None  # All Bing Maps keys exhausted or reached limit

    key = get_current_bing_maps_key()
    url = "http://dev.virtualearth.net/REST/v1/Locations"
    # Pass the query through ``params`` so requests URL-encodes it. Formatting
    # it straight into the URL cut names like "Wiese & Krohn" at the "&",
    # because the remainder was parsed as a separate query parameter.
    request_params = {'query': query, 'key': key}
    try:
        # Same 10 s timeout as the NOAA calls, so a stalled connection cannot
        # hang the whole run. A timeout is a RequestException and is handled below.
        response = requests.get(url, params=request_params, timeout=10)
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 401:  # Assuming 401 indicates a key limit
            if increment_bing_key_index():
                return search_address(query)  # Retry with the next key
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        save_progress(processed_wine_data)  # Save progress on error
    return None

# Additional helper functions (e.g., is_country_match, is_region_country_match, extract_address_info)
def is_country_match(country, address):
    """Return True when ``country`` occurs in ``address``, ignoring case."""
    needle = country.lower()
    haystack = address.lower()
    return needle in haystack

def is_region_country_match(regions, country, address):
    """Return True when the address mentions a region and the country.

    The comparison ignores case. Empty or missing regions are skipped. The
    country is only examined once some region has been found in the address.
    """
    haystack = address.lower()
    for region in regions:
        if region and region.lower() in haystack:
            # A region matched; the outcome now depends only on the country.
            return country.lower() in haystack
    return False

def extract_address_info(result):
    """Pull name, formatted address and coordinates out of a location search result.

    Only the first resource of the first resource set is used.

    Args:
        result: decoded JSON response (a dict) or None.

    Returns:
        tuple: ``(location_name, address, coordinates)``. All three are None
        when there is no usable resource. For a resource that lacks some
        fields, the name and address default to ``''`` and the coordinates to
        None instead of raising ``KeyError``.
    """
    if not result or not isinstance(result, dict):
        return None, None, None

    resource_sets = result.get('resourceSets') or []
    if not resource_sets:
        return None, None, None

    # ``resources`` may be missing or empty when nothing matched.
    resources = resource_sets[0].get('resources') or []
    if not resources:
        return None, None, None

    top_hit = resources[0]
    location_name = top_hit.get('name', '')
    # Guard 'address' and 'coordinates' the same way 'point' was already guarded.
    address = (top_hit.get('address') or {}).get('formattedAddress', '')
    coordinates = (top_hit.get('point') or {}).get('coordinates')
    return location_name, address, coordinates

def append_to_csv(dataframe, filepath, key_column='Wine Name'):
    """Add the rows of ``dataframe`` to the CSV at ``filepath`` without duplicating rows.

    Compared with a plain ``to_csv(mode='a')`` this function:

    * writes nothing for an empty frame (an empty frame used to create a
      header-less file, which corrupted every later append);
    * skips rows whose ``key_column`` value is already present in the file, so
      repeated progress saves of the same growing list do not write the same
      wine many times;
    * keeps columns aligned with the existing header, rewriting the file with
      the combined columns when the new rows introduce columns the file does
      not have yet.

    Args:
        dataframe: rows to add.
        filepath: destination CSV path; created (with header) if missing or empty.
        key_column: column identifying a row; de-duplication is skipped when
            the column is absent from either the file or ``dataframe``.

    Returns:
        None. Prints "Data appended to: <filepath>" when rows were written.
    """
    if dataframe is None or dataframe.empty:
        return  # Nothing to write; never create a header-less file.

    existing = None
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        try:
            # round_trip keeps float text exact in case the file must be rewritten.
            existing = pd.read_csv(filepath, float_precision='round_trip')
        except pd.errors.EmptyDataError:
            existing = None  # File holds only whitespace.

    if existing is None or existing.empty:
        # No usable content yet: start the file with this frame's header.
        dataframe.to_csv(filepath, mode='w', header=True, index=False)
        print("Data appended to:", filepath)
        return

    if key_column in dataframe.columns and key_column in existing.columns:
        already_saved = set(existing[key_column])
        dataframe = dataframe[~dataframe[key_column].isin(already_saved)]
        if dataframe.empty:
            return  # Every row is already in the file.

    unseen_columns = [column for column in dataframe.columns if column not in existing.columns]
    if unseen_columns:
        # The header must grow, so rewrite the whole file with aligned columns.
        combined = pd.concat([existing, dataframe], ignore_index=True, sort=False)
        combined.to_csv(filepath, mode='w', header=True, index=False)
    else:
        # Same or fewer columns: reorder to the file's header and append.
        aligned = dataframe.reindex(columns=existing.columns)
        aligned.to_csv(filepath, mode='a', header=False, index=False)
    print("Data appended to:", filepath)

def save_progress(processed_data, filepath='data/intermediate/weather_enhanced_wine_data.csv'):
    """Persist the rows collected so far; called whenever an error occurs.

    ``processed_data`` is converted to a DataFrame and handed to
    ``append_to_csv`` together with ``filepath``.
    """
    append_to_csv(pd.DataFrame(processed_data), filepath)

def process_wine_entry(row):
    """Geocode one wine's winery and attach averaged monthly weather to it.

    Each candidate search query is tried in order. The first query whose
    location is accepted and for which at least one weather station is found
    produces the result.

    Args:
        row: wine record providing at least 'Winery', 'Wine Name' and
            'Vintage' (plus the fields read by ``construct_query``).

    Returns:
        dict with the wine's identity, the matched location and one averaged
        value per weather key that had data; None when no query produced an
        accepted location with stations.

    Raises:
        Exception: any error is reported, progress is saved, and the error is
        re-raised.
    """
    try:
        cleaned_winery = clean_keywords(row['Winery'])
        name_variants = create_winery_variations(cleaned_winery)
        queries, regions, country = construct_query(row, name_variants)

        for query in queries:
            location_name, address, coordinates = extract_address_info(search_address(query))
            if not (location_name and address and coordinates):
                continue

            # NOTE(review): is_correct_business_match is not defined in this
            # cell — presumably it comes from an earlier cell; confirm.
            accepted = (
                is_correct_business_match(name_variants, location_name, address, country)
                or is_region_country_match(regions, country, address)
            )
            if not accepted:
                continue

            lat, lon = coordinates
            station_ids = get_stations_by_bounding_box(lat, lon)
            if not station_ids:
                continue

            # Gather every station's reading per weather key, skipping gaps.
            readings = defaultdict(list)
            for station_id in station_ids:
                station_weather = get_mly_climate_data_for_station(station_id, row['Vintage'])
                for key, value in station_weather.items():
                    if value is None:
                        continue
                    readings[key].append(value)

            averaged = {key: sum(values) / len(values) for key, values in readings.items() if values}
            entry = {
                "Winery": row['Winery'],
                "Wine Name": row['Wine Name'],
                "Vintage": row['Vintage'],
                "Location Name": location_name,
                "Address": address,
                "Latitude": lat,
                "Longitude": lon,
            }
            entry.update(averaged)
            return entry

        print(f"No valid address found for {row['Winery']}")
        return None
    except Exception as e:
        print(f"Error processing {row['Winery']}: {e}")
        save_progress(processed_wine_data)  # Save progress on error
        raise  # Optionally re-raise the exception

# Main script execution
intermediate_file_path = 'data/intermediate/weather_enhanced_wine_data.csv'

# Read existing data if available. A missing, zero-byte or whitespace-only
# file simply means that nothing has been processed yet.
processed_wine_data_df = pd.DataFrame()
if os.path.exists(intermediate_file_path) and os.path.getsize(intermediate_file_path) > 0:
    try:
        processed_wine_data_df = pd.read_csv(intermediate_file_path)
    except pd.errors.EmptyDataError:
        processed_wine_data_df = pd.DataFrame()

# Read the main wine data
wine_data = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')

# Filter out already processed wines. On a fresh run the frame above has no
# 'Wine Name' column, so indexing it unconditionally would raise KeyError.
if 'Wine Name' in processed_wine_data_df.columns:
    processed_wine_names = set(processed_wine_data_df['Wine Name'])
else:
    processed_wine_names = set()
wine_data = wine_data[~wine_data['Wine Name'].isin(processed_wine_names)]

# Process each wine entry. The list is bound before the loop because the
# helper functions pass this module-level name to save_progress on errors.
processed_wine_data = []
for _, row in wine_data.iterrows():
    processed_data = process_wine_entry(row)
    if processed_data:
        processed_wine_data.append(processed_data)

# Append new data to the intermediate file (skipped when nothing new was
# produced, so an empty frame never touches the file)
new_processed_data_df = pd.DataFrame(processed_wine_data)
if not new_processed_data_df.empty:
    append_to_csv(new_processed_data_df, intermediate_file_path)

# Display the results
new_processed_data_df.head()

No valid address found for Wiese & Krohn
No valid address found for W. & J. Graham's
No valid address found for Wiese & Krohn
No valid address found for Wiese & Krohn
No valid address found for W. & J. Graham's
No valid address found for W. & J. Graham's
No valid address found for Wiese & Krohn
No valid address found for Wiese & Krohn
No valid address found for Sebastiani
No valid address found for Sebastiani
No valid address found for Sebastiani
No valid address found for Heidsieck & Co Monopole
No valid address found for Heidsieck & Co Monopole
No valid address found for Argyle
No valid address found for Heidsieck & Co Monopole
No valid address found for Wellington
No valid address found for Gan Eden
No valid address found for Argyle
No valid address found for Gloria Ferrer
No valid address found for Sebastiani
No valid address found for Fortino
No valid address found for Iron Horse
NOAA request timed out.
Data appended to: data/intermediate/weather_enhanced_wine_data.csv
NOAA reques

No valid address found for Broadbent
No valid address found for Iron Horse
NOAA request timed out.
Data appended to: data/intermediate/weather_enhanced_wine_data.csv
No valid address found for Mumm Cuvee Napa
No valid address found for Vigna Piccola
No valid address found for Agrapart & Fils
NOAA request timed out.
Data appended to: data/intermediate/weather_enhanced_wine_data.csv
No valid address found for Duck Pond
NOAA request timed out.
Data appended to: data/intermediate/weather_enhanced_wine_data.csv
No valid address found for Kollwentz
No valid address found for Stellenryck
No valid address found for Pellegrini Vineyards
No valid address found for Domaine Carneros
No valid address found for Glenora
No valid address found for Bruno Giacosa
No valid address found for Iron Horse
No valid address found for Kunde
No valid address found for Moet & Chandon
No valid address found for Roederer Estate
No valid address found for Columbia Crest
No valid address found for Bodegas Corral
No v