# Exploratory Data Analysis
## Load Dependencies

In [14]:
import csv
import pandas as pd
import numpy as np
from functools import reduce
import requests
import time
from datetime import datetime
import requests.exceptions
import os

## Load Dataframes

In [6]:
# Load the cleaned wine and weather datasets. Forward slashes are used
# because they work on every OS and avoid relying on the invalid "\c"
# escape sequence that the backslash form of these paths contains.
wine_scores = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')
weather_data = pd.read_csv('data/clean/combined_weather_data_clean.csv')

In [7]:
# Sanity check: preview the first rows of the wine data
wine_scores.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Calem 1961 Colheita Tawny Port (Port),,Port,,Portugal,95,320.0,Calem,Port Blend,1961
1,Calem 1961 Colheita Tawny (Port),,Port,,Portugal,95,320.0,Calem,Port,1961
2,Warre's 1961 Reserve Tawny Port (Port),,Port,,Portugal,89,111.0,Warre's,Port Blend,1961
3,Wiese & Krohn 1961 Colheita Port (Port),,Port,,Portugal,92,200.0,Wiese & Krohn,Port Blend,1961
4,Cossart Gordon 1962 Bual (Madeira),,Madeira,,Portugal,96,355.0,Cossart Gordon,Madeira,1962


In [5]:
# Sanity check: preview the first rows of the weather data
weather_data.head()

Unnamed: 0,Station ID,Country,City,Data Type,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,5046,INDONESIA,KIJANG TANJUNG PINANG,4,1993,25.6,25.6,25.7,26.0,25.9,26.8,25.8,26.3,25.8,25.4,25.4,25.9
1,5046,INDONESIA,KIJANG TANJUNG PINANG,4,1995,26.0,25.6,25.9,26.0,26.6,26.6,26.3,26.2,26.7,26.1,25.9,25.6
2,5046,INDONESIA,KIJANG TANJUNG PINANG,5,1993,246.6,61.9,285.8,324.2,424.1,169.6,261.7,157.9,240.3,398.5,473.6,683.9
3,5046,INDONESIA,KIJANG TANJUNG PINANG,5,1995,388.0,325.1,190.7,357.9,299.5,334.3,250.9,213.4,265.6,496.3,630.8,277.1
4,5046,INDONESIA,KIJANG TANJUNG PINANG,6,1993,29.6,30.7,30.6,31.3,29.8,31.2,30.7,31.3,30.7,30.3,29.9,29.9


## Working Dataframes
We need to create some temporary working dataframes for preliminary data exploration. First, let's reduce the amount of weather data by keeping only the rows for countries and years that we have wines for.

In [10]:
# Load the cleaned datasets. Forward slashes are portable and avoid the
# invalid "\c" escape sequence that the backslash form relies on.
wine_data_df = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')
weather_data_df = pd.read_csv('data/clean/combined_weather_data_clean.csv')

# Rename the 'Vintage' column in wine_data_df to 'Year' for consistency
wine_data_df = wine_data_df.rename(columns={'Vintage': 'Year'})

# Standardize country names in wine data to uppercase for matching
wine_data_df['Country'] = wine_data_df['Country'].str.upper()

# Distinct (Country, Year) pairs for which we have at least one wine
wine_country_years = wine_data_df[['Country', 'Year']].drop_duplicates()

# Keep only the weather rows whose (Country, Year) appears in the wine data.
# An inner merge against the de-duplicated key table keeps the weather rows
# in their original order, cannot duplicate them, and returns a brand-new
# frame with a fresh 0..n-1 index (so no separate reset_index on a slice).
matched_weather_data = weather_data_df.merge(
    wine_country_years, on=['Country', 'Year'], how='inner'
)

# Display the first few rows of the filtered weather data
matched_weather_data.head()

Unnamed: 0,Station ID,Country,City,Data Type,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,0,AUSTRALIA,SCONE SOIL CONS 0,4,1985,24.6,22.4,21.5,17.7,14.2,10.2,10.6,11.3,13.2,17.2,19.1,22.8
1,0,AUSTRALIA,SCONE SOIL CONS 0,4,1987,25.7,24.7,19.6,17.9,14.2,12.0,10.1,13.1,14.8,17.1,20.0,22.6
2,0,AUSTRALIA,SCONE SOIL CONS 0,5,1985,4.6,88.8,48.6,53.8,36.2,55.2,17.0,45.2,76.4,143.2,33.4,121.6
3,0,AUSTRALIA,SCONE SOIL CONS 0,5,1987,90.6,4.8,82.8,5.0,73.4,20.8,9.8,110.2,19.2,42.6,41.0,113.8
4,6600,SWITZERLAND,ST. CHRISCHONA,4,2004,0.9,2.3,4.8,9.5,11.7,15.7,17.2,17.7,14.4,10.7,4.0,0.2


Now we want to get rid of the Data Type column so that it's easier to read all relevant weather data in one dataframe. Atmospheric pressure isn't relevant for what we are trying to calculate, and not all locations measure it anyway, so we will drop data types 2 and 3 if they ever show up.

In [16]:
# Function to rename columns based on the data type
def rename_columns(df, data_type):
    """Prefix the twelve month columns of ``df`` with a measurement label.

    The label is looked up from ``data_type`` (4, 5, 6, 7 or 8); any other
    value falls back to ``'unknown'``. The rename is applied to ``df`` in
    place and the same object is returned. Month columns that are not
    present in ``df`` are simply skipped.
    """
    labels_by_type = {
        4: 'daily_temp',
        5: 'precipitation',
        6: 'daily_temp_MAX',
        7: 'daily_temp_MIN',
        8: 'humidity',
    }
    label = labels_by_type.get(data_type, 'unknown')

    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    column_map = {month: f'{label}_{month}' for month in months}

    df.rename(columns=column_map, inplace=True)
    return df

# Build one DataFrame per weather data type: month columns prefixed with the
# measurement name and the now-redundant 'Data Type' column removed
dfs = {}
for dtype in [4, 5, 6, 7, 8]:
    is_dtype = matched_weather_data['Data Type'] == dtype
    df = rename_columns(matched_weather_data[is_dtype].copy(), dtype)
    df = df.drop(columns=['Data Type'])
    dfs[dtype] = df

# Columns that identify a station-year observation across all data types
join_keys = ['Station ID', 'Country', 'City', 'Year']


def _outer_join(left, right):
    # Outer join keeps station-years that are missing some measurement types;
    # any overlapping non-key column coming from the right is tagged for removal
    return pd.merge(left, right, on=join_keys, how='outer', suffixes=('', '_duplicate'))


# Merge the per-type DataFrames back together into one wide table
merged_weather_df = reduce(_outer_join, dfs.values())

# Drop the columns tagged as duplicates and renumber the rows
kept_columns = [col for col in merged_weather_df.columns if not col.endswith('_duplicate')]
merged_weather_df = merged_weather_df[kept_columns].reset_index(drop=True)

# Check results
merged_weather_df.head()

Unnamed: 0,Station ID,Country,City,Year,daily_temp_Jan,daily_temp_Feb,daily_temp_Mar,daily_temp_Apr,daily_temp_May,daily_temp_Jun,...,humidity_Mar,humidity_Apr,humidity_May,humidity_Jun,humidity_Jul,humidity_Aug,humidity_Sep,humidity_Oct,humidity_Nov,humidity_Dec
0,0,AUSTRALIA,SCONE SOIL CONS 0,1985,24.6,22.4,21.5,17.7,14.2,10.2,...,,,,,,,,,,
1,0,AUSTRALIA,SCONE SOIL CONS 0,1987,25.7,24.7,19.6,17.9,14.2,12.0,...,,,,,,,,,,
2,6600,SWITZERLAND,ST. CHRISCHONA,2004,0.9,2.3,4.8,9.5,11.7,15.7,...,,,,,,,,,,
3,6601,SWITZERLAND,BASEL / BINNINGEN,2004,2.3,3.0,5.7,10.6,13.2,17.7,...,,,,,,,,,,
4,6601,SWITZERLAND,BASEL / BINNINGEN,2011,2.4,3.9,7.5,13.4,16.6,18.1,...,,,,,,,,,,


In [17]:
# Inspect column dtypes and non-null counts of the merged weather table
merged_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40321 entries, 0 to 40320
Data columns (total 64 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Station ID          40321 non-null  int64  
 1   Country             40321 non-null  object 
 2   City                40321 non-null  object 
 3   Year                40321 non-null  int64  
 4   daily_temp_Jan      39238 non-null  float64
 5   daily_temp_Feb      39238 non-null  float64
 6   daily_temp_Mar      39238 non-null  float64
 7   daily_temp_Apr      39238 non-null  float64
 8   daily_temp_May      39238 non-null  float64
 9   daily_temp_Jun      39238 non-null  float64
 10  daily_temp_Jul      39238 non-null  float64
 11  daily_temp_Aug      39238 non-null  float64
 12  daily_temp_Sep      39238 non-null  float64
 13  daily_temp_Oct      39238 non-null  float64
 14  daily_temp_Nov      39238 non-null  float64
 15  daily_temp_Dec      39238 non-null  float64
 16  prec

In [18]:
# Save dataframe to CSV (forward slashes: portable, and no reliance on the
# invalid "\c" / "\m" escape sequences of the backslash form)
merged_weather_df.to_csv('data/clean/merged_weather.csv', index=False)

Next we will keep only the wines whose region matches a weather-station city in the same country.

In [21]:
# Load Dataframes (forward-slash paths throughout: portable, and no reliance
# on the invalid "\c" / "\m" escape sequences of the backslash form)
wine_data = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')
merged_weather_df = pd.read_csv('data/clean/merged_weather.csv')

# Create a temporary DataFrame with unique city-country pairs. The explicit
# copy makes it an independent frame, so adding columns below cannot trigger
# pandas' chained-assignment (SettingWithCopy) warning.
unique_city_country = merged_weather_df[['City', 'Country']].drop_duplicates().copy()
unique_city_country['City'] = unique_city_country['City'].str.upper()

# Whitespace-separated words of each city name, used for word-level matching
unique_city_country['City Words'] = unique_city_country['City'].str.split()

# Function to check for any matching word and country match
def any_matching_word_and_country(row, city_country_df):
    """Return True if any region of ``row`` shares a word with a same-country city.

    ``row`` must provide 'Region 1', 'Region 2', 'Region 3' and 'Country'.
    ``city_country_df`` must provide 'Country' and 'City Words' (a list of
    words per city). Region words are upper-cased before comparison, city
    words are compared as given, and countries are compared
    case-insensitively. Regions that are missing (NaN/None) are skipped.
    """
    region_values = (row[key] for key in ('Region 1', 'Region 2', 'Region 3'))
    for region in region_values:
        if pd.isna(region):
            continue
        region_tokens = set(region.upper().split())
        for _, candidate in city_country_df.iterrows():
            # True when at least one city word also appears in the region name
            shares_word = not region_tokens.isdisjoint(candidate['City Words'])
            if shares_word and row['Country'].upper() == candidate['Country'].upper():
                return True
    return False

# Normalise the text columns used for matching: missing values become empty
# strings and every value is a str
region_columns = ['Region 1', 'Region 2', 'Region 3']
wine_data[region_columns] = wine_data[region_columns].fillna('').astype(str)
wine_data['Country'] = wine_data['Country'].fillna('').astype(str)

# Row-wise flag: does the wine match a weather-station city in its country?
has_station_match = wine_data.apply(
    any_matching_word_and_country, axis=1, args=(unique_city_country,)
)

# Keep only the matching wines
filtered_wine_data = wine_data[has_station_match]

# Check result
filtered_wine_data.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
43,Bodegas Dios Baco S.L. NV 1970 Oxford Pedro Xi...,Andalucia,,Jerez,Spain,85,40.0,Bodegas Dios Baco S.L.,Sherry,1970
75,Adega de Favaios 1980 Moscatel do Douro,,Moscatel do Douro,,Portugal,93,137.0,Adega de Favaios,Moscatel,1980
77,Adega de Favaios 1980 Moscatel (Moscatel do Do...,,Moscatel do Douro,,Portugal,93,137.0,Adega de Favaios,Muscat,1980
79,Cuva Vella 1980 Vintage Muscat (Valencia),Levante,,Valencia,Spain,90,65.0,Cuva Vella,Muscat,1980
87,Moulin Touchais 1982 Chenin Blanc (Coteaux du ...,Loire Valley,,Coteaux du Layon,France,95,64.0,Moulin Touchais,Chenin Blanc,1982


In [22]:
# Reset the index of the DataFrame. Rebinding to the returned frame (instead
# of inplace=True on a boolean-filtered slice) avoids pandas'
# SettingWithCopyWarning and leaves an independent, cleanly indexed frame.
filtered_wine_data = filtered_wine_data.reset_index(drop=True)

# Check result
filtered_wine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16760 entries, 0 to 16759
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Wine Name  16760 non-null  object 
 1   Region 1   16760 non-null  object 
 2   Region 2   16760 non-null  object 
 3   Region 3   16760 non-null  object 
 4   Country    16760 non-null  object 
 5   Score      16760 non-null  int64  
 6   Price      15406 non-null  float64
 7   Winery     16760 non-null  object 
 8   Variety    16705 non-null  object 
 9   Vintage    16760 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 1.3+ MB


In [24]:
# Save dataframe to CSV. Use the same forward-slash path that the later
# cells read back ('data/clean/filtered_wine_data.csv'); the backslash form
# only resolves to that location on Windows.
filtered_wine_data.to_csv('data/clean/filtered_wine_data.csv', index=False)

With such a low match rate, we need to approach this differently. We will use a weather API to obtain the weather information for wines that are missing related weather data. Keep in mind that some wines don't have a specific city associated with them, and some only list a wine region. So it makes more sense to have an API look up the geocoordinates of the wine region and then plug those into a weather API to get the necessary information.

## Implementing an API Solution

The following script uses the LocationIQ API to look up the geocoordinates of the regions mentioned in our list of wines, then uses those geocoordinates to fetch climate data from the NOAA API.

In [None]:
# Constants for rate limiting
# The *_RATE_LIMIT values are the pause (in seconds) taken before each request;
# the *_DAILY_LIMIT values cap how many requests this session will send.
LOCATIONIQ_RATE_LIMIT = 0.5  # seconds between requests (2 requests per second)
NOAA_RATE_LIMIT = 0.2  # seconds between requests (5 requests per second)
LOCATIONIQ_DAILY_LIMIT = 5000
NOAA_DAILY_LIMIT = 10000

# Counters for tracking requests
# Module-level so every helper below can update them via `global`; they start
# at zero each time this cell is run.
locationiq_request_count = 0
noaa_request_count = 0

# Function to get coordinates from LocationIQ API with rate limiting and daily limit
def get_locationiq_coordinates(region, api_key):
    """Geocode a region name to ``(latitude, longitude)`` via LocationIQ.

    Args:
        region: Free-text place name to look up.
        api_key: LocationIQ API key.

    Returns:
        A ``(lat, lon)`` tuple of floats taken from the first search hit, or
        ``(None, None)`` when the daily limit has been reached, the request
        fails, the service answers with a non-200 status, or nothing is found.

    Side effects:
        Sleeps ``LOCATIONIQ_RATE_LIMIT`` seconds before the request and
        increments the module-level ``locationiq_request_count`` once a
        response has been received.
    """
    global locationiq_request_count
    if locationiq_request_count >= LOCATIONIQ_DAILY_LIMIT:
        print("LocationIQ daily request limit reached.")
        return None, None

    time.sleep(LOCATIONIQ_RATE_LIMIT)
    try:
        # Let requests build the query string so that spaces, '&', accents,
        # etc. in the region name are URL-encoded correctly.
        response = requests.get(
            "https://us1.locationiq.com/v1/search.php",
            params={'key': api_key, 'q': region, 'format': 'json'},
            timeout=10,  # never hang forever on a stalled connection
        )
        locationiq_request_count += 1
        if response.status_code != 200:
            print(f"LocationIQ returned HTTP {response.status_code} for region {region}")
            return None, None
        data = response.json()
        if not data:
            # A successful reply with no hits is not an error
            return None, None
        return float(data[0]['lat']), float(data[0]['lon'])
    except (requests.exceptions.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        print(f"Error fetching coordinates for region {region}: {e}")
    return None, None

# Function to calculate the bounding box
def get_gps_bounding_box(latitude, longitude, deg_lat=1.0, deg_lon=1.0):
    """Return a box of +/- ``deg_lat`` / ``deg_lon`` degrees around a point.

    Latitudes are clamped to [-90, 90] and longitudes to [-180, 180].
    The result is ordered ``(north, west, south, east)``.
    """
    lat_high, lat_low = latitude + deg_lat, latitude - deg_lat
    lon_high, lon_low = longitude + deg_lon, longitude - deg_lon

    north = min(90, lat_high)
    south = max(-90, lat_low)
    east = min(180, lon_high)
    west = max(-180, lon_low)
    return north, west, south, east

# Function to find weather stations by bounding box with rate limiting
def get_stations_by_bounding_box(lat, lon, api_token):
    """Search NOAA's global-summary-of-the-month dataset around a point.

    Builds a bounding box around ``(lat, lon)`` with ``get_gps_bounding_box``
    and queries the NCEI search service for TMIN/TMAX/PRCP data inside it.

    Returns the ``'results'`` list of the JSON reply, or ``[]`` when the
    daily limit has been reached, the request fails or times out, or the
    status code is not 200.

    Sleeps ``NOAA_RATE_LIMIT`` seconds before the request and increments the
    module-level ``noaa_request_count`` once a response has been received.
    """
    global noaa_request_count
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached.")
        return []

    time.sleep(NOAA_RATE_LIMIT)
    north, west, south, east = get_gps_bounding_box(lat, lon)
    search_url = f"https://www.ncei.noaa.gov/access/services/search/v1/data?dataset=global-summary-of-the-month&boundingBox={north},{west},{south},{east}&dataTypes=TMIN,TMAX,PRCP&limit=10&offset=0"

    found = []
    try:
        reply = requests.get(search_url, headers={'token': api_token}, timeout=10)  # 10 seconds timeout
        noaa_request_count += 1
        if reply.status_code == 200:
            found = reply.json().get('results', [])
    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
    except Exception as err:
        print(f"Error in NOAA request: {err}")
    return found

# Function to get monthly climate data for a station with rate limiting
def get_mly_climate_data_for_station(station_id, vintage_year, api_token):
    """Fetch one calendar year of monthly TMIN/TMAX/PRCP data for a station.

    ``vintage_year`` is interpolated into the Jan 1 - Dec 31 date range of
    the request. Returns the decoded JSON reply, or ``[]`` when the daily
    limit has been reached, the request fails or times out, or the status
    code is not 200.

    Sleeps ``NOAA_RATE_LIMIT`` seconds before the request and increments the
    module-level ``noaa_request_count`` once a response has been received.
    """
    global noaa_request_count
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached.")
        return []

    # The request window is the whole vintage year
    first_day = f"{vintage_year}-01-01"
    last_day = f"{vintage_year}-12-31"

    time.sleep(NOAA_RATE_LIMIT)
    data_url = f"https://www.ncei.noaa.gov/access/services/data/v1?dataset=global-summary-of-the-month&dataTypes=TMIN,TMAX,PRCP&stations={station_id}&startDate={first_day}&endDate={last_day}&format=json&units=standard&includeAttributes=false"

    monthly_records = []
    try:
        reply = requests.get(data_url, headers={'token': api_token}, timeout=10)  # 10 seconds timeout
        noaa_request_count += 1
        if reply.status_code == 200:
            monthly_records = reply.json()
    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
    except Exception as err:
        print(f"Error in NOAA request: {err}")
    return monthly_records

# Function to process wine data with weather information
def process_wine_data_with_weather(wine_data, filtered_wine_data, locationiq_api_key, noaa_api_token):
    """Attach NOAA monthly climate data to wines that had no station match.

    For every wine in ``wine_data`` that is not also a row of
    ``filtered_wine_data``, the regions ('Region 1' to 'Region 3') are tried
    in order: geocode the region, look for NOAA data near it, and fetch the
    monthly data for the wine's vintage year. The first region that yields
    data wins.

    Args:
        wine_data: All wines; needs 'Wine Name', 'Vintage' and the three
            region columns.
        filtered_wine_data: Wines already matched to a weather station;
            these are skipped.
        locationiq_api_key: LocationIQ API key.
        noaa_api_token: NOAA API token.

    Returns:
        DataFrame with one row per wine for which data was found, with
        columns 'Wine Name', 'Vintage' (as a string), 'Region',
        'Station ID' and 'Weather Data'. Empty if nothing was found.
        Processing stops early once either API's daily limit is reached.
    """
    def _row_key(values):
        # NaN never equals NaN, so map missing values to None before comparing
        return tuple(None if pd.isna(v) else v for v in values)

    def _limit_reached():
        return (locationiq_request_count >= LOCATIONIQ_DAILY_LIMIT
                or noaa_request_count >= NOAA_DAILY_LIMIT)

    # Exclude wines already present in filtered_wine_data by comparing row
    # *values* on the shared columns. DataFrame.isin(DataFrame) aligns on the
    # index and treats NaN as unequal, so it does not work for a frame that
    # was re-indexed and reloaded from CSV.
    shared_cols = [col for col in wine_data.columns if col in filtered_wine_data.columns]
    if shared_cols:
        matched_keys = {
            _row_key(values)
            for values in filtered_wine_data[shared_cols].itertuples(index=False, name=None)
        }
        keep = [
            _row_key(values) not in matched_keys
            for values in wine_data[shared_cols].itertuples(index=False, name=None)
        ]
        unique_wines = wine_data[keep]
    else:
        unique_wines = wine_data

    # Successful lookups are cached so that wines sharing a region (and
    # vintage) do not spend the limited daily quota on identical requests.
    # Failures are not cached, so they are retried for later wines.
    coords_by_region = {}
    stations_by_coords = {}
    climate_by_station_year = {}
    weather_data = []

    for _, row in unique_wines.iterrows():
        if _limit_reached():
            break  # nothing more can be fetched today

        try:
            vintage_year = str(int(row['Vintage']))  # string form for the API request
        except (TypeError, ValueError):
            continue  # missing or non-numeric vintage

        for region_col in ['Region 1', 'Region 2', 'Region 3']:
            region = row[region_col]
            if pd.isna(region):
                continue

            coords = coords_by_region.get(region)
            if coords is None:
                lat, lon = get_locationiq_coordinates(region, locationiq_api_key)
                if lat is None or lon is None:
                    continue
                coords = coords_by_region[region] = (lat, lon)

            stations = stations_by_coords.get(coords)
            if stations is None:
                stations = get_stations_by_bounding_box(coords[0], coords[1], noaa_api_token)
                if not stations:
                    continue
                stations_by_coords[coords] = stations

            # NOTE(review): assumes each search result carries an 'id' that
            # the data endpoint accepts as a station id -- confirm.
            station_id = stations[0]['id']

            cache_key = (station_id, vintage_year)
            weather_data_for_station = climate_by_station_year.get(cache_key)
            if weather_data_for_station is None:
                weather_data_for_station = get_mly_climate_data_for_station(station_id, vintage_year, noaa_api_token)
                if weather_data_for_station:
                    climate_by_station_year[cache_key] = weather_data_for_station

            if weather_data_for_station:
                weather_data.append({
                    'Wine Name': row['Wine Name'],
                    'Vintage': vintage_year,
                    'Region': region,
                    'Station ID': station_id,
                    'Weather Data': weather_data_for_station
                })
                break  # Stop after getting data for the first valid region

    return pd.DataFrame(weather_data)
                                

# Load the original and filtered wine data
wine_data = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')
filtered_wine_data = pd.read_csv('data/clean/filtered_wine_data.csv')

# API credentials: taken from the environment when available so real keys
# never have to be pasted into the notebook; otherwise fill in the placeholder.
# LocationIQ API key
locationiq_api_key = os.environ.get("LOCATIONIQ_API_KEY", "INSERT API KEY")

# NOAA API token
noaa_api_token = os.environ.get("NOAA_API_TOKEN", "INSERT API KEY")

# Make sure the output folder exists *before* the long-running API loop, so
# the results cannot be lost to a failing save at the very end
output_path = 'data/intermediate/weather_enhanced_wine_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Process the wine data with weather information
weather_enhanced_wine_data = process_wine_data_with_weather(wine_data, filtered_wine_data, locationiq_api_key, noaa_api_token)

# Save the results to a CSV file
weather_enhanced_wine_data.to_csv(output_path, index=False)

# Display the results (last expression of the cell, so it is actually shown)
weather_enhanced_wine_data.head()

NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request timed out.
NOAA request tim

Since we are using the free service tier of these APIs, the previous script stops when we hit the daily request limit. So we need a slightly modified script that continues where the previous one left off.

In [32]:
# Constants for rate limiting
# The *_RATE_LIMIT values are the pause (in seconds) taken before each request;
# the *_DAILY_LIMIT values cap how many requests this session will send.
LOCATIONIQ_RATE_LIMIT = 0.5  # seconds between requests (2 requests per second)
NOAA_RATE_LIMIT = 0.2  # seconds between requests (5 requests per second)
LOCATIONIQ_DAILY_LIMIT = 5000
NOAA_DAILY_LIMIT = 10000

# Counters for tracking requests
# Module-level so every helper below can update them via `global`; they start
# at zero each time this cell is run.
locationiq_request_count = 0
noaa_request_count = 0

# Function to get coordinates from LocationIQ API with rate limiting and daily limit
def get_locationiq_coordinates(region, api_key):
    """Geocode a region name to ``(latitude, longitude)`` via LocationIQ.

    Args:
        region: Free-text place name to look up.
        api_key: LocationIQ API key.

    Returns:
        A ``(lat, lon)`` tuple of floats taken from the first search hit, or
        ``(None, None)`` when the daily limit has been reached, the request
        fails, the service answers with a non-200 status, or nothing is found.

    Side effects:
        Sleeps ``LOCATIONIQ_RATE_LIMIT`` seconds before the request and
        increments the module-level ``locationiq_request_count`` once a
        response has been received.
    """
    global locationiq_request_count
    if locationiq_request_count >= LOCATIONIQ_DAILY_LIMIT:
        print("LocationIQ daily request limit reached.")
        return None, None

    time.sleep(LOCATIONIQ_RATE_LIMIT)
    try:
        # Let requests build the query string so that spaces, '&', accents,
        # etc. in the region name are URL-encoded correctly.
        response = requests.get(
            "https://us1.locationiq.com/v1/search.php",
            params={'key': api_key, 'q': region, 'format': 'json'},
            timeout=10,  # never hang forever on a stalled connection
        )
        locationiq_request_count += 1
        if response.status_code != 200:
            print(f"LocationIQ returned HTTP {response.status_code} for region {region}")
            return None, None
        data = response.json()
        if not data:
            # A successful reply with no hits is not an error
            return None, None
        return float(data[0]['lat']), float(data[0]['lon'])
    except (requests.exceptions.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        print(f"Error fetching coordinates for region {region}: {e}")
    return None, None

# Function to calculate the bounding box
def get_gps_bounding_box(latitude, longitude, deg_lat=1.0, deg_lon=1.0):
    """Return a box of +/- ``deg_lat`` / ``deg_lon`` degrees around a point.

    Latitudes are clamped to [-90, 90] and longitudes to [-180, 180].
    The result is ordered ``(north, west, south, east)``.
    """
    lat_high, lat_low = latitude + deg_lat, latitude - deg_lat
    lon_high, lon_low = longitude + deg_lon, longitude - deg_lon

    north = min(90, lat_high)
    south = max(-90, lat_low)
    east = min(180, lon_high)
    west = max(-180, lon_low)
    return north, west, south, east

# Function to find weather stations by bounding box with rate limiting
def get_stations_by_bounding_box(lat, lon, api_token):
    """Search NOAA's global-summary-of-the-month dataset around a point.

    Builds a bounding box around ``(lat, lon)`` with ``get_gps_bounding_box``
    and queries the NCEI search service for TMIN/TMAX/PRCP data inside it.

    Returns the ``'results'`` list of the JSON reply, or ``[]`` when the
    daily limit has been reached, the request fails or times out, or the
    status code is not 200.

    Sleeps ``NOAA_RATE_LIMIT`` seconds before the request and increments the
    module-level ``noaa_request_count`` once a response has been received.
    """
    global noaa_request_count
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached.")
        return []

    time.sleep(NOAA_RATE_LIMIT)
    north, west, south, east = get_gps_bounding_box(lat, lon)
    search_url = f"https://www.ncei.noaa.gov/access/services/search/v1/data?dataset=global-summary-of-the-month&boundingBox={north},{west},{south},{east}&dataTypes=TMIN,TMAX,PRCP&limit=10&offset=0"

    found = []
    try:
        reply = requests.get(search_url, headers={'token': api_token}, timeout=10)  # 10 seconds timeout
        noaa_request_count += 1
        if reply.status_code == 200:
            found = reply.json().get('results', [])
    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
    except Exception as err:
        print(f"Error in NOAA request: {err}")
    return found

# Function to get monthly climate data for a station with rate limiting
def get_mly_climate_data_for_station(station_id, vintage_year, api_token):
    """Fetch one calendar year of monthly TMIN/TMAX/PRCP data for a station.

    ``vintage_year`` is interpolated into the Jan 1 - Dec 31 date range of
    the request. Returns the decoded JSON reply, or ``[]`` when the daily
    limit has been reached, the request fails or times out, or the status
    code is not 200.

    Sleeps ``NOAA_RATE_LIMIT`` seconds before the request and increments the
    module-level ``noaa_request_count`` once a response has been received.
    """
    global noaa_request_count
    if noaa_request_count >= NOAA_DAILY_LIMIT:
        print("NOAA daily request limit reached.")
        return []

    # The request window is the whole vintage year
    first_day = f"{vintage_year}-01-01"
    last_day = f"{vintage_year}-12-31"

    time.sleep(NOAA_RATE_LIMIT)
    data_url = f"https://www.ncei.noaa.gov/access/services/data/v1?dataset=global-summary-of-the-month&dataTypes=TMIN,TMAX,PRCP&stations={station_id}&startDate={first_day}&endDate={last_day}&format=json&units=standard&includeAttributes=false"

    monthly_records = []
    try:
        reply = requests.get(data_url, headers={'token': api_token}, timeout=10)  # 10 seconds timeout
        noaa_request_count += 1
        if reply.status_code == 200:
            monthly_records = reply.json()
    except requests.exceptions.Timeout:
        print("NOAA request timed out.")
    except Exception as err:
        print(f"Error in NOAA request: {err}")
    return monthly_records

def process_wine_data_with_weather(wine_data, filtered_wine_data, processed_wine_data, locationiq_api_key, noaa_api_token):
    """Continue attaching NOAA climate data to wines, keeping earlier results.

    For every wine in ``wine_data`` that is not also a row of
    ``filtered_wine_data``, the regions ('Region 1' to 'Region 3') are tried
    in order: geocode the region, look for NOAA data near it, and fetch the
    monthly data for the wine's vintage year. The first region that yields
    data wins.

    Args:
        wine_data: Wines still to process; needs 'Wine Name', 'Vintage' and
            the three region columns.
        filtered_wine_data: Wines already matched to a weather station;
            these are skipped.
        processed_wine_data: Results of earlier runs (may be empty).
        locationiq_api_key: LocationIQ API key.
        noaa_api_token: NOAA API token.

    Returns:
        ``processed_wine_data`` followed by the newly fetched rows (columns
        'Wine Name', 'Vintage', 'Region', 'Station ID', 'Weather Data').
        Earlier results are always included, even when processing stops
        early because a daily API limit was reached.
    """
    def _row_key(values):
        # NaN never equals NaN, so map missing values to None before comparing
        return tuple(None if pd.isna(v) else v for v in values)

    def _limit_reached():
        return (locationiq_request_count >= LOCATIONIQ_DAILY_LIMIT
                or noaa_request_count >= NOAA_DAILY_LIMIT)

    # Exclude wines already present in filtered_wine_data by comparing row
    # *values* on the shared columns. DataFrame.isin(DataFrame) aligns on the
    # index and treats NaN as unequal, so it does not work for a frame that
    # was re-indexed and reloaded from CSV.
    shared_cols = [col for col in wine_data.columns if col in filtered_wine_data.columns]
    if shared_cols:
        matched_keys = {
            _row_key(values)
            for values in filtered_wine_data[shared_cols].itertuples(index=False, name=None)
        }
        keep = [
            _row_key(values) not in matched_keys
            for values in wine_data[shared_cols].itertuples(index=False, name=None)
        ]
        unique_wines = wine_data[keep]
    else:
        unique_wines = wine_data

    # Successful lookups are cached so that wines sharing a region (and
    # vintage) do not spend the limited daily quota on identical requests.
    # Failures are not cached, so they are retried for later wines.
    coords_by_region = {}
    stations_by_coords = {}
    climate_by_station_year = {}
    weather_data = []

    for _, row in unique_wines.iterrows():
        if _limit_reached():
            break  # nothing more can be fetched today

        vintage_year = row['Vintage']
        try:
            vintage_for_api = str(int(vintage_year))
        except (TypeError, ValueError):
            continue  # missing or non-numeric vintage

        for region_col in ['Region 1', 'Region 2', 'Region 3']:
            region = row[region_col]
            if pd.isna(region):
                continue

            coords = coords_by_region.get(region)
            if coords is None:
                lat, lon = get_locationiq_coordinates(region, locationiq_api_key)
                if lat is None or lon is None:
                    continue
                coords = coords_by_region[region] = (lat, lon)

            stations = stations_by_coords.get(coords)
            if stations is None:
                stations = get_stations_by_bounding_box(coords[0], coords[1], noaa_api_token)
                if not stations:
                    continue
                stations_by_coords[coords] = stations

            # NOTE(review): assumes each search result carries an 'id' that
            # the data endpoint accepts as a station id -- confirm.
            station_id = stations[0]['id']

            cache_key = (station_id, vintage_for_api)
            weather_data_for_station = climate_by_station_year.get(cache_key)
            if weather_data_for_station is None:
                weather_data_for_station = get_mly_climate_data_for_station(station_id, vintage_for_api, noaa_api_token)
                if weather_data_for_station:
                    climate_by_station_year[cache_key] = weather_data_for_station

            if weather_data_for_station:
                weather_data.append({
                    'Wine Name': row['Wine Name'],
                    'Vintage': vintage_year,
                    'Region': region,
                    'Station ID': station_id,
                    'Weather Data': weather_data_for_station
                })
                break  # Stop after getting data for the first valid region

    # Always hand back the earlier results together with the new ones, so the
    # caller's save cannot overwrite previous progress with a partial frame.
    new_rows = pd.DataFrame(weather_data)
    if new_rows.empty:
        return processed_wine_data
    if processed_wine_data.empty:
        return new_rows
    return pd.concat([processed_wine_data, new_rows], ignore_index=True)

# Location of the results accumulated by previous runs
processed_file_path = 'data/intermediate/weather_enhanced_wine_data.csv'

# Make sure the output folder exists before the long-running API loop starts
os.makedirs(os.path.dirname(processed_file_path), exist_ok=True)

# Load the previous results if there are any. On a first run (no file, or a
# file without any content) fall back to an empty frame that still has the
# expected columns, so the 'Wine Name' lookup below cannot raise a KeyError.
try:
    processed_wine_data = pd.read_csv(processed_file_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    processed_wine_data = pd.DataFrame(
        columns=['Wine Name', 'Vintage', 'Region', 'Station ID', 'Weather Data']
    )

# Load the original and filtered wine data
wine_data = pd.read_csv('data/clean/cleaned_combined_wine_data.csv')
filtered_wine_data = pd.read_csv('data/clean/filtered_wine_data.csv')

# Exclude wines already processed
wine_data = wine_data[~wine_data['Wine Name'].isin(processed_wine_data['Wine Name'])]

# API credentials: taken from the environment when available so real keys
# never have to be pasted into the notebook; otherwise fill in the placeholder.
# LocationIQ API key
locationiq_api_key = os.environ.get("LOCATIONIQ_API_KEY", "INSERT API KEY")

# NOAA API token
noaa_api_token = os.environ.get("NOAA_API_TOKEN", "INSERT API KEY")

# Process the wine data with weather information
weather_enhanced_wine_data = process_wine_data_with_weather(wine_data, filtered_wine_data, processed_wine_data, locationiq_api_key, noaa_api_token)

# Save the updated results to the CSV file
weather_enhanced_wine_data.to_csv(processed_file_path, index=False)

# Display the results
weather_enhanced_wine_data.head()

NameError: name 'weather_enhanced_wine_data' is not defined