# Preprocessing - Weather Underground

In [15]:
#imports
import numpy as np
import pandas as pd
import os
import re
import string
import time
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [4]:
#set directory locations
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
wine_processed_folder = parent_directory + '/data/wine-com/processed/'
wunderground_raw_folder = parent_directory + '/data/wunderground-com/raw/'
wunderground_processed_folder = parent_directory + '/data/wunderground-com/processed/'

### Load Listing Data

In [5]:
df = pd.read_csv(wine_processed_folder + '1678665697.3855994.txt',
                 sep = '|')

In [51]:
df.head()

Unnamed: 0,Product_Name,Product_Varietal,Product_Origin,Product_Price,Product_Attributes,User_Avg_Rating,User_Rating_Count,Product_Vintage,Critical_Avg_Rating,Critical_Rating_Count,Appellation_Level,Weather_Origin,filename
0,Domaine Taupenot-Merme Nuits-Saint-Georges Les...,Pinot Noir,"Nuits-St-Georges, Cote de Nuits, Cote d'Or, Bu...",199.99,Red Wine,0.0,0,2017,93.0,3,5,"Ouges, Côte-d’Or, France","Ouges, Côte-d’Or, France_2017"
1,Domaine de la Romanee-Conti Echezeaux Grand Cru,Pinot Noir,"Flagey-Echezeaux, Cote de Nuits, Cote d'Or, Bu...",3299.99,Red Wine,0.0,0,2018,95.0,4,5,"Ouges, Côte-d’Or, France","Ouges, Côte-d’Or, France_2018"
2,Lincourt Rancho Santa Rosa Pinot Noir,Pinot Noir,"Sta. Rita Hills, Santa Barbara, Central Coast,...",40.99,Red Wine,4.0,31,2018,90.666667,3,5,"Goleta, CA","Goleta, CA_2018"
3,Domaine Claude Dugat Gevrey-Chambertin,Pinot Noir,"Gevrey-Chambertin, Cote de Nuits, Cote d'Or, B...",179.99,Red Wine,0.0,0,2020,92.0,3,5,"Ouges, Côte-d’Or, France","Ouges, Côte-d’Or, France_2020"
4,Domaine Prieur-Brunet Santenay Maladiere Premi...,Pinot Noir,"Santenay, Cote de Beaune, Cote d'Or, Burgundy,...",65.99,Red Wine,0.0,0,2020,92.333333,3,5,"Ouges, Côte-d’Or, France","Ouges, Côte-d’Or, France_2020"


### Load and Map Weather Locations

In [22]:
with open(wunderground_raw_folder + '/1_location-map.json', 'r') as fileObj:
    map_data = json.load(fileObj)
    fileObj.close()

In [44]:
def generate_weather_mappings(location, mapping_dict):
    try:
        weather_location = mapping_dict[location]
    except Exception:
        weather_location = None
    return weather_location

df['Weather_Origin'] = df['Product_Origin'].apply(generate_weather_mappings, mapping_dict = map_data)

### Retrieve/Process Weather Data

In [50]:
df['filename'] = df['Weather_Origin'] + '_' + df['Product_Vintage'].astype(str)

In [70]:
header_dict = {
    '1': 'Temperature',
    '2':' Dew_Point',
    '3': 'Humidity',
    '4': 'Wind_Speed ',
    '5': 'Pressure',
    '6': 'Precipitation'
}

In [104]:
file_lines = dict()
weather_data = dict()
for location in df['filename'].unique():
    # filter the list using the partial location match
    location_files = [filename for filename in os.listdir(wunderground_raw_folder) if str(location) in filename]
    if len(location_files) == 12:
        file_lines[location] = dict()
        weather_data[location] = dict()
        for file in location_files:
            with open(wunderground_raw_folder + file) as fileObj:
                file_lines[file]= fileObj.readlines()
                fileObj.close()
            data_categories = file_lines[file][1].split(',')
            idx = 0
            for category in data_categories:
                if idx == 0:
                    month_text, values = category.split(' ', 1)
                    days_values = values.split(' ')
                    days_count = len(days_values)
                    idx += 1
                elif idx == 6:
                    pre_text, values = category.split(' ', 1)
                    precipitation_values = values.strip('\n').split(' ')
                    precipitation_values = [float(i) for i in precipitation_values]
                    weather_data[location][month_text + '_Average_' + header_dict[str(idx)]] = sum(precipitation_values)/days_count
                    weather_data[location][month_text + '_Total_' + header_dict[str(idx)]] = sum(precipitation_values)
                else:
                    max_text, avg_text, min_text, values = category.split(' ', 3)
                    max_avg_min_values = values.split(' ')
                    max_avg_min_values = [float(i) for i in max_avg_min_values]
                    max_values = max_avg_min_values[0::3]
                    weather_data[location][month_text + '_Max_High_' + header_dict[str(idx)]] = max(max_values)
                    weather_data[location][month_text + '_Avg_High_' + header_dict[str(idx)]] = sum(max_values)/days_count
                    weather_data[location][month_text + '_Min_High_' + header_dict[str(idx)]] = min(max_values)
                    avg_values = max_avg_min_values[1::3]
                    weather_data[location][month_text + '_Avg_' + header_dict[str(idx)]] = sum(avg_values)/days_count
                    min_values = max_avg_min_values[2::3]
                    weather_data[location][month_text + '_Max_Low_' + header_dict[str(idx)]] = max(min_values)
                    weather_data[location][month_text + '_Avg_Low_' + header_dict[str(idx)]] = sum(min_values)/days_count
                    weather_data[location][month_text + '_Min_Low_' + header_dict[str(idx)]] = min(min_values)
                    idx += 1

In [112]:
weather_df = pd.DataFrame.from_dict(weather_data, orient = 'index').reset_index(drop=False)
weather_df['filename'] = weather_df['index']
weather_df.head(10)

Unnamed: 0,index,Jan_Max_High_Temperature,Jan_Avg_High_Temperature,Jan_Min_High_Temperature,Jan_Avg_Temperature,Jan_Max_Low_Temperature,Jan_Avg_Low_Temperature,Jan_Min_Low_Temperature,Jan_Max_High_ Dew_Point,Jan_Avg_High_ Dew_Point,...,Oct_Max_High_Pressure,Oct_Avg_High_Pressure,Oct_Min_High_Pressure,Oct_Avg_Pressure,Oct_Max_Low_Pressure,Oct_Avg_Low_Pressure,Oct_Min_Low_Pressure,Oct_Average_Precipitation,Oct_Total_Precipitation,filename
0,"Ouges, Côte-d’Or, France_2017",48.0,36.451613,27.0,31.074194,43.0,25.516129,14.0,46.0,29.451613,...,29.6,29.458065,29.1,29.406452,29.5,29.354839,29.1,0.0,0.0,"Ouges, Côte-d’Or, France_2017"
1,"Ouges, Côte-d’Or, France_2018",57.0,49.736842,43.0,45.789474,48.0,41.421053,30.0,52.0,45.263158,...,29.6,29.329032,28.8,29.232258,29.6,29.16129,28.3,0.0,0.0,"Ouges, Côte-d’Or, France_2018"
2,"Goleta, CA_2018",79.0,67.129032,60.0,55.219355,57.0,44.903226,35.0,59.0,51.032258,...,30.1,29.948387,29.8,29.9,30.0,29.848387,29.7,0.013548,0.42,"Goleta, CA_2018"
3,"Ouges, Côte-d’Or, France_2020",57.0,45.516129,32.0,39.390323,46.0,33.096774,23.0,52.0,40.129032,...,29.5,29.219355,28.5,29.16129,29.5,29.077419,28.3,0.0,0.0,"Ouges, Côte-d’Or, France_2020"
4,"Yakima, WA_2019",104.0,42.774194,31.0,32.883871,34.0,26.709677,19.0,97.0,33.774194,...,29.5,29.093548,28.6,28.977419,29.3,28.883871,28.4,0.016452,0.51,"Yakima, WA_2019"
5,"Ouges, Côte-d’Or, France_2016",57.0,45.129032,30.0,39.66129,45.0,34.354839,21.0,52.0,40.354839,...,29.8,29.377419,28.9,29.322581,29.7,29.258065,28.8,0.0,0.0,"Ouges, Côte-d’Or, France_2016"
6,"Ouges, Côte-d’Or, France_2019",48.0,40.677419,28.0,36.070968,41.0,31.387097,21.0,45.0,35.645161,...,29.4,29.274194,29.0,29.206452,29.3,29.112903,28.9,0.0,0.0,"Ouges, Côte-d’Or, France_2019"
7,"San Luis Obispo, CA_2019",76.0,63.258065,56.0,52.464516,53.0,43.774194,31.0,59.0,49.032258,...,30.0,29.796774,29.7,29.732258,29.9,29.687097,29.5,0.0,0.0,"San Luis Obispo, CA_2019"
8,"San Bernardino, CA_2019",81.0,66.129032,54.0,52.393548,57.0,44.354839,32.0,57.0,40.83871,...,29.0,28.790323,28.6,28.719355,28.8,28.63871,28.5,0.0,0.0,"San Bernardino, CA_2019"
9,"Victoria, British Columbia, Canada_2019",52.0,47.451613,43.0,44.958065,46.0,41.645161,36.0,48.0,40.645161,...,30.5,30.183871,29.6,30.116129,30.4,30.045161,29.6,0.0,0.0,"Victoria, British Columbia, Canada_2019"


### Merge Datasets

In [113]:
#perform sql-style left join of review data onto main df
df = pd.merge(df,
              weather_df,
              on='filename',
              how='left')

### Data Clean Up

In [115]:
df = df.drop(columns = ['index', 'filename'])

In [118]:
df = df.dropna()

In [119]:
df.shape

(1115, 456)

### Clean & Write Data

In [120]:
df.to_csv(wunderground_processed_folder + '1678665697.3855994.txt',
          sep = '|',
          index=False)