In [419]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [420]:
df_wine = pd.read_csv('data/sonoma/sonoma_wine_cleaned.csv')
# Selected 9 weather stations, based on EDA results and will use an updated dataframe
df_weather = pd.read_csv('data/updated_weather_df.csv')

Steps for Pre-processing: 
- Add a Year column to df_weather (after compressing each station data by year)
- Add a County column to df_weather
- Drop unnecessary columns (all cols except Year and Yield for df_weather and Moving Average columns and Name column in df_weather)
- Merge df_wine to df_weather on columns Year and County

In [421]:
df_wine.head()

Unnamed: 0,Year,CommodityCode,CropName,CountyCode,County,HarvestedAcres,Yield(Unit/Acre),Production,Price(Dollars/Unit),Unit,Value(Dollars)
0,2020,216299,GRAPESWINE,97,Sonoma,56800.0,2.61,148000.0,2375.08,Tons,351512000
1,2019,216299,GRAPESWINE,97,Sonoma,57500.0,4.0,230000.0,2843.57,Tons,654021000
2,2018,216299,GRAPESWINE,97,Sonoma,59900.0,4.61,276000.0,2817.66,Tons,777675000
3,2017,216299,GRAPESWINE,97,Sonoma,60000.0,3.43,206000.0,2807.34,Tons,578313000
4,2016,216299,GRAPESWINE,97,Sonoma,60000.0,3.77,226000.0,2595.21,Tons,586518000


In [422]:
df_weather.head()

Unnamed: 0,STATION,NAME,PRCP,TMAX,TMIN,Moving_Avg_PRCP_yr,Moving_Avg_TMIN_yr,Moving_Avg_TMAX_yr,DATE
0,USC00048351,"SONOMA, CA US",0.0,,,,,,1980-01-01
1,USC00048351,"SONOMA, CA US",0.0,61.0,37.0,,,,1980-01-02
2,USC00048351,"SONOMA, CA US",0.0,61.0,35.0,,,,1980-01-03
3,USC00048351,"SONOMA, CA US",0.05,56.0,41.0,,,,1980-01-04
4,USC00048351,"SONOMA, CA US",0.01,56.0,44.0,,,,1980-01-05


In [423]:
df_weather.STATION.unique()

array(['USC00048351', 'USC00047965', 'USC00043578', 'USC00043875',
       'USC00046370', 'USC00046826', 'USC00043191', 'USC00041838',
       'USW00023213'], dtype=object)

In [424]:
df_weather.drop(['Moving_Avg_PRCP_yr', 'Moving_Avg_TMIN_yr', 'Moving_Avg_TMAX_yr', 'NAME'], axis = 1, inplace = True)

In [425]:
df_weather.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,DATE
0,USC00048351,0.0,,,1980-01-01
1,USC00048351,0.0,61.0,37.0,1980-01-02
2,USC00048351,0.0,61.0,35.0,1980-01-03
3,USC00048351,0.05,56.0,41.0,1980-01-04
4,USC00048351,0.01,56.0,44.0,1980-01-05


In [426]:
df_wine.columns

Index(['Year', 'CommodityCode', 'CropName', 'CountyCode', 'County',
       'HarvestedAcres', 'Yield(Unit/Acre)', 'Production',
       'Price(Dollars/Unit)', 'Unit', 'Value(Dollars)'],
      dtype='object')

In [427]:
df_wine.drop(['CommodityCode', 'CropName', 'CountyCode', 'HarvestedAcres', 'Production',
       'Price(Dollars/Unit)', 'Unit', 'Value(Dollars)'], axis = 1, inplace = True)

In [428]:
df_wine.head()

Unnamed: 0,Year,County,Yield(Unit/Acre)
0,2020,Sonoma,2.61
1,2019,Sonoma,4.0
2,2018,Sonoma,4.61
3,2017,Sonoma,3.43
4,2016,Sonoma,3.77


In [429]:
df_weather.dtypes

STATION     object
PRCP       float64
TMAX       float64
TMIN       float64
DATE        object
dtype: object

In [430]:
df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])
df_weather = df_weather.set_index('DATE')

In [431]:
date_grouped = df_weather.groupby([df_weather.index.year, df_weather.STATION])
print(type(date_grouped))
date_grouped.head()

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


Unnamed: 0_level_0,STATION,PRCP,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-01-01,USC00048351,0.00,,
1980-01-02,USC00048351,0.00,61.0,37.0
1980-01-03,USC00048351,0.00,61.0,35.0
1980-01-04,USC00048351,0.05,56.0,41.0
1980-01-05,USC00048351,0.01,56.0,44.0
...,...,...,...,...
2020-01-01,USW00023213,0.00,64.0,42.0
2020-01-02,USW00023213,0.00,68.0,38.0
2020-01-03,USW00023213,0.00,61.0,36.0
2020-01-04,USW00023213,0.10,62.0,37.0


In [432]:
mean_precip = date_grouped['PRCP'].mean()
mean_precip

DATE  STATION    
1980  USC00048351    0.069945
1981  USC00048351    0.095836
1982  USC00048351    0.184082
1983  USC00048351    0.179315
1984  USC00048351    0.065881
                       ...   
2020  USC00046370    0.054576
      USC00046826    0.027143
      USC00047965    0.031477
      USC00048351    0.025565
      USW00023213    0.028333
Name: PRCP, Length: 301, dtype: float64

In [433]:
type(mean_precip)

pandas.core.series.Series

In [434]:
mean_tmin = date_grouped['TMIN'].mean()
mean_tmin.head()

DATE  STATION    
1980  USC00048351    44.373134
1981  USC00048351    45.407713
1982  USC00048351    42.228650
1983  USC00048351    46.047486
1984  USC00048351    43.486567
Name: TMIN, dtype: float64

In [435]:
mean_tmax = date_grouped['TMAX'].mean()
mean_tmax.head()

DATE  STATION    
1980  USC00048351    75.148352
1981  USC00048351    76.173077
1982  USC00048351    73.478022
1983  USC00048351    73.515152
1984  USC00048351    75.376119
Name: TMAX, dtype: float64

In [436]:
type(mean_tmax)
df_tmax = mean_tmax.to_frame()
df_tmax.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,TMAX
DATE,STATION,Unnamed: 2_level_1
1980,USC00048351,75.148352
1981,USC00048351,76.173077
1982,USC00048351,73.478022
1983,USC00048351,73.515152
1984,USC00048351,75.376119


In [437]:
df_temp = df_tmax.merge(mean_tmin, left_index=True, right_index=True)
df_averages = df_temp.merge(mean_precip, left_index=True, right_index=True)
df_averages.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,TMAX,TMIN,PRCP
DATE,STATION,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019,USW00023213,72.216438,45.347945,0.142356
2020,USC00041838,75.994444,46.672222,0.039159
2020,USC00043191,64.949612,35.634241,0.042478
2020,USC00043578,75.822404,44.062842,0.045027
2020,USC00043875,76.806011,47.918994,0.034208
2020,USC00046370,69.721591,48.09375,0.054576
2020,USC00046826,73.288515,45.896359,0.027143
2020,USC00047965,75.229462,46.033994,0.031477
2020,USC00048351,75.133531,46.396396,0.025565
2020,USW00023213,74.661202,44.480874,0.028333


In [438]:
df_averages.reset_index(inplace=True)
df_averages.head()

Unnamed: 0,DATE,STATION,TMAX,TMIN,PRCP
0,1980,USC00048351,75.148352,44.373134,0.069945
1,1981,USC00048351,76.173077,45.407713,0.095836
2,1982,USC00048351,73.478022,42.22865,0.184082
3,1983,USC00048351,73.515152,46.047486,0.179315
4,1984,USC00048351,75.376119,43.486567,0.065881


In [454]:
df_averages.rename(columns={'DATE': 'Year'}, inplace=True)
df_averages.head()

Unnamed: 0,Year,STATION,TMAX,TMIN,PRCP
0,1980,USC00048351,75.148352,44.373134,0.069945
1,1981,USC00048351,76.173077,45.407713,0.095836
2,1982,USC00048351,73.478022,42.22865,0.184082
3,1983,USC00048351,73.515152,46.047486,0.179315
4,1984,USC00048351,75.376119,43.486567,0.065881


In [439]:
df_weather['PRCP'].isna().sum()

3783

In [440]:
df_weather.head(10)

Unnamed: 0_level_0,STATION,PRCP,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-01-01,USC00048351,0.0,,
1980-01-02,USC00048351,0.0,61.0,37.0
1980-01-03,USC00048351,0.0,61.0,35.0
1980-01-04,USC00048351,0.05,56.0,41.0
1980-01-05,USC00048351,0.01,56.0,44.0
1980-01-06,USC00048351,0.0,56.0,43.0
1980-01-07,USC00048351,0.0,64.0,41.0
1980-01-08,USC00048351,0.0,60.0,47.0
1980-01-09,USC00048351,0.5,55.0,48.0
1980-01-10,USC00048351,0.06,55.0,38.0


In [441]:
df_weather.reset_index(inplace=True)
df_weather.head()

Unnamed: 0,DATE,STATION,PRCP,TMAX,TMIN
0,1980-01-01,USC00048351,0.0,,
1,1980-01-02,USC00048351,0.0,61.0,37.0
2,1980-01-03,USC00048351,0.0,61.0,35.0
3,1980-01-04,USC00048351,0.05,56.0,41.0
4,1980-01-05,USC00048351,0.01,56.0,44.0


In [442]:
df_weather.dtypes

DATE       datetime64[ns]
STATION            object
PRCP              float64
TMAX              float64
TMIN              float64
dtype: object

In [443]:
df_weather['Year'] = pd.DatetimeIndex(df_weather['DATE']).year
df_weather['County'] = 'Sonoma'
df_weather.tail(10)

Unnamed: 0,DATE,STATION,PRCP,TMAX,TMIN,Year,County
104616,2020-12-22,USW00023213,0.0,63.0,34.0,2020,Sonoma
104617,2020-12-23,USW00023213,0.0,58.0,28.0,2020,Sonoma
104618,2020-12-24,USW00023213,0.0,56.0,30.0,2020,Sonoma
104619,2020-12-25,USW00023213,1.06,54.0,39.0,2020,Sonoma
104620,2020-12-26,USW00023213,0.01,60.0,37.0,2020,Sonoma
104621,2020-12-27,USW00023213,0.0,58.0,34.0,2020,Sonoma
104622,2020-12-28,USW00023213,0.0,60.0,35.0,2020,Sonoma
104623,2020-12-29,USW00023213,0.0,64.0,31.0,2020,Sonoma
104624,2020-12-30,USW00023213,0.15,57.0,29.0,2020,Sonoma
104625,2020-12-31,USW00023213,0.0,62.0,37.0,2020,Sonoma


In [444]:
df_weather.drop(['DATE'], axis = 1, inplace = True)
df_weather.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,Year,County
0,USC00048351,0.0,,,1980,Sonoma
1,USC00048351,0.0,61.0,37.0,1980,Sonoma
2,USC00048351,0.0,61.0,35.0,1980,Sonoma
3,USC00048351,0.05,56.0,41.0,1980,Sonoma
4,USC00048351,0.01,56.0,44.0,1980,Sonoma


In [445]:
new_weather = df_weather.drop_duplicates(subset=['Year', 'STATION'], keep='first')
new_weather.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,Year,County
0,USC00048351,0.0,,,1980,Sonoma
366,USC00048351,0.0,53.0,38.0,1981,Sonoma
731,USC00048351,0.5,51.0,35.0,1982,Sonoma
1096,USC00048351,0.0,48.0,32.0,1983,Sonoma
1461,USC00048351,0.0,63.0,36.0,1984,Sonoma


In [446]:
print(new_weather.shape)
print(new_weather.isna().sum())

(301, 6)
STATION     0
PRCP       26
TMAX       52
TMIN       55
Year        0
County      0
dtype: int64


In [447]:
grouped = new_weather.groupby(['STATION', 'Year'])
grouped.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,Year,County
0,USC00048351,0.0,,,1980,Sonoma
366,USC00048351,0.0,53.0,38.0,1981,Sonoma
731,USC00048351,0.5,51.0,35.0,1982,Sonoma
1096,USC00048351,0.0,48.0,32.0,1983,Sonoma
1461,USC00048351,0.0,63.0,36.0,1984,Sonoma
...,...,...,...,...,...,...
102799,USW00023213,0.0,52.0,24.0,2016,Sonoma
103165,USW00023213,0.0,54.0,31.0,2017,Sonoma
103530,USW00023213,0.0,67.0,39.0,2018,Sonoma
103895,USW00023213,0.0,61.0,32.0,2019,Sonoma


In [448]:
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [449]:
new_weather.isna().sum()

STATION     0
PRCP       26
TMAX       52
TMIN       55
Year        0
County      0
dtype: int64

In [450]:
new_weather.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,Year,County
0,USC00048351,0.0,,,1980,Sonoma
366,USC00048351,0.0,53.0,38.0,1981,Sonoma
731,USC00048351,0.5,51.0,35.0,1982,Sonoma
1096,USC00048351,0.0,48.0,32.0,1983,Sonoma
1461,USC00048351,0.0,63.0,36.0,1984,Sonoma


In [465]:
new_weather.shape

(301, 6)

In [470]:
new_weather.fillna(df_averages)

Unnamed: 0,STATION,PRCP,TMAX,TMIN,Year,County
0,USC00048351,0.0,75.148352,44.373134,1980,Sonoma
366,USC00048351,0.0,53.000000,38.000000,1981,Sonoma
731,USC00048351,0.5,51.000000,35.000000,1982,Sonoma
1096,USC00048351,0.0,48.000000,32.000000,1983,Sonoma
1461,USC00048351,0.0,63.000000,36.000000,1984,Sonoma
...,...,...,...,...,...,...
102799,USW00023213,0.0,52.000000,24.000000,2016,Sonoma
103165,USW00023213,0.0,54.000000,31.000000,2017,Sonoma
103530,USW00023213,0.0,67.000000,39.000000,2018,Sonoma
103895,USW00023213,0.0,61.000000,32.000000,2019,Sonoma


In [471]:
new_weather.shape

(301, 6)

In [472]:
new_weather.isna().sum()

STATION     0
PRCP       26
TMAX       52
TMIN       55
Year        0
County      0
dtype: int64

In [None]:
# Finally, merge wine onto new_weather by County, Year

In [475]:
# df = pd.merge(left = new_weather, right = df_averages, how = 'left', left_on = ['Year', 'STATION'], right_on = ['Year', 'STATION'])
df = pd.merge(left=new_weather, right=df_wine, how='left', left_on=['Year', 'County'], right_on=['Year', 'County'])

In [476]:
df.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,Year,County,Yield(Unit/Acre)
0,USC00048351,0.0,,,1980,Sonoma,3.34
1,USC00048351,0.0,53.0,38.0,1981,Sonoma,3.39
2,USC00048351,0.5,51.0,35.0,1982,Sonoma,4.0
3,USC00048351,0.0,48.0,32.0,1983,Sonoma,3.14
4,USC00048351,0.0,63.0,36.0,1984,Sonoma,3.81
