In [62]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [63]:
df_wine = pd.read_csv('data/sonoma/sonoma_wine_cleaned.csv')
# Selected 9 weather stations, based on EDA results and will use an updated dataframe
df_weather = pd.read_csv('data/updated_weather_df.csv')

Steps for Pre-processing: 
- Add a Year column to df_weather (after compressing each station data by year)
- Add a County column to df_weather
- Drop unnecessary columns (all cols except Year and Yield for df_weather and Moving Average columns and Name column in df_weather)
- Merge df_wine to df_weather on columns Year and County

In [64]:
df_wine.head()

Unnamed: 0,Year,CommodityCode,CropName,CountyCode,County,HarvestedAcres,Yield(Unit/Acre),Production,Price(Dollars/Unit),Unit,Value(Dollars)
0,2020,216299,GRAPESWINE,97,Sonoma,56800.0,2.61,148000.0,2375.08,Tons,351512000
1,2019,216299,GRAPESWINE,97,Sonoma,57500.0,4.0,230000.0,2843.57,Tons,654021000
2,2018,216299,GRAPESWINE,97,Sonoma,59900.0,4.61,276000.0,2817.66,Tons,777675000
3,2017,216299,GRAPESWINE,97,Sonoma,60000.0,3.43,206000.0,2807.34,Tons,578313000
4,2016,216299,GRAPESWINE,97,Sonoma,60000.0,3.77,226000.0,2595.21,Tons,586518000


In [65]:
df_weather.head()

Unnamed: 0,STATION,NAME,PRCP,TMAX,TMIN,Moving_Avg_PRCP_yr,Moving_Avg_TMIN_yr,Moving_Avg_TMAX_yr,DATE
0,USC00048351,"SONOMA, CA US",0.0,,,,,,1980-01-01
1,USC00048351,"SONOMA, CA US",0.0,61.0,37.0,,,,1980-01-02
2,USC00048351,"SONOMA, CA US",0.0,61.0,35.0,,,,1980-01-03
3,USC00048351,"SONOMA, CA US",0.05,56.0,41.0,,,,1980-01-04
4,USC00048351,"SONOMA, CA US",0.01,56.0,44.0,,,,1980-01-05


In [66]:
df_weather.STATION.unique()

array(['USC00048351', 'USC00047965', 'USC00043578', 'USC00043875',
       'USC00046370', 'USC00046826', 'USC00043191', 'USC00041838',
       'USW00023213'], dtype=object)

In [67]:
df_weather.drop(['Moving_Avg_PRCP_yr', 'Moving_Avg_TMIN_yr', 'Moving_Avg_TMAX_yr', 'NAME'], axis = 1, inplace = True)

In [68]:
df_weather.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,DATE
0,USC00048351,0.0,,,1980-01-01
1,USC00048351,0.0,61.0,37.0,1980-01-02
2,USC00048351,0.0,61.0,35.0,1980-01-03
3,USC00048351,0.05,56.0,41.0,1980-01-04
4,USC00048351,0.01,56.0,44.0,1980-01-05


In [69]:
df_wine.columns

Index(['Year', 'CommodityCode', 'CropName', 'CountyCode', 'County',
       'HarvestedAcres', 'Yield(Unit/Acre)', 'Production',
       'Price(Dollars/Unit)', 'Unit', 'Value(Dollars)'],
      dtype='object')

In [70]:
df_wine.drop(['CommodityCode', 'CropName', 'CountyCode', 'HarvestedAcres', 'Production',
       'Price(Dollars/Unit)', 'Unit', 'Value(Dollars)'], axis = 1, inplace = True)

In [71]:
df_wine.head()

Unnamed: 0,Year,County,Yield(Unit/Acre)
0,2020,Sonoma,2.61
1,2019,Sonoma,4.0
2,2018,Sonoma,4.61
3,2017,Sonoma,3.43
4,2016,Sonoma,3.77


In [72]:
df_weather.dtypes

STATION     object
PRCP       float64
TMAX       float64
TMIN       float64
DATE        object
dtype: object

In [73]:
df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])
df_weather = df_weather.set_index('DATE')

In [74]:
date_grouped = df_weather.groupby([df_weather.index.year, df_weather.STATION])
date_grouped.head()

Unnamed: 0_level_0,STATION,PRCP,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-01-01,USC00048351,0.00,,
1980-01-02,USC00048351,0.00,61.0,37.0
1980-01-03,USC00048351,0.00,61.0,35.0
1980-01-04,USC00048351,0.05,56.0,41.0
1980-01-05,USC00048351,0.01,56.0,44.0
...,...,...,...,...
2020-01-01,USW00023213,0.00,64.0,42.0
2020-01-02,USW00023213,0.00,68.0,38.0
2020-01-03,USW00023213,0.00,61.0,36.0
2020-01-04,USW00023213,0.10,62.0,37.0


In [75]:
mean_precip = date_grouped['PRCP'].mean()
mean_precip

DATE  STATION    
1980  USC00048351    0.069945
1981  USC00048351    0.095836
1982  USC00048351    0.184082
1983  USC00048351    0.179315
1984  USC00048351    0.065881
                       ...   
2020  USC00046370    0.054576
      USC00046826    0.027143
      USC00047965    0.031477
      USC00048351    0.025565
      USW00023213    0.028333
Name: PRCP, Length: 301, dtype: float64

In [76]:
mean_tmin = date_grouped['TMIN'].mean()
mean_tmin

DATE  STATION    
1980  USC00048351    44.373134
1981  USC00048351    45.407713
1982  USC00048351    42.228650
1983  USC00048351    46.047486
1984  USC00048351    43.486567
                       ...    
2020  USC00046370    48.093750
      USC00046826    45.896359
      USC00047965    46.033994
      USC00048351    46.396396
      USW00023213    44.480874
Name: TMIN, Length: 301, dtype: float64

In [77]:
mean_tmax = date_grouped['TMAX'].mean()
mean_tmax

DATE  STATION    
1980  USC00048351    75.148352
1981  USC00048351    76.173077
1982  USC00048351    73.478022
1983  USC00048351    73.515152
1984  USC00048351    75.376119
                       ...    
2020  USC00046370    69.721591
      USC00046826    73.288515
      USC00047965    75.229462
      USC00048351    75.133531
      USW00023213    74.661202
Name: TMAX, Length: 301, dtype: float64

In [78]:
df_weather['PRCP'].isna().sum()

3783

In [79]:
df_weather.head(10)

Unnamed: 0_level_0,STATION,PRCP,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-01-01,USC00048351,0.0,,
1980-01-02,USC00048351,0.0,61.0,37.0
1980-01-03,USC00048351,0.0,61.0,35.0
1980-01-04,USC00048351,0.05,56.0,41.0
1980-01-05,USC00048351,0.01,56.0,44.0
1980-01-06,USC00048351,0.0,56.0,43.0
1980-01-07,USC00048351,0.0,64.0,41.0
1980-01-08,USC00048351,0.0,60.0,47.0
1980-01-09,USC00048351,0.5,55.0,48.0
1980-01-10,USC00048351,0.06,55.0,38.0


In [80]:
df_weather.reset_index(inplace=True)
df_weather.head()

Unnamed: 0,DATE,STATION,PRCP,TMAX,TMIN
0,1980-01-01,USC00048351,0.0,,
1,1980-01-02,USC00048351,0.0,61.0,37.0
2,1980-01-03,USC00048351,0.0,61.0,35.0
3,1980-01-04,USC00048351,0.05,56.0,41.0
4,1980-01-05,USC00048351,0.01,56.0,44.0


In [81]:
df_weather.dtypes

DATE       datetime64[ns]
STATION            object
PRCP              float64
TMAX              float64
TMIN              float64
dtype: object

In [82]:
df_weather['YEAR'] = pd.DatetimeIndex(df_weather['DATE']).year
df_weather.tail(10)

Unnamed: 0,DATE,STATION,PRCP,TMAX,TMIN,YEAR
104616,2020-12-22,USW00023213,0.0,63.0,34.0,2020
104617,2020-12-23,USW00023213,0.0,58.0,28.0,2020
104618,2020-12-24,USW00023213,0.0,56.0,30.0,2020
104619,2020-12-25,USW00023213,1.06,54.0,39.0,2020
104620,2020-12-26,USW00023213,0.01,60.0,37.0,2020
104621,2020-12-27,USW00023213,0.0,58.0,34.0,2020
104622,2020-12-28,USW00023213,0.0,60.0,35.0,2020
104623,2020-12-29,USW00023213,0.0,64.0,31.0,2020
104624,2020-12-30,USW00023213,0.15,57.0,29.0,2020
104625,2020-12-31,USW00023213,0.0,62.0,37.0,2020


In [83]:
df_weather.drop(['DATE'], axis = 1, inplace = True)
df_weather.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,YEAR
0,USC00048351,0.0,,,1980
1,USC00048351,0.0,61.0,37.0,1980
2,USC00048351,0.0,61.0,35.0,1980
3,USC00048351,0.05,56.0,41.0,1980
4,USC00048351,0.01,56.0,44.0,1980


In [84]:
new_weather = df_weather.drop_duplicates(subset=['YEAR', 'STATION'], keep='first')
# .groupby(['STATION']).transform(lambda x: x.fillna(x.mean()))
new_weather.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,YEAR
0,USC00048351,0.0,,,1980
366,USC00048351,0.0,53.0,38.0,1981
731,USC00048351,0.5,51.0,35.0,1982
1096,USC00048351,0.0,48.0,32.0,1983
1461,USC00048351,0.0,63.0,36.0,1984


In [85]:
print(new_weather.shape)
print(new_weather.isna().sum())

(301, 5)
STATION     0
PRCP       26
TMAX       52
TMIN       55
YEAR        0
dtype: int64


In [86]:
grouped = new_weather.groupby(['STATION', 'YEAR'])
grouped.head()

Unnamed: 0,STATION,PRCP,TMAX,TMIN,YEAR
0,USC00048351,0.0,,,1980
366,USC00048351,0.0,53.0,38.0,1981
731,USC00048351,0.5,51.0,35.0,1982
1096,USC00048351,0.0,48.0,32.0,1983
1461,USC00048351,0.0,63.0,36.0,1984
...,...,...,...,...,...
102799,USW00023213,0.0,52.0,24.0,2016
103165,USW00023213,0.0,54.0,31.0,2017
103530,USW00023213,0.0,67.0,39.0,2018
103895,USW00023213,0.0,61.0,32.0,2019


In [91]:
new_weather.isna().sum()

STATION     0
PRCP       26
TMAX       52
TMIN       55
YEAR        0
dtype: int64

In [None]:
# Need to apply x.mean by year *and* station to 

# This does not work: 
# df_weather['PRCP'] = grouped['PRCP'].transform(lambda x: x.fillna(x.mean()))
# df_weather['TMIN'] = grouped['TMIN'].transform(lambda x: x.fillna(x.mean()))
# df_weather['TMAX'] = grouped['TMAX'].transform(lambda x: x.fillna(x.mean()))