## Imports

In [638]:
import numpy as np
import pandas as pd 
import os as os
from time import time, strftime
from datetime import datetime

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, explained_variance_score, r2_score

import warnings
warnings.filterwarnings('ignore')

## Create Classes for Data Processing

In [582]:
class Load_Data(BaseEstimator, TransformerMixin):
    def __init__(self, features=None):
        self.features = features
        self.weather_dir = ''
        self.soil_dir = ''
        self.drop_columns = ['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'AWND_ATTRIBUTES', 'PGTM_ATTRIBUTES', 
                             'PSUN', 'PSUN_ATTRIBUTES', 'SNOW', 'SNOW_ATTRIBUTES', 'SNWD', 'SNWD_ATTRIBUTES', 'TAVG',
                             'TAVG_ATTRIBUTES', 'TMAX_ATTRIBUTES', 'TMIN_ATTRIBUTES', 'TSUN', 'TSUN_ATTRIBUTES', 'WDF2_ATTRIBUTES', 
                             'WDF5_ATTRIBUTES', 'WSF2_ATTRIBUTES','WSF5_ATTRIBUTES', 'WT01_ATTRIBUTES', 'WT02_ATTRIBUTES', 
                             'WT03_ATTRIBUTES', 'WT06_ATTRIBUTES', 'WT08_ATTRIBUTES', 'PRCP_ATTRIBUTES']
        
    def fit(self, w_dir, s_dir):
        self.weather_dir = w_dir
        self.soil_dir = s_dir
        return self
    
    def transform(self, X):
        #Aggregate all 43 files into one file
        file_list = os.listdir(self.soil_dir)
        agg_data = pd.DataFrame()
        for file in file_list:
            path = self.soil_dir + file
            curr_data = pd.read_csv(path, sep='\t')
            agg_data = agg_data.append(curr_data)
        
        #Drop rows with only NAs for measurement values
        soil = agg_data.dropna(thresh=10)
        
        #Import weather files and drop unnessecary fields
        weather = pd.read_csv(self.weather_dir)
        drop_cols = list(set(weather.columns).intersection(self.drop_columns))
        weather = weather.drop(columns = self.drop_columns)
        
        #Convert both files to use same datetime
        soil['Date'] = pd.to_datetime(soil['Date'])
        weather['DATE'] = pd.to_datetime(weather['DATE'])
        
        #Join previous 10 days weather to moisture readings
        for i in range(0, 11):
            weather_new = weather.add_suffix('_' + str(i))
            soil = soil.merge(weather_new, how = 'left', left_on = 'Date', right_on = weather['DATE'] - pd.DateOffset(i * -1))
            
        date_attribs = ['Date', 'DATE_0', 'DATE_1', 'DATE_2', 'DATE_3', 'DATE_4','DATE_5', 'DATE_6', 'DATE_7', 'DATE_8', 'DATE_9', 'DATE_10']
        if 'DATE_0' in list(soil.columns):
            soil.drop(columns = date_attribs, inplace = True)
        soil['Location'] = soil['Location'].astype('object')
            
        return soil

In [583]:
class Feature_Engineer(BaseEstimator, TransformerMixin):
    def __init__(self, features=None):
        self.features = features
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        #Add categorical feature that simply stores if it rained that day or not
        for i in range(11):
            col_name = 'PRCP_' + str(i)
            rain_y_n_name = 'RAIN_Y_N_' + str(i)
            X[rain_y_n_name] = np.nan
            X[rain_y_n_name].loc[X[col_name] > 0] = 1
            X[rain_y_n_name].loc[X[col_name] == 0] = 0
            X[rain_y_n_name] = X[rain_y_n_name].astype('object')
        return X

In [584]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        print(X)
        return X[self.attribute_names].values

In [585]:
class Convert_Date(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names = None):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['Date'] = pd.to_timedelta(X['Date']).dt.total_seconds().astype(int)
        return X

## Create a Pipeline That Uses Data Processing Class

In [586]:
%%time
soil_file_dir = 'data/soil/'
weather_file_dir = 'data/weather/weather_data.csv'
x = 0

pre_work_pipeline = Pipeline([
    ('prework', Load_Data()),
    ('features', Feature_Engineer())
])

pre_work_pipeline.fit(weather_file_dir, soil_file_dir)
prework_df = pre_work_pipeline.transform(x)
#Save to CSV so that we do not need to import and clean data everytime
prework_df.to_csv('clean_data.csv')

Wall time: 15.6 s


## Make Data Frames for Each Depth

The moisture data is taken at various depths. We want to build models seperately for different depths. So we need to make a dataframe for each depth so that we can elminate entire rows where the predictor is NA


In [587]:
# First split out y values
y_cols = ['VW_30cm', 'VW_60cm', 'VW_90cm', 'VW_120cm', 'VW_150cm']
X_sets = {}
y_sets = {}
x_cols = [col for col in prework_df.columns if col not in y_cols]
X = prework_df.loc[:, x_cols]
#y = prework_df.loc[:, y_cols]

for cols in y_cols:
    dataset_name = cols[3:]
    holder = prework_df.dropna(subset = [cols])
    X_sets[dataset_name] = holder[x_cols].fillna(0)
    y_sets[dataset_name] = holder[cols]

## Split Train and Test

In [588]:
# Split training and test data
# 80-20 ratio
# Trying to keep same ratios for each location using stratify
# Could have done this in the cell above, but wanted a seperate step for this
X_train_set = {}
X_test_set = {}
y_train_set = {}
y_test_set = {}

for cols in y_cols:
    dataset_name = cols[3:]  
    X_train_set[dataset_name], X_test_set[dataset_name], y_train_set[dataset_name], y_test_set[dataset_name] = train_test_split(X_sets[dataset_name], y_sets[dataset_name], \
                                                                                                                                test_size=0.2, stratify = X_sets[dataset_name]['Location'], random_state=42)

## Generic Pipeline

In [610]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

num_attribs = X_train_set['60cm'].select_dtypes(exclude=['object', 'category']).columns
cat_attribs = X_train_set['60cm'].select_dtypes(include=['object', 'category']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value = 0)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value = '')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_attribs),
        ('cat', categorical_transformer, cat_attribs)
    ])

pipe_with_estimator = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', RandomForestRegressor())])



## Linear Regression Tests

In [618]:
pipe_with_estimator.fit(X_train_set['60cm'], y_train_set['60cm'])


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                              

In [622]:
explained_variance_score(y_test_set['60cm'], pipe_with_estimator.predict(X_test_set['60cm']))

0.9013707703385528

In [640]:
data_cols = ['30cm', '60cm', '90cm', '120cm', '150cm']
try:
    log
except NameError:
    log = pd.DataFrame(columns = ['Experiment', 'Depth', 'Fit_Time', 'Pred_Time', 'r2_score', 'exp_var_score', 'datetime'])
    
for cols in data_cols:
    t0 = time()
    pipe_with_estimator.fit(X_train_set[cols], y_train_set[cols])
    t1 = time()
    preds = pipe_with_estimator.predict(X_test_set[cols])
    t2 = time()
    expvar = explained_variance_score(y_test_set[cols], preds)
    r2sc = r2_score(y_test_set[cols], preds)
    now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log.loc[len(log)] = ['First Linear Reg', cols, t1-t0, t2-t1, r2sc, expvar, now]
    
print(log)

         Experiment  Depth   Fit_Time  Pred_Time  r2_score  exp_var_score  \
0  First Linear Reg   30cm  58.701572   0.238179  0.890208       0.890208   
1  First Linear Reg   60cm  58.847758   0.169551  0.898520       0.898522   
2  First Linear Reg   90cm  54.927837   0.172255  0.882179       0.882180   
3  First Linear Reg  120cm  64.877650   0.197685  0.884033       0.884033   
4  First Linear Reg  150cm  63.256277   0.170247  0.876900       0.876920   

              datetime  
0  2020-11-02 20:10:26  
1  2020-11-02 20:11:25  
2  2020-11-02 20:12:20  
3  2020-11-02 20:13:25  
4  2020-11-02 20:14:28  


## Using Pipeline in Gridsearch

In [615]:
params = {
            'max_depth': [9, 15, 22, 26, 30],
            #'max_features': [1, 3, 5],
            #'n_estimators':[20, 80, 150, 200, 300]
}
grid_search = GridSearchCV(pipe_with_estimator, params, scoring='r2', cv=5, n_jobs=-1, verbose=3)
grid_search.fit(X_train_set['30cm'], y_train_set['30cm'])

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


ValueError: Invalid parameter max_depth for estimator Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                                                 verbose=0)),
                                                                  ('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=...
                ('classifier',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators='warn', n_jobs=None,
                                       oob_score=False, random_state=None,
                                       verbose=0, warm_start=False))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.