# Robust yield prediction of various farm processing units

### Import all the required libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_data = pd.read_csv("C:/Users/Admin/OneDrive/Desktop/Capstone/train_data.csv")
train_weather = pd.read_csv("C:/Users/Admin/OneDrive/Desktop/Capstone/train_weather.csv")
farm_data = pd.read_csv("C:/Users/Admin/OneDrive/Desktop/Capstone/farm_data.csv")

In [3]:
train_data.head()

Unnamed: 0,date,farm_id,ingredient_type,yield
0,2016-01-01 00:00:00,fid_110884,ing_w,0.0
1,2016-01-01 00:00:00,fid_90053,ing_w,0.0
2,2016-01-01 00:00:00,fid_17537,ing_w,0.0
3,2016-01-01 00:00:00,fid_110392,ing_w,0.0
4,2016-01-01 00:00:00,fid_62402,ing_w,0.0


In [4]:
train_data.dtypes

date                object
farm_id             object
ingredient_type     object
yield              float64
dtype: object

In [5]:
farm_data.head()

Unnamed: 0,farm_id,operations_commencing_year,num_processing_plants,farm_area,farming_company,deidentified_location
0,fid_110884,2008.0,,690.455096,Obery Farms,location 7369
1,fid_90053,2004.0,,252.69616,Obery Farms,location 7369
2,fid_17537,1991.0,,499.446528,Obery Farms,location 7369
3,fid_110392,2002.0,,2200.407555,Obery Farms,location 7369
4,fid_62402,1975.0,,10833.140121,Obery Farms,location 7369


In [6]:
farm_data.dtypes

farm_id                        object
operations_commencing_year    float64
num_processing_plants         float64
farm_area                     float64
farming_company                object
deidentified_location          object
dtype: object

In [7]:
train_weather.head()

Unnamed: 0,timestamp,deidentified_location,temp_obs,cloudiness,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed
0,2016-01-01 00:00:00,location 7369,25.0,6.0,0.0,20.0,1019.7,,0.0
1,2016-01-01 01:00:00,location 7369,24.4,,70.0,21.1,1020.2,-1.0,1.5
2,2016-01-01 02:00:00,location 7369,22.8,2.0,0.0,21.1,1020.2,0.0,0.0
3,2016-01-01 03:00:00,location 7369,21.1,2.0,0.0,20.6,1020.1,0.0,0.0
4,2016-01-01 04:00:00,location 7369,20.0,2.0,250.0,20.0,1020.0,-1.0,2.6


In [8]:
train_weather.dtypes

timestamp                 object
deidentified_location     object
temp_obs                 float64
cloudiness               float64
wind_direction           float64
dew_temp                 float64
pressure_sea_level       float64
precipitation            float64
wind_speed               float64
dtype: object

### Data Cleaning for datasets 

1. Train Data - 

In [9]:
#Convert date Object to datetime format
train_data['date'] = pd.to_datetime(train_data['date'], format='%Y-%m-%d %H:%M:%S')

In [10]:
#Type Conversion
train_data['farm_id'] = train_data['farm_id'].astype('str')

In [13]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column           Dtype         
---  ------           -----         
 0   date             datetime64[ns]
 1   farm_id          object        
 2   ingredient_type  object        
 3   yield            float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 616.9+ MB


In [11]:
#Drop duplicates
train_data1 = train_data.drop_duplicates(subset=None, keep= 'first', inplace=False, ignore_index=False)

In [12]:
train_data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20215983 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column           Dtype         
---  ------           -----         
 0   date             datetime64[ns]
 1   farm_id          object        
 2   ingredient_type  object        
 3   yield            float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 771.2+ MB


2. Farm Data - 

In [12]:
#Drop 'operations_commencing_year' column from the farm data
farm_data.drop('operations_commencing_year', axis=1, inplace=True)

In [13]:
#Type Casting
farm_data['farm_id'] = farm_data['farm_id'].astype('str')

In [14]:
#Check for Null Values
farm_data.isnull().sum()

farm_id                     0
num_processing_plants    1094
farm_area                   0
farming_company             0
deidentified_location       0
dtype: int64

In [15]:
#Fill missing values with median 
farm_data['num_processing_plants'].fillna(farm_data['num_processing_plants'].median(), inplace=True)
#Type Conversion
farm_data['num_processing_plants'] = farm_data['num_processing_plants'].astype('int64')

3. Weather Data -

In [16]:
#Convert date Object to datetime format
train_weather['timestamp'] = pd.to_datetime(train_weather['timestamp'], format='%Y-%m-%d %H:%M:%S')

In [17]:
#Dropping 'cloudiness' column has more missing values.
train_weather.drop('cloudiness', axis = 1, inplace =  True)

### Merging the Datasets

In [18]:
#Merging train_data and farm_data on 'farm_id' column
train_data_merged = pd.merge(train_data1,farm_data, on = 'farm_id')

In [19]:
train_data_merged.head()

Unnamed: 0,date,farm_id,ingredient_type,yield,num_processing_plants,farm_area,farming_company,deidentified_location
0,2016-01-01 00:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369
1,2016-01-01 01:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369
2,2016-01-01 02:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369
3,2016-01-01 03:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369
4,2016-01-01 04:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369


In [20]:
#Renaming date column as timestamp
train_data_merged.rename(columns = {'date':'timestamp'},inplace = True)

In [21]:
#Drop deidentified_location column from the merged dataset
train_data_merged.drop('deidentified_location', axis=1, inplace=True)

In [22]:
train_data_merged.shape

(20602665, 7)

In [23]:
train_weather.shape

(139773, 8)

In [24]:
train_data_merged.isnull().sum()

timestamp                0
farm_id                  0
ingredient_type          0
yield                    0
num_processing_plants    0
farm_area                0
farming_company          0
dtype: int64

In [25]:
#Drop 'timestamp' from weather data
train_weather.drop('timestamp', axis=1, inplace=True)

In [26]:
train_weather.shape

(139773, 7)

In [27]:
train_data_merged.shape

(20602665, 7)

In [28]:
#Final Merged Dataset 
train_final_merged = pd.merge(train_data_merged,train_weather,left_index = True, right_index = True)

In [29]:
train_final_merged.nunique()

timestamp                8784
farm_id                    14
ingredient_type             2
yield                    6456
num_processing_plants       1
farm_area                  14
farming_company             4
deidentified_location      16
temp_obs                  619
wind_direction             43
dew_temp                  522
pressure_sea_level        709
precipitation             128
wind_speed                 58
dtype: int64

In [30]:
train_final_merged.shape

(139773, 14)

In [31]:
train_final_merged.isnull().sum()

timestamp                    0
farm_id                      0
ingredient_type              0
yield                        0
num_processing_plants        0
farm_area                    0
farming_company              0
deidentified_location        0
temp_obs                    55
wind_direction            6268
dew_temp                   113
pressure_sea_level       10618
precipitation            50289
wind_speed                 304
dtype: int64

In [32]:
train_final_merged.head()

Unnamed: 0,timestamp,farm_id,ingredient_type,yield,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed
0,2016-01-01 00:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369,25.0,0.0,20.0,1019.7,,0.0
1,2016-01-01 01:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369,24.4,70.0,21.1,1020.2,-1.0,1.5
2,2016-01-01 02:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369,22.8,0.0,21.1,1020.2,0.0,0.0
3,2016-01-01 03:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369,21.1,0.0,20.6,1020.1,0.0,0.0
4,2016-01-01 04:00:00,fid_110884,ing_w,0.0,7,690.455096,Obery Farms,location 7369,20.0,250.0,20.0,1020.0,-1.0,2.6


In [33]:
train_final_merged.drop('farm_id', axis=1, inplace=True)

In [34]:
train_final_merged.head()

Unnamed: 0,timestamp,ingredient_type,yield,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed
0,2016-01-01 00:00:00,ing_w,0.0,7,690.455096,Obery Farms,location 7369,25.0,0.0,20.0,1019.7,,0.0
1,2016-01-01 01:00:00,ing_w,0.0,7,690.455096,Obery Farms,location 7369,24.4,70.0,21.1,1020.2,-1.0,1.5
2,2016-01-01 02:00:00,ing_w,0.0,7,690.455096,Obery Farms,location 7369,22.8,0.0,21.1,1020.2,0.0,0.0
3,2016-01-01 03:00:00,ing_w,0.0,7,690.455096,Obery Farms,location 7369,21.1,0.0,20.6,1020.1,0.0,0.0
4,2016-01-01 04:00:00,ing_w,0.0,7,690.455096,Obery Farms,location 7369,20.0,250.0,20.0,1020.0,-1.0,2.6


In [35]:
#Feature Engineering and creating new columns out of timestamp
train_final_merged['Hours'] = train_final_merged['timestamp'].dt.hour
#train_final_merged['Seconds'] = train_final_merged['timestamp'].dt.second
#train_final_merged['Minutes'] = train_final_merged['timestamp'].dt.minute

In [36]:
train_final_merged['timestamp'].min(), train_final_merged['timestamp'].max(), (train_final_merged['timestamp'].max() -train_final_merged['timestamp'].min())

(Timestamp('2016-01-01 00:00:00'),
 Timestamp('2016-12-31 23:00:00'),
 Timedelta('365 days 23:00:00'))

In [37]:
#Converting Timestamp to integer
train_final_merged['Unix Sec'] = pd.to_datetime(train_final_merged['timestamp']).astype('int64')/ 10**9
#Type Conversion float to int
train_final_merged['Unix Sec'] = train_final_merged['Unix Sec'].astype('int64')

Unix time is a date and time representation widely used in computing. It measures time by the number of seconds that have elapsed since 00:00:00 UTC on 1 January 1970, the beginning of the Unix epoch, less adjustments made due to leap seconds.

Unix time originated as the system time of Unix operating systems. It has come to be widely used in other computer operating systems, file systems, programming languages, and databases.

Unix time is a single signed number that increments every second, which makes it easier for computers to store and manipulate than conventional date systems. Interpreter programs can then convert it to a human-readable format.

In [38]:
train_final_merged.drop('timestamp', axis=1, inplace=True)

In [39]:
train_final_merged.head()

Unnamed: 0,ingredient_type,yield,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Hours,Unix Sec
0,ing_w,0.0,7,690.455096,Obery Farms,location 7369,25.0,0.0,20.0,1019.7,,0.0,0,1451606400
1,ing_w,0.0,7,690.455096,Obery Farms,location 7369,24.4,70.0,21.1,1020.2,-1.0,1.5,1,1451610000
2,ing_w,0.0,7,690.455096,Obery Farms,location 7369,22.8,0.0,21.1,1020.2,0.0,0.0,2,1451613600
3,ing_w,0.0,7,690.455096,Obery Farms,location 7369,21.1,0.0,20.6,1020.1,0.0,0.0,3,1451617200
4,ing_w,0.0,7,690.455096,Obery Farms,location 7369,20.0,250.0,20.0,1020.0,-1.0,2.6,4,1451620800


In [40]:
train_final_merged.dtypes

ingredient_type           object
yield                    float64
num_processing_plants      int64
farm_area                float64
farming_company           object
deidentified_location     object
temp_obs                 float64
wind_direction           float64
dew_temp                 float64
pressure_sea_level       float64
precipitation            float64
wind_speed               float64
Hours                      int64
Unix Sec                   int64
dtype: object

### Label Encoding for Categorical Columns

In [41]:
#Label Encoding refers to converting the labels into a numeric form so as to convert them into the machine-readable form.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [42]:
train_final_merged.farming_company = le.fit_transform(train_final_merged.farming_company)
train_final_merged['farming_company'] = train_final_merged['farming_company'].astype('category')

In [43]:
train_final_merged.deidentified_location = le.fit_transform(train_final_merged.deidentified_location)
train_final_merged['deidentified_location'] = train_final_merged['deidentified_location'].astype('category')

In [44]:
train_final_merged.ingredient_type = le.fit_transform(train_final_merged.ingredient_type)
train_final_merged['ingredient_type'] = train_final_merged['ingredient_type'].astype('category')

In [45]:
train_final_merged.dtypes

ingredient_type          category
yield                     float64
num_processing_plants       int64
farm_area                 float64
farming_company          category
deidentified_location    category
temp_obs                  float64
wind_direction            float64
dew_temp                  float64
pressure_sea_level        float64
precipitation             float64
wind_speed                float64
Hours                       int64
Unix Sec                    int64
dtype: object

In [46]:
train_final_merged.head()

Unnamed: 0,ingredient_type,yield,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Hours,Unix Sec
0,0,0.0,7,690.455096,1,12,25.0,0.0,20.0,1019.7,,0.0,0,1451606400
1,0,0.0,7,690.455096,1,12,24.4,70.0,21.1,1020.2,-1.0,1.5,1,1451610000
2,0,0.0,7,690.455096,1,12,22.8,0.0,21.1,1020.2,0.0,0.0,2,1451613600
3,0,0.0,7,690.455096,1,12,21.1,0.0,20.6,1020.1,0.0,0.0,3,1451617200
4,0,0.0,7,690.455096,1,12,20.0,250.0,20.0,1020.0,-1.0,2.6,4,1451620800


In [47]:
#Descriptive Stats of Final Dataset
train_final_merged.describe()

Unnamed: 0,yield,num_processing_plants,farm_area,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Hours,Unix Sec
count,139773.0,139773.0,139773.0,139718.0,133505.0,139660.0,129155.0,89484.0,139469.0,139773.0,139773.0
mean,503.861098,7.0,6451.949394,14.418106,180.526632,7.350158,1016.158038,0.983047,3.560527,11.502615,1467400000.0
std,981.401152,0.0,8196.029748,10.626595,111.523629,9.790235,7.629684,8.463678,2.335874,6.921604,8867605.0
min,0.0,7.0,252.69616,-28.9,0.0,-35.0,968.2,-1.0,0.0,0.0,1451606000.0
25%,0.0,7.0,743.224,7.2,80.0,0.6,1011.8,0.0,2.1,6.0,1459908000.0
50%,114.67,7.0,3446.7013,15.0,190.0,8.3,1016.4,0.0,3.1,12.0,1467310000.0
75%,478.475,7.0,9232.70014,22.2,280.0,14.4,1020.8,0.0,5.0,18.0,1474798000.0
max,8442.07,7.0,34445.924019,47.2,360.0,26.1,1045.5,343.0,19.0,23.0,1483225000.0


In [48]:
#Seperated the numeric columns from the final merged dataset
num_col = ['yield','farm_area','temp_obs', 'wind_direction', 'dew_temp', 'pressure_sea_level', 
       'precipitation', 'wind_speed','Unix Sec']

In [49]:
num_col

['yield',
 'farm_area',
 'temp_obs',
 'wind_direction',
 'dew_temp',
 'pressure_sea_level',
 'precipitation',
 'wind_speed',
 'Unix Sec']

### Imputing the Numeric Columns

In [50]:
#imputing the missing numerical columns with median of that column
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy = 'median')

imputed_data = pd.DataFrame(num_imputer.fit_transform(train_final_merged[num_col]),
                               columns = num_col)

In [51]:
imputed_data.dtypes

yield                 float64
farm_area             float64
temp_obs              float64
wind_direction        float64
dew_temp              float64
pressure_sea_level    float64
precipitation         float64
wind_speed            float64
Unix Sec              float64
dtype: object

In [52]:
imputed_data.head()

Unnamed: 0,yield,farm_area,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Unix Sec
0,0.0,690.455096,25.0,0.0,20.0,1019.7,0.0,0.0,1451606000.0
1,0.0,690.455096,24.4,70.0,21.1,1020.2,-1.0,1.5,1451610000.0
2,0.0,690.455096,22.8,0.0,21.1,1020.2,0.0,0.0,1451614000.0
3,0.0,690.455096,21.1,0.0,20.6,1020.1,0.0,0.0,1451617000.0
4,0.0,690.455096,20.0,250.0,20.0,1020.0,-1.0,2.6,1451621000.0


In [53]:
#Type Conversions
col = ['temp_obs','dew_temp','wind_speed','wind_direction','Unix Sec','farm_area']
imputed_data[col] = imputed_data[col].astype('int64')

In [54]:
imputed_data.head()

Unnamed: 0,yield,farm_area,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Unix Sec
0,0.0,690,25,0,20,1019.7,0.0,0,1451606400
1,0.0,690,24,70,21,1020.2,-1.0,1,1451610000
2,0.0,690,22,0,21,1020.2,0.0,0,1451613600
3,0.0,690,21,0,20,1020.1,0.0,0,1451617200
4,0.0,690,20,250,20,1020.0,-1.0,2,1451620800


In [55]:
imputed_data.dtypes

yield                 float64
farm_area               int64
temp_obs                int64
wind_direction          int64
dew_temp                int64
pressure_sea_level    float64
precipitation         float64
wind_speed              int64
Unix Sec                int64
dtype: object

In [56]:
imputed_data.insert(0, 'id', imputed_data.index)

In [57]:
imputed_data.head()

Unnamed: 0,id,yield,farm_area,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Unix Sec
0,0,0.0,690,25,0,20,1019.7,0.0,0,1451606400
1,1,0.0,690,24,70,21,1020.2,-1.0,1,1451610000
2,2,0.0,690,22,0,21,1020.2,0.0,0,1451613600
3,3,0.0,690,21,0,20,1020.1,0.0,0,1451617200
4,4,0.0,690,20,250,20,1020.0,-1.0,2,1451620800


In [59]:
cat_col = ['ingredient_type','farming_company', 'deidentified_location']

In [60]:
Final_data = pd.merge(imputed_data,
                         train_final_merged[cat_col],
                         left_index=True,
                         right_index=True)

In [61]:
Final_data.head()

Unnamed: 0,id,yield,farm_area,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Unix Sec,ingredient_type,farming_company,deidentified_location
0,0,0.0,690,25,0,20,1019.7,0.0,0,1451606400,0,1,12
1,1,0.0,690,24,70,21,1020.2,-1.0,1,1451610000,0,1,12
2,2,0.0,690,22,0,21,1020.2,0.0,0,1451613600,0,1,12
3,3,0.0,690,21,0,20,1020.1,0.0,0,1451617200,0,1,12
4,4,0.0,690,20,250,20,1020.0,-1.0,2,1451620800,0,1,12


In [62]:
Final_data.dtypes

id                          int64
yield                     float64
farm_area                   int64
temp_obs                    int64
wind_direction              int64
dew_temp                    int64
pressure_sea_level        float64
precipitation             float64
wind_speed                  int64
Unix Sec                    int64
ingredient_type          category
farming_company          category
deidentified_location    category
dtype: object

# Model Building and Predictions

###  Train Test Split

In [63]:
#Necessary Import
from sklearn.model_selection import train_test_split

In [64]:
y = Final_data['yield']
X = Final_data.loc[:, Final_data.columns != 'yield']

X_train, X_test, y_train, y_test =   train_test_split(X, y, test_size=0.20,random_state = 123)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(111818, 12) (27955, 12)
(111818,) (27955,)


### Building a Base Model

## Random Forest Regressor

In [65]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [66]:
# First create the base model to tune
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

In [68]:
train_pred1 = rf.predict(X_train)
test_pred1 = rf.predict(X_test)

In [69]:
train_pred1

array([397.58797 ,  98.440166,   0.      , ...,   0.      ,   0.      ,
       138.10084 ])

In [70]:
test_pred1

array([  0.      , 299.08093 , 415.1673  , ...,  16.860592,   0.743991,
         0.      ])

In [71]:
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

0.9856932893013288
0.8950679979533356


In [72]:
rmse = mean_squared_error(y_test, test_pred1, squared = False)
print("Test RMSE_score: ",rmse)

Test RMSE_score:  312.03522871500917


### Gradient Boosting Regressor

In [73]:
from sklearn.ensemble import GradientBoostingRegressor

gbm = GradientBoostingRegressor(n_estimators= 200,
                                 max_depth = 5,
                                min_samples_split = 2)

In [74]:
gbm_base_model = gbm.fit(X_train,y_train)

In [75]:
train_pred2 = gbm.predict(X_train)
test_pred2 = gbm.predict(X_test)

In [76]:
test_pred2

array([ -7.79621313, 339.46731587, 344.37878504, ...,  34.00406677,
         7.40609534,  -8.59681967])

In [77]:
train_pred2

array([407.41095341,  92.76129749,   0.87420556, ...,  -3.80411466,
         3.4947985 , 131.5742326 ])

In [78]:
print(gbm.score(X_train, y_train))
print(gbm.score(X_test, y_test))

0.850710037976225
0.8218717798554444


In [79]:
rmse2 = mean_squared_error(y_test, test_pred2, squared = False)
print("Test RMSE_score: ",rmse2)

Test RMSE_score:  406.55191703096756


### Hyper Parameter Tuning and Cross Validation RF Regressor

* Random search is a technique where random combinations of the hyperparameters are used to find the best solution for the built model. It is similar to grid search, and yet it has proven to yield better results comparatively.

In [85]:
#import randomized search
from sklearn.model_selection import RandomizedSearchCV

In [86]:
#Use the random grid to search for best hyperparameters
# Number of trees in random forest
n_estimators = [100,200]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [20,30]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 5, 10]
# Method of selecting samples for training each tree

In [87]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [88]:
print(random_grid)

{'n_estimators': [100, 200], 'max_features': ['sqrt'], 'max_depth': [20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [2, 5, 10]}


In [89]:
# Random search of parameters, using 3 fold cross validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, 
                               cv = 3, verbose=2, random_state=42, n_jobs = 1)

In [90]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  12.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  10.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  10.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  20.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  20.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  20.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  11.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=10

[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   9.7s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  22.6s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  20.3s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  20.3s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100; total time=   9.5s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100; total time=   9.3s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100; total time=   9.4s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=  19.8s
[CV] END max_depth=30, max_features=

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=1,
                   param_distributions={'max_depth': [20, 30],
                                        'max_features': ['sqrt'],
                                        'min_samples_leaf': [2, 5, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200]},
                   random_state=42, verbose=2)

In [91]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 30}

### Hyperparameter Tuned Random Forest Regressor

In [92]:
RF_reg = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
          max_features='sqrt', min_samples_leaf=5, min_samples_split=2, n_estimators=200, n_jobs=1)

In [93]:
%%time
RF_reg.fit(X_train,y_train)

Wall time: 29.5 s


RandomForestRegressor(criterion='mse', max_depth=30, max_features='sqrt',
                      min_samples_leaf=5, n_estimators=200, n_jobs=1)

In [94]:
train_pred = RF_reg.predict(X_train)
test_pred = RF_reg.predict(X_test)

In [95]:
train_pred

array([3.89735981e+02, 9.50603713e+01, 0.00000000e+00, ...,
       5.11018027e-02, 1.50442740e+00, 1.34910071e+02])

In [96]:
test_pred

array([1.24262685e-01, 2.91668535e+02, 3.71890227e+02, ...,
       1.78093011e+01, 1.73402724e+00, 0.00000000e+00])

In [97]:
print(RF_reg.score(X_train, y_train))
print(RF_reg.score(X_test, y_test))

0.9099735728601805
0.8482414968894054


In [98]:
rmse1 = mean_squared_error(y_test, test_pred, squared = False)
print("Test RMSE_score: ",rmse1)
#rmse_metric(y_true = y_test, y_pred = test_pred)

Test RMSE_score:  375.25472220732246


# Predictions on the Test Data

In [106]:
# Reading the test data files
test_weather = pd.read_csv("C:/Users/Admin/OneDrive/Desktop/Capstone/test_weather.csv")
test_data = pd.read_csv("C:/Users/Admin/OneDrive/Desktop/Capstone/test_data.csv")

In [107]:
test_data.head()

Unnamed: 0,date,farm_id,ingredient_type,id
0,2017-01-01 00:00:00,fid_110884,ing_w,0
1,2017-01-01 00:00:00,fid_90053,ing_w,1
2,2017-01-01 00:00:00,fid_17537,ing_w,2
3,2017-01-01 00:00:00,fid_110392,ing_w,3
4,2017-01-01 00:00:00,fid_62402,ing_w,4


In [108]:
test_weather.head()

Unnamed: 0,timestamp,deidentified_location,temp_obs,cloudiness,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed
0,2017-01-01 00:00:00,location 7369,17.8,4.0,100.0,11.7,1021.4,,3.6
1,2017-01-01 01:00:00,location 7369,17.8,2.0,130.0,12.8,1022.0,0.0,3.1
2,2017-01-01 02:00:00,location 7369,16.1,0.0,140.0,12.8,1021.9,0.0,3.1
3,2017-01-01 03:00:00,location 7369,17.2,0.0,140.0,13.3,1022.2,0.0,3.1
4,2017-01-01 04:00:00,location 7369,16.7,2.0,130.0,13.3,1022.3,0.0,2.6


* Performed all the Data Cleaning practices on the Test datasets as well 

In [109]:
test_data['date'] = pd.to_datetime(test_data['date'], format='%Y-%m-%d %H:%M:%S')
test_weather['timestamp'] = pd.to_datetime(test_weather['timestamp'], format='%Y-%m-%d %H:%M:%S')

In [110]:
test_data['farm_id'] = test_data['farm_id'].astype('str')

In [111]:
test_weather.drop('cloudiness', axis = 1, inplace =  True)

In [112]:
test_data_merged = pd.merge(test_data,farm_data, on = 'farm_id')

In [113]:
test_data_merged.head()

Unnamed: 0,date,farm_id,ingredient_type,id,num_processing_plants,farm_area,farming_company,deidentified_location
0,2017-01-01 00:00:00,fid_110884,ing_w,0,7,690.455096,Obery Farms,location 7369
1,2017-01-01 01:00:00,fid_110884,ing_w,129,7,690.455096,Obery Farms,location 7369
2,2017-01-01 02:00:00,fid_110884,ing_w,258,7,690.455096,Obery Farms,location 7369
3,2017-01-01 03:00:00,fid_110884,ing_w,387,7,690.455096,Obery Farms,location 7369
4,2017-01-01 04:00:00,fid_110884,ing_w,516,7,690.455096,Obery Farms,location 7369


In [114]:
test_data_merged.rename(columns = {'date':'timestamp'},inplace = True)
#test_data_merged.drop('timestamp', axis=1, inplace=True)
#test_data_merged.drop('deidentified_location', axis=1, inplace=True)

In [115]:
test_weather.drop('timestamp', axis=1, inplace=True)

In [116]:
test_weather.drop('deidentified_location', axis=1, inplace=True)

In [117]:
test_weather.head()

Unnamed: 0,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed
0,17.8,100.0,11.7,1021.4,,3.6
1,17.8,130.0,12.8,1022.0,0.0,3.1
2,16.1,140.0,12.8,1021.9,0.0,3.1
3,17.2,140.0,13.3,1022.2,0.0,3.1
4,16.7,130.0,13.3,1022.3,0.0,2.6


In [118]:
test_data_merged.head()

Unnamed: 0,timestamp,farm_id,ingredient_type,id,num_processing_plants,farm_area,farming_company,deidentified_location
0,2017-01-01 00:00:00,fid_110884,ing_w,0,7,690.455096,Obery Farms,location 7369
1,2017-01-01 01:00:00,fid_110884,ing_w,129,7,690.455096,Obery Farms,location 7369
2,2017-01-01 02:00:00,fid_110884,ing_w,258,7,690.455096,Obery Farms,location 7369
3,2017-01-01 03:00:00,fid_110884,ing_w,387,7,690.455096,Obery Farms,location 7369
4,2017-01-01 04:00:00,fid_110884,ing_w,516,7,690.455096,Obery Farms,location 7369


In [119]:
test_weather.insert(0, 'id', test_weather.index)

In [120]:
test_weather.head()

Unnamed: 0,id,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed
0,0,17.8,100.0,11.7,1021.4,,3.6
1,1,17.8,130.0,12.8,1022.0,0.0,3.1
2,2,16.1,140.0,12.8,1021.9,0.0,3.1
3,3,17.2,140.0,13.3,1022.2,0.0,3.1
4,4,16.7,130.0,13.3,1022.3,0.0,2.6


In [121]:
test_data_merged.head()

Unnamed: 0,timestamp,farm_id,ingredient_type,id,num_processing_plants,farm_area,farming_company,deidentified_location
0,2017-01-01 00:00:00,fid_110884,ing_w,0,7,690.455096,Obery Farms,location 7369
1,2017-01-01 01:00:00,fid_110884,ing_w,129,7,690.455096,Obery Farms,location 7369
2,2017-01-01 02:00:00,fid_110884,ing_w,258,7,690.455096,Obery Farms,location 7369
3,2017-01-01 03:00:00,fid_110884,ing_w,387,7,690.455096,Obery Farms,location 7369
4,2017-01-01 04:00:00,fid_110884,ing_w,516,7,690.455096,Obery Farms,location 7369


In [122]:
#test_weather.drop('deidentified_location', axis=1, inplace=True)

In [123]:
test_final_merge = pd.merge(test_data_merged,test_weather,on = 'id',how = 'left')

In [124]:
test_final_merge.shape

(21251760, 14)

In [125]:
test_final_merge.head()

Unnamed: 0,timestamp,farm_id,ingredient_type,id,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed
0,2017-01-01 00:00:00,fid_110884,ing_w,0,7,690.455096,Obery Farms,location 7369,17.8,100.0,11.7,1021.4,,3.6
1,2017-01-01 01:00:00,fid_110884,ing_w,129,7,690.455096,Obery Farms,location 7369,11.7,0.0,10.6,1014.7,0.0,0.0
2,2017-01-01 02:00:00,fid_110884,ing_w,258,7,690.455096,Obery Farms,location 7369,22.2,,12.8,1027.0,0.0,2.1
3,2017-01-01 03:00:00,fid_110884,ing_w,387,7,690.455096,Obery Farms,location 7369,17.8,120.0,13.9,1024.4,0.0,2.1
4,2017-01-01 04:00:00,fid_110884,ing_w,516,7,690.455096,Obery Farms,location 7369,20.6,170.0,18.3,1007.6,0.0,4.1


In [126]:
test_final_merge.drop('farm_id', axis=1, inplace=True)

In [127]:
test_final_merge.isnull().sum()

timestamp                       0
ingredient_type                 0
id                              0
num_processing_plants           0
farm_area                       0
farming_company                 0
deidentified_location           0
temp_obs                 21111902
wind_direction           21118036
dew_temp                 21112044
pressure_sea_level       21122825
precipitation            21160021
wind_speed               21112100
dtype: int64

In [128]:
test_final_merge['Unix Sec'] = pd.to_datetime(test_final_merge['timestamp']).astype('int64')/ 10**9
test_final_merge['Unix Sec'] = test_final_merge['Unix Sec'].astype('int64')

In [129]:
test_final_merge.drop('timestamp', axis=1, inplace=True)

In [130]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [131]:
test_final_merge.farming_company = le.fit_transform(test_final_merge.farming_company)
test_final_merge['farming_company'] = test_final_merge['farming_company'].astype('category')

In [132]:
test_final_merge.deidentified_location = le.fit_transform(test_final_merge.deidentified_location)
test_final_merge['deidentified_location'] = test_final_merge['deidentified_location'].astype('category')

In [133]:
test_final_merge.ingredient_type = le.fit_transform(test_final_merge.ingredient_type)
test_final_merge['ingredient_type'] = test_final_merge['ingredient_type'].astype('category')

In [134]:
test_final_merge.tail()

Unnamed: 0,ingredient_type,id,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Unix Sec
21251755,0,41673195,7,8572.252713,14,3,,,,,,,1514444400
21251756,0,41673445,7,8572.252713,14,3,,,,,,,1514448000
21251757,0,41673695,7,8572.252713,14,3,,,,,,,1514451600
21251758,0,41673945,7,8572.252713,14,3,,,,,,,1514455200
21251759,0,41674195,7,8572.252713,14,3,,,,,,,1514458800


In [135]:
# Number of rows to drop 
n = 402960
# Removing last n rows 
Test_Data = test_final_merge.iloc[:-n] 
# Printing dataframe 
Test_Data

         ingredient_type        id  num_processing_plants    farm_area  \
0                      0         0                      7   690.455096   
1                      0       129                      7   690.455096   
2                      0       258                      7   690.455096   
3                      0       387                      7   690.455096   
4                      0       516                      7   690.455096   
...                  ...       ...                    ...          ...   
20848795               0  41673149                      7  3830.948108   
20848796               0  41673399                      7  3830.948108   
20848797               0  41673649                      7  3830.948108   
20848798               0  41673899                      7  3830.948108   
20848799               0  41674149                      7  3830.948108   

         farming_company deidentified_location  temp_obs  wind_direction  \
0                      8           

In [136]:
Test_Data.head()

Unnamed: 0,ingredient_type,id,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Unix Sec
0,0,0,7,690.455096,8,12,17.8,100.0,11.7,1021.4,,3.6,1483228800
1,0,129,7,690.455096,8,12,11.7,0.0,10.6,1014.7,0.0,0.0,1483232400
2,0,258,7,690.455096,8,12,22.2,,12.8,1027.0,0.0,2.1,1483236000
3,0,387,7,690.455096,8,12,17.8,120.0,13.9,1024.4,0.0,2.1,1483239600
4,0,516,7,690.455096,8,12,20.6,170.0,18.3,1007.6,0.0,4.1,1483243200


In [137]:
num_col = ['id','farm_area','temp_obs', 'wind_direction', 'dew_temp', 'pressure_sea_level', 
       'precipitation', 'wind_speed','Unix Sec']

In [138]:
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy = 'median')

imputed_data_test = pd.DataFrame(num_imputer.fit_transform(Test_Data[num_col]),
                               columns = num_col)

In [139]:
imputed_data_test.head()

Unnamed: 0,id,farm_area,temp_obs,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,Unix Sec
0,0.0,690.455096,17.8,100.0,11.7,1021.4,0.0,3.6,1483229000.0
1,129.0,690.455096,11.7,0.0,10.6,1014.7,0.0,0.0,1483232000.0
2,258.0,690.455096,22.2,190.0,12.8,1027.0,0.0,2.1,1483236000.0
3,387.0,690.455096,17.8,120.0,13.9,1024.4,0.0,2.1,1483240000.0
4,516.0,690.455096,20.6,170.0,18.3,1007.6,0.0,4.1,1483243000.0


In [140]:
col = ['id','temp_obs','dew_temp','wind_speed','wind_direction','Unix Sec','farm_area']

imputed_data_test[col] = imputed_data_test[col].astype('int64')

In [141]:
cat_col = ['ingredient_type','farming_company', 'deidentified_location']

In [142]:
Final_data_test = pd.merge(imputed_data_test, 
                         Test_Data[cat_col], 
                         left_index=True,
                         right_index=True)

In [143]:
Final_data_test.shape

(20848800, 12)

In [145]:
type(Test_Data)

pandas.core.frame.DataFrame

In [148]:
test_pred = RF_reg.predict(Final_data_test)

In [149]:
type(test_pred)

numpy.ndarray

In [150]:
test_pred

array([192.31765926, 211.28238867, 182.03957236, ..., 352.25909046,
       352.25909046, 352.25909046])

In [151]:
#rounding off the data.decimals to 3 point distance
test_pred_1 = np.round(test_pred,3)

In [152]:
test_pred_1[0:10]

array([192.318, 211.282, 182.04 , 195.529, 208.423, 195.304, 210.093,
       187.725, 187.308, 193.12 ])

# Submissions

In [153]:
sample_sub = pd.read_csv("C:/Users/Admin/OneDrive/Desktop/Capstone/sample_submission.csv")

In [154]:
sample_sub.shape

(20848800, 2)

In [155]:
sample_sub.head()

Unnamed: 0,id,yield
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [156]:
sample_sub.tail()

Unnamed: 0,id,yield
20848795,41696845,0
20848796,41696846,0
20848797,41696847,0
20848798,41696848,0
20848799,41696849,0


In [157]:
submission  = pd.DataFrame({'id': test_data.id, 'yield': test_pred_1})

In [158]:
submission.head()

Unnamed: 0,id,yield
0,0,192.318
1,1,211.282
2,2,182.04
3,3,195.529
4,4,208.423


In [159]:
submission.tail()

Unnamed: 0,id,yield
20848795,41696845,352.259
20848796,41696846,352.259
20848797,41696847,352.259
20848798,41696848,352.259
20848799,41696849,352.259


In [161]:
#Converting the Dataframe to a csv file.
Final_Submission_csv = submission.to_csv('Final_Submission.csv', index = True)