In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load and prepare the data

In [3]:
# Location of the dataset CSV, given relative to the working directory.
data_path = 'data/Bewässerungszeiten-Datensatz_small.csv'
# Load the CSV into a DataFrame; later cells derive `data` from `rides`.
rides = pd.read_csv(data_path)

In [4]:
# Preview the first rows of the loaded data.
rides.head()

Unnamed: 0,instant,season,yr,mnth,hr,b12h_weathersit,b9h_weathersit,b6h_weathersit,b3h_weathersit,crnt_weathersit,...,a6h_weathersit,a9h_weathersit,a12h_weathersit,temp,atemp,hum,soil_hum,windspeed,CNN_decision,conclusion
0,1,1,0,1,0,1,1,1,1,1,...,1,1,1,0.24,0.2879,0.81,0.81,0.0,1,1
1,2,1,0,1,1,1,1,1,1,1,...,1,1,2,0.22,0.2727,0.8,0.8,0.0,1,1
2,3,1,0,1,2,1,1,1,1,1,...,1,1,2,0.22,0.2727,0.8,0.8,0.0,1,1
3,4,1,0,1,3,1,1,1,1,1,...,1,1,2,0.24,0.2879,0.75,0.75,0.0,1,1
4,5,1,0,1,4,1,1,1,1,1,...,1,2,2,0.24,0.2879,0.75,0.75,0.0,1,1


### One-Hot Encoding for some categorical variables

In [5]:
# Skip this cell if you need One-Hot Encoding.
# Without encoding, the only preparation is removing the `instant` column.
data = rides.drop(columns=['instant'])
data.head()

Unnamed: 0,season,yr,mnth,hr,b12h_weathersit,b9h_weathersit,b6h_weathersit,b3h_weathersit,crnt_weathersit,a3h_weathersit,a6h_weathersit,a9h_weathersit,a12h_weathersit,temp,atemp,hum,soil_hum,windspeed,CNN_decision,conclusion
0,1,0,1,0,1,1,1,1,1,1,1,1,1,0.24,0.2879,0.81,0.81,0.0,1,1
1,1,0,1,1,1,1,1,1,1,1,1,1,2,0.22,0.2727,0.8,0.8,0.0,1,1
2,1,0,1,2,1,1,1,1,1,2,1,1,2,0.22,0.2727,0.8,0.8,0.0,1,1
3,1,0,1,3,1,1,1,1,1,1,1,1,2,0.24,0.2879,0.75,0.75,0.0,1,1
4,1,0,1,4,1,1,1,1,1,1,1,2,2,0.24,0.2879,0.75,0.75,0.0,1,1


### Scaling feature variables
To make training the network easier, we'll standardize each of the continuous variables. That is, we'll shift and scale the variables such that they have zero mean and a standard deviation of 1.

The scaling factors are saved so we can go backwards when we use the network for predictions.

In [6]:
quant_features = ['temp', 'atemp', 'hum', 'soil_hum', 'windspeed']
# Store scalings in a dictionary so we can convert back later.
# NOTE: the statistics cover every row, including the rows that are later
# held out as the test set.
scaled_features = {}
for each in quant_features:
    # Read the raw values from the untouched `rides` frame rather than from
    # `data`, which this loop overwrites. Re-running the cell therefore
    # reproduces the same [mean, std] pairs instead of the statistics of
    # already-standardised columns.
    mean, std = rides[each].mean(), rides[each].std()
    scaled_features[each] = [mean, std]
    # Assignment aligns on the index, so it also works if `data` holds only
    # a subset of the rows of `rides`.
    data.loc[:, each] = (rides[each] - mean) / std

In [7]:
# Show the stored [mean, std] pair for each scaled feature.
scaled_features

{'temp': [0.47127653783341417, 0.19121228452565855],
 'atemp': [0.45056751497006386, 0.168274712014804],
 'hum': [0.7264099074578183, 0.22062919459565],
 'soil_hum': [0.7264099074578183, 0.22062919459565],
 'windspeed': [0.1844495508981978, 0.12854041180419254]}

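### Shuffle data points

_No shuffling is applied: the rows keep their original order, and the split below holds out the last 500 rows._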

### Splitting the data into training and testing sets (validation is handled by cross-validation during the hyper-parameter search)

In [8]:
# Hold out the last 500 rows as the test set (rows are taken in their
# existing order; nothing is shuffled).
n_test = 500
test_data = data[-n_test:]

# Keep the remaining rows under a separate name instead of rebinding `data`,
# so re-running this cell does not drop another 500 rows each time.
train_data = data[:-n_test]

# Separate the data into features and targets.
# `.squeeze()` turns the single-column target frame into a Series.
target_fields = ['conclusion']
X_train, y_train = train_data.drop(target_fields, axis=1), train_data[target_fields].squeeze()
X_test, y_test = test_data.drop(target_fields, axis=1), test_data[target_fields].squeeze()

In [9]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,season,yr,mnth,hr,b12h_weathersit,b9h_weathersit,b6h_weathersit,b3h_weathersit,crnt_weathersit,a3h_weathersit,a6h_weathersit,a9h_weathersit,a12h_weathersit,temp,atemp,hum,soil_hum,windspeed,CNN_decision
0,1,0,1,0,1,1,1,1,1,1,1,1,1,-1.209528,-0.966678,0.378871,0.378871,-1.434954,1
1,1,0,1,1,1,1,1,1,1,1,1,1,2,-1.314123,-1.057007,0.333546,0.333546,-1.434954,1
2,1,0,1,2,1,1,1,1,1,2,1,1,2,-1.314123,-1.057007,0.333546,0.333546,-1.434954,1
3,1,0,1,3,1,1,1,1,1,1,1,1,2,-1.209528,-0.966678,0.106922,0.106922,-1.434954,1
4,1,0,1,4,1,1,1,1,1,1,1,2,2,-1.209528,-0.966678,0.106922,0.106922,-1.434954,1


### Apply Random Forest algorithm ([reference notebook](https://www.kaggle.com/code/faressayah/decision-trees-random-forest-for-beginners/notebook))

In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for ``clf``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : features and labels of the training split.
    X_test, y_test : features and labels of the test split.
    train : bool, default True
        When truthy the training split is evaluated, otherwise the test
        split. Any falsy value (``False``, ``0``, ``None``) selects the test
        split instead of silently printing nothing.

    Returns
    -------
    None. The report is written to stdout.
    """
    # Pick the split once so the reporting code below is shared by both cases.
    if train:
        label, X, y = "Train", X_train, y_train
    else:
        label, X, y = "Test", X_test, y_test

    pred = clf.predict(X)
    clf_report = pd.DataFrame(classification_report(y, pred, output_dict=True))
    print(f"{label} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y, pred)}\n")

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [12]:
# Candidate values for each hyper-parameter sampled by the randomized search.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# 'auto' was removed in scikit-learn 1.3, and for classifiers it was only an
# alias of 'sqrt'. Replace it with an option that is both valid and distinct.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)  # None lets the trees grow without a depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}

rf_clf = RandomForestClassifier(random_state=42)

# 100 sampled candidates x 3 cross-validation folds, ranked by F1 score.
# n_jobs=-1 uses every available core.
rf_cv = RandomizedSearchCV(estimator=rf_clf, scoring='f1', param_distributions=random_grid,
                           n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

rf_cv.fit(X_train, y_train)
rf_best_params = rf_cv.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [13]:
# Report the winning hyper-parameter combination from the search.
print("Best paramters: {}".format(rf_best_params))

Best paramters: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}


In [14]:
# Fit the final forest with hyper-parameter values copied from the
# randomized-search output, so this cell can run without repeating the search.
# random_state is fixed so the fitted forest, and the scores printed below,
# are reproducible, matching the seed used for the search.
rf_clf = RandomForestClassifier(n_estimators=400, min_samples_split=5, min_samples_leaf=1,
                                max_features='sqrt', max_depth=30, bootstrap=True,
                                random_state=42)
rf_clf.fit(X_train, y_train)

# Now, use best parameters for reports
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                0       1  accuracy  macro avg  weighted avg
precision     1.0     1.0       1.0        1.0           1.0
recall        1.0     1.0       1.0        1.0           1.0
f1-score      1.0     1.0       1.0        1.0           1.0
support    3261.0  3587.0       1.0     6848.0        6848.0
_______________________________________________
Confusion Matrix: 
 [[3261    0]
 [   0 3587]]

Test Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
               0     1  accuracy  macro avg  weighted avg
precision    1.0   1.0       1.0        1.0           1.0
recall       1.0   1.0       1.0        1.0           1.0
f1-score     1.0   1.0       1.0        1.0           1.0
support    476.0  24.0       1.0      500.0         500.0
_______________________________________________
Confusion Matrix: 
 [[476   0]
 [  0  24]]



### Save and Load The Model

In [15]:
import pickle

# Save the model to disk. The context manager closes the file handle even if
# pickling fails, so the file is fully written before it is read back.
filename = 'models/random_forest_model_07-06-22_09-33.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_clf, model_file)

# Load the model from disk and score it on the test split.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(f"{result*100}%")

100.0%


### Test model with hand-made data

In [16]:
def decider(clf, CNN_decision, s_humidity, csv_path = 'data/temporary_data_.csv'):
    """Return the classifier's prediction for one hand-made sample.

    Every feature except ``CNN_decision`` and the two humidity columns is
    fixed to the constant written below. The continuous features are
    standardised with the [mean, std] pairs stored in the module-level
    ``scaled_features`` dictionary before prediction.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    CNN_decision : value placed in the ``CNN_decision`` column.
    s_humidity : raw (unscaled) value used for both ``hum`` and ``soil_hum``.
    csv_path : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    The single predicted value as a plain Python scalar.
    """
    # Key order defines the column order of the frame passed to the model;
    # it mirrors the column order of the training features.
    example = {
        'season': 2,
        'yr': 0,
        'mnth': 7,
        'hr': 18,
        'b12h_weathersit': 2,
        'b9h_weathersit': 2,
        'b6h_weathersit': 2,
        'b3h_weathersit': 2,
        'crnt_weathersit': 2,
        'a3h_weathersit': 2,
        'a6h_weathersit': 2,
        'a9h_weathersit': 2,
        'a12h_weathersit': 2,
        'temp': 0.5,
        'atemp': 0.5,
        'hum': s_humidity,
        'soil_hum': s_humidity,
        'windspeed': 0.2,
        'CNN_decision': CNN_decision,
    }
    # Standardise before building the frame. This avoids assigning float
    # values into a column that pandas inferred as integer, which happens
    # when an integer humidity is passed.
    for each in quant_features:
        mean, std = scaled_features[each]
        example[each] = (example[each] - mean) / std
    # One-row DataFrame holding the prepared sample.
    sample = pd.DataFrame([example])
    # Run the model and unwrap the single prediction.
    decision = clf.predict(sample)

    return decision.item()

In [18]:
# Try the fitted forest on a hand-made sample: CNN decision 0, humidity 0.9.
decider(rf_clf, CNN_decision=0, s_humidity=0.9, csv_path='data/temporary_data_.csv')

0