In [18]:
# import your libraries here
import multiprocessing
from collections.abc import Iterator

import keras
import numpy as np
import pandas as pd

# Importing utility functions from Keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Input, InputLayer, Normalization
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [107]:
# Load the preprocessed dataset. Later cells read its "Year", "Country" and
# "total_yield" columns.
data = pd.read_csv("processed_data.csv")
# Uncomment to see how many rows each year contributes:
# print(data["Year"].value_counts())



In [116]:
from time import time_ns

def get_X_and_y(df : pd.DataFrame, y_col_name:str) -> tuple[np.ndarray,np.ndarray]:
    X = df.copy()
    # country_nums = list(set(list(data["Country"])))
    # onehot_countries = to_categorical(data["Country"].apply(lambda x: country_nums.index(x)),len(set(list(data["Country"]))))
    X.pop("Country")
    y = X.pop(y_col_name) 
    return X.__array__().astype("float32"), y.__array__().astype("float32")

def data_generator(X, y, batch_size: int) -> Iterator[tuple]:
    '''
    Yield consecutive (features, targets) batches from X and y.

    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/

    The data is walked once, front to back, in steps of batch_size; the
    final batch is shorter when len(y) is not a multiple of batch_size.
    No shuffling or label encoding is applied.

    Args:
        X: Sliceable sequence of feature rows (e.g. a numpy array or list).
        y: Sliceable sequence of targets; its length decides how many
            batches are produced.
        batch_size: Maximum number of rows per batch; must be positive.

    Yields:
        Tuples (X[start:stop], y[start:stop]).

    Raises:
        ValueError: If batch_size is not positive (raised when the first
            batch is requested, because this is a generator).
    '''
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    for start in range(0, len(y), batch_size):
        # Slicing clamps at the end of the sequence, so no explicit min() is needed.
        stop = start + batch_size
        yield X[start:stop], y[start:stop]




In [113]:
def train_model(X: np.ndarray, y: np.ndarray, learning_rate = 0.01, epochs: int = 100, batch_size: int = 32):
    """Build, compile and fit a feed-forward regression network.

    Architecture: Normalization -> Dense(2000) -> Dense(1500) -> Dense(1000)
    -> Dense(500) (all ReLU) -> Dense(1). The model is optimised with Adam
    on mean absolute error.

    Args:
        X: 2-D feature array; ``X.shape[1]`` fixes the network's input width.
        y: Target values, one per row of ``X``.
        learning_rate: Step size handed to the Adam optimizer.
        epochs: Number of passes over the training data.
        batch_size: Samples per gradient update. The default of 32 is the
            value Keras uses when no batch size is given, which is what the
            previous version of this function relied on.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    # Define the model architecture using Keras Sequential API
    model = Sequential()
    # setup normalization layer as input layer - DON'T TOUCH THIS LAYER
    # NOTE(review): axis=None makes the layer learn a single scalar mean and
    # variance shared by every feature; per-feature scaling would need axis=-1.
    # Left as is because of the instruction above - confirm this is intended.
    normalizer = Normalization(input_shape=(X.shape[1],),axis=None)
    # adapt for our training input data
    normalizer.adapt(X)
    model.add(normalizer)

    # configure and add hidden layers - TWEAK THESE LAYERS (modify, add, or remove)
    model.add(Dense(2000, activation='relu'))
    model.add(Dense(1500, activation='relu'))
    model.add(Dense(1000, activation='relu'))
    model.add(Dense(500, activation='relu'))
    # add output layer: a single linear unit for the regression target
    model.add(Dense(1))
    model.summary()

    # setup model for learning
    model.compile(optimizer=Adam(learning_rate=learning_rate), # TRY DIFFERENT LEARNING RATES
                loss='mean_absolute_error')

    # train the model
    model.fit(x=X, y=y,
            batch_size=batch_size,
            epochs=epochs) # TWEAK VIA THE epochs / batch_size ARGUMENTS

    return model

In [119]:
# split data into training and test
# Hold out the 2018 rows as the test set; every other year is used for training.
is_test_year = data["Year"] == 2018
test_data = data.loc[is_test_year, :]
train_data = data.loc[~is_test_year, :]
# Turn each split into float32 numpy arrays (features, target).
test_X, test_y = get_X_and_y(test_data, "total_yield")
train_X, train_y = get_X_and_y(train_data, "total_yield")
# Show the training targets as a quick sanity check.
train_y

array([ 69.81,  68.46,  71.04, ..., 125.17, 138.35, 132.23], dtype=float32)

In [121]:
# Fit the network on the training split with an explicit learning rate.
m = train_model(train_X, train_y, learning_rate=0.01)


Model: "sequential_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_28 (Normaliza  (None, 11)               3         
 tion)                                                           
                                                                 
 dense_102 (Dense)           (None, 2000)              24000     
                                                                 
 dense_103 (Dense)           (None, 1500)              3001500   
                                                                 
 dense_104 (Dense)           (None, 1000)              1501000   
                                                                 
 dense_105 (Dense)           (None, 500)               500500    
                                                                 
 dense_106 (Dense)           (None, 1)                 501       
                                                     

In [104]:
# predict() returns one single-element row per sample; keep only the scalar.
raw_predictions = m.predict(test_X)
preds = [row[0] for row in raw_predictions]
# Per-sample comparison, if needed: iterate over zip(preds, test_y).
test_mae = mean_absolute_error(y_true=test_y, y_pred=preds)
print(test_mae)

predicted: 60.787689208984375 , y_true: 87.14
predicted: 76.16443634033203 , y_true: 48.55999999999999
predicted: 153.47262573242188 , y_true: 144.91
predicted: 136.99417114257812 , y_true: 131.9
predicted: 107.7380142211914 , y_true: 106.07
predicted: 106.1572036743164 , y_true: 75.77
predicted: -0.003213865915313363 , y_true: 0.0
predicted: 123.22398376464844 , y_true: 110.55
predicted: 122.5222396850586 , y_true: 119.81
predicted: 105.61555480957031 , y_true: 131.25999999999996
predicted: 75.26773834228516 , y_true: 65.62
predicted: 158.30555725097656 , y_true: 161.57000000000002
predicted: 129.58912658691406 , y_true: 129.0
predicted: 115.05950927734375 , y_true: 131.59
predicted: 66.48392486572266 , y_true: 71.24
predicted: -0.0032326113432645798 , y_true: 0.0
predicted: 134.07882690429688 , y_true: 143.47000000000003
predicted: 119.87445068359375 , y_true: 117.89
predicted: 120.4302749633789 , y_true: 111.77
predicted: 113.5545883178711 , y_true: 125.04
predicted: 82.780464172363

## HYPERPARAMETER TUNING

In [1]:
# TODO: Test out different learning rates

# TODO: Test out different numbers of epochs
