In [None]:
from os import walk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

## Loading and processing data

In [None]:
# Read the exported CSV into a DataFrame (the path is relative to the working directory).
df = pd.read_csv('project1_output.csv')

#### Converting columns back to their appropriate types, because the dtypes were lost in the CSV export

In [None]:
# CSV carries no dtype information, so the intended column types are re-declared here.
categorical_columns = [
    "product_category_name",
    "order_status",
    "review_score",
    "payment_type",
    "customer_zip_code_prefix",
    "customer_city",
    "customer_state",
    "seller_zip_code_prefix",
    "seller_city",
    "seller_state",
]
integer_columns = [
    "product_name_lenght",         # (sic) spelled this way in the data
    "product_description_lenght",  # (sic)
    "product_photos_qty",
    "payment_installments",
    "payment_sequential",
]

# Build a single column -> dtype mapping and cast everything in one pass.
column_dtypes = dict.fromkeys(categorical_columns, 'category')
column_dtypes.update(dict.fromkeys(integer_columns, 'int64'))
df = df.astype(column_dtypes)

In [None]:
# Show the dtype of every column to confirm the conversions took effect.
df.dtypes

#### Dropping unnecessary columns
We only want to work with numerical and categorical values, so we drop every remaining `object`-typed column.

In [None]:
# Discard every object-typed column; numeric and categorical columns are kept.
df = df.select_dtypes(exclude='object')

In [None]:
# List the columns that remain after the dtype filter.
df.columns

We'll also drop the `payment_value` column, because our model could otherwise infer the target value simply by taking the difference between it and the `price` column.

In [None]:
# Remove the column that would leak the target to the model.
df = df.drop(columns='payment_value')

### Feature engineering

Here we'll one-hot encode all of our categorical columns, and then drop the original ones

In [None]:
# One-hot encode every categorical column; get_dummies replaces the originals
# with indicator columns.
# The indicator dtype is pinned explicitly: pandas >= 2.0 defaults to bool, and
# a frame that mixes bool with numeric columns converts to an object-dtype NumPy
# array, which the Keras layers used further down cannot consume. 'uint8' is the
# pre-2.0 default, so results are unchanged on older pandas.
df = pd.get_dummies(df, dtype='uint8')
# Safeguard: make sure no original categorical column is left behind.
df = df.select_dtypes(exclude=['category'])

In [None]:
# Dump every feature name, one per line, followed by the total number of features.
for feature_name in df.columns:
    print(feature_name)
print(df.shape[1])

Even though we generated over 22000 columns this way, we believe that our model will be powerful enough to filter out any unnecessary data.

## Picking column for prediction

We chose the `freight_value` column as our target: we will perform a regression to try to predict its value from all of the other columns we have available.

In [None]:
# Name of the column the models will be trained to predict.
TARGET_VALUE = 'freight_value'

In [None]:
# Keep the target values in their own variable, separate from the feature frame.
target_col = df[TARGET_VALUE]

In [None]:
# Preview the target values.
target_col

In [None]:
# From here on `df` holds the input features only; the target must not be among them.
df = df.drop(columns=TARGET_VALUE)

## Separating training and test data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# A fixed seed keeps the train/test partition identical from one run to the next.
SPLIT_SEED = 23
x_train, x_test, y_train, y_test = train_test_split(
    df,
    target_col,
    random_state=SPLIT_SEED,
)

# Picking 4 ML algorithms

We'll use the following 4 algorithms:

1. Linear regression
2. Multilayer perceptron (a shallow one)
3. Multilayer perceptron (a deep one, AKA DNN)
4. **TODO:** still to be decided (random forest or SVM?)

## Linear regression
Let's start off with linear regression, which is the simplest algorithm in our selection and will serve as a baseline for the following algorithms.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

In [None]:
# Normalization layer over the last axis; its statistics are fitted on the
# training split only, so the test split stays unseen.
train_features = np.array(x_train)
normalizer = preprocessing.Normalization(axis=-1)
normalizer.adapt(train_features)
del train_features  # the dense copy is only needed while adapting

Since a linear regression model can basically be seen as a single-unit, single-layer MLP, we'll create it as follows:

In [None]:
# Linear regression expressed as a network: normalised inputs feeding a single
# Dense unit.
regression_model = keras.Sequential()
regression_model.add(normalizer)
regression_model.add(layers.Dense(units=1))

regression_model.summary()

In [None]:
# Train with Adam, minimising the mean absolute error.
adam_optimizer = keras.optimizers.Adam(learning_rate=0.1)
regression_model.compile(optimizer=adam_optimizer, loss='mean_absolute_error')

In [None]:
# Sanity check: features and targets must line up row-for-row in both splits.
# Assertions stop a full notebook run on a mismatch, whereas a bare comparison
# would only display False and let training proceed.
assert len(x_train) == len(y_train), "train features/targets length mismatch"
assert len(x_test) == len(y_test), "test features/targets length mismatch"

In [None]:
# Fit the baseline model; the per-epoch losses are kept in `history`.
# NOTE(review): the test split doubles as validation data here.
N_EPOCHS = 100
history = regression_model.fit(
    x=x_train,
    y=y_train,
    epochs=N_EPOCHS,
    validation_data=(x_test, y_test),
)