In [1]:
from os import walk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

## Loading and processing data

In [4]:
# Load the dataset from a CSV file resolved relative to the notebook's working directory.
# NOTE(review): the file name suggests this is the output of an earlier project step — confirm provenance.
df = pd.read_csv(filepath_or_buffer='project1_output.csv')

#### Converting columns back to their appropriate types, because the dtypes were lost in the CSV export

In [8]:
# Restore the intended dtype of every column in a single pass.
# Label-like columns become pandas categoricals; count-like columns become int64.
# NOTE(review): the int64 casts are expected to raise if any of those columns
# contain missing values — confirm the CSV has none.
df = df.astype({
    # categorical columns
    "product_category_name": 'category',
    "order_status": 'category',
    "review_score": 'category',
    "payment_type": 'category',
    "customer_zip_code_prefix": 'category',
    "customer_city": 'category',
    "customer_state": 'category',
    "seller_zip_code_prefix": 'category',
    "seller_city": 'category',
    "seller_state": 'category',
    # integer columns
    "product_name_lenght": 'int64',
    "product_description_lenght": 'int64',
    "product_photos_qty": 'int64',
    "payment_installments": 'int64',
    "payment_sequential": 'int64',
})

In [21]:
# Display each column's dtype to check the result of the casts above.
df.dtypes

order_item_id                    int64
price                          float64
freight_value                  float64
product_category_name         category
product_name_lenght              int64
product_description_lenght       int64
product_photos_qty               int64
product_weight_g               float64
product_length_cm              float64
product_height_cm              float64
product_width_cm               float64
order_status                  category
review_score                  category
payment_sequential               int64
payment_type                  category
payment_installments             int64
customer_zip_code_prefix      category
customer_city                 category
customer_state                category
seller_zip_code_prefix        category
seller_city                   category
seller_state                  category
payment_value_norm             float64
volume                         float64
dtype: object

#### Dropping unnecessary columns
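We only want to work with numerical values (plus the categorical columns, which are one-hot encoded below), so we drop every column that is still of `object` dtype.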

In [11]:
# Keep every column except those still typed as `object`;
# numeric and categorical columns are retained.
df = df.select_dtypes(exclude=["object"])

In [14]:
# Display the column labels that remain after excluding the object-dtype columns.
df.columns

Index(['order_item_id', 'price', 'freight_value', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'order_status', 'review_score',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'customer_zip_code_prefix', 'customer_city',
       'customer_state', 'seller_zip_code_prefix', 'seller_city',
       'seller_state', 'payment_value_norm', 'volume'],
      dtype='object')

We'll also drop the `payment_value` column, because otherwise our model could simply infer the target value (`freight_value`) by subtracting the `price` column from it.

In [None]:
# Remove `payment_value` so it cannot leak the target into the features.
# Note: this raises KeyError if the column is already gone (e.g. when the cell is re-run).
df = df.drop('payment_value', axis=1)

### Feature engineering

In [27]:
# One-hot encode the categorical columns (pd.get_dummies converts object/category
# columns by default), then discard any column that is still category-typed.
# NOTE(review): the zip-code prefix and city columns were cast to category earlier,
# so they are one-hot encoded here too; this may produce a very wide frame — confirm
# that is intended.
df = pd.get_dummies(df).select_dtypes(exclude=['category'])

## Picking column for prediction

In [22]:
# Name of the column the model is trained to predict.
TARGET_VALUE = "freight_value"

## Separating training and test data

In [29]:
import logging

from sklearn.model_selection import train_test_split


# Configure root logging to emit WARNING and above, then create this notebook's logger.
# logging.WARNING is the documented name for the level that logging.WARN aliases.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(name=__name__)

In [31]:
# Split the frame into features and target using the TARGET_VALUE constant
# (defined in the "Picking column for prediction" cell) instead of a repeated
# string literal, so changing the target only requires editing that constant.
x_df = df.drop(columns=[TARGET_VALUE])
y_df = df[TARGET_VALUE]

# Hold out a test set. random_state is fixed so the split is reproducible;
# test_size is left at scikit-learn's default.
Xtrain, Xtest, ytrain, ytest = train_test_split(x_df, y_df, random_state=1)