# Milestone 3: Preprocessing

In the previous milestone, we merged the **TCG** and **Price Guide** datasets. We saved the resulting dataframe as a `.csv` file. For this milestone, we will first begin by normalizing our data and encoding our categorical variables. We will then train and evaluate our first model. Let's begin my importing important libraries. 

### Imports

In [13]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
from scipy.stats import zscore

### Data Loading

In [8]:
price_guide_tcg = pd.read_csv("price_guide_tcg_cleaned.csv", low_memory=False)
price_guide_tcg.describe()

Unnamed: 0,id,loose-price,graded-price,box-only-price,manual-only-price,bgs-10-price,condition-17-price,condition-18-price,sales-volume,hp
count,30300.0,30132.0,23242.0,23345.0,22502.0,22504.0,22653.0,22507.0,30300.0,25631.0
mean,1815578.0,9.113746,40.4734,50.029936,169.867231,260.75189,144.303368,101.864726,48.108482,96.110569
std,1610918.0,57.19715,203.630556,280.75307,1698.911993,2705.228337,641.144056,1019.301576,102.838685,52.140362
min,25228.0,0.01,0.14,0.99,0.98,0.99,1.0,1.0,1.0,10.0
25%,888708.8,1.34,12.06,13.0,34.91,52.0,44.0,21.0,5.0,60.0
50%,959665.5,1.99,16.065,18.0,42.72,65.0,50.0,26.0,12.0,80.0
75%,2254008.0,4.2925,29.8075,33.0,90.045,137.0,99.99,54.0,38.0,120.0
max,7747251.0,4223.26,17033.54,27550.0,221599.74,332400.0,42316.0,132960.0,1837.0,340.0


In [10]:
price_guide_tcg.columns

Index(['tcg_id', 'id', 'console-name', 'product-name', 'loose-price',
       'graded-price', 'box-only-price', 'manual-only-price', 'bgs-10-price',
       'condition-17-price', 'condition-18-price', 'sales-volume',
       'release-date', 'set', 'series', 'publisher', 'generation',
       'release_date', 'artist', 'name', 'set_num', 'types', 'supertype',
       'subtypes', 'hp', 'weaknesses', 'rarity', 'legalities', 'resistances'],
      dtype='object')

## Categorical Variable Encoding

In [15]:
categorical_columns = [
    'tcg_id', 'console-name', 'product-name', 'release-date', 'set',
    'series', 'publisher', 'generation', 'artist', 'name', 'set_num',
    'types', 'supertype', 'subtypes', 'weaknesses', 'rarity',
    'legalities', 'resistances'
]

In [17]:
for col in categorical_columns:
    if col in price_guide_tcg.columns:
        le = LabelEncoder()
        price_guide_tcg[col] = le.fit_transform(price_guide_tcg[col].astype(str))

## Price Encoding

In [20]:
price_columns = [
    'loose-price', 'graded-price', 'box-only-price', 'manual-only-price',
    'bgs-10-price', 'condition-17-price', 'condition-18-price'
]

In [23]:
zscore_threshold = 3
for col in price_columns:
    if col in price_guide_tcg.columns:
        price_guide_tcg = price_guide_tcg[(np.abs(zscore(price_guide_tcg[col], nan_policy='omit')) < zscore_threshold)]


In [25]:
price_guide_tcg

Unnamed: 0,tcg_id,id,console-name,product-name,loose-price,graded-price,box-only-price,manual-only-price,bgs-10-price,condition-17-price,...,name,set_num,types,supertype,subtypes,hp,weaknesses,rarity,legalities,resistances
3,16290,959175.0,0,101,1.54,15.11,17.0,38.94,58.0,51.0,...,10,282,36,2,102,,63,30,4,26
4,16244,959046.0,0,612,3.24,28.09,31.0,41.00,62.0,15.8,...,104,210,25,1,5,170.0,19,12,4,18
5,16311,959105.0,0,613,9.77,32.76,36.0,109.86,165.0,143.0,...,104,307,25,1,5,170.0,19,26,4,18
6,16292,959088.0,0,618,1.25,14.17,16.0,37.30,56.0,48.0,...,100,284,36,2,56,,63,30,4,26
7,16292,959176.0,0,619,1.54,7.85,9.0,38.29,57.0,50.0,...,100,284,36,2,56,,63,30,4,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30288,15240,806714.0,133,27986,4.52,27.33,30.0,62.54,94.0,81.0,...,3859,320,8,1,5,170.0,46,12,4,2
30289,15219,806575.0,133,28110,2.76,25.95,29.0,190.75,286.0,248.0,...,3874,295,4,1,31,130.0,41,8,4,3
30292,15147,886726.0,133,28117,16.00,50.00,55.0,60.00,90.0,78.0,...,3876,60,4,1,5,170.0,41,26,4,3
30293,15220,806576.0,133,28118,4.11,23.00,25.0,59.13,89.0,77.0,...,3876,296,4,1,5,170.0,41,12,4,3
