# Feature Engineering for Game Sales Data

### Import Data and Libraries

In [5]:
import pandas as pd

# MinMaxScaler will convert a sequence of number and
# scale to a range between 0 and 1. For example, if the
# smallest and biggest numbers in a dataset are 20000 and 100000, respectively,
# MinMaxScaler will convert these to 0 and 1. 
from sklearn.preprocessing import MinMaxScaler

In [6]:
trainDf = pd.read_csv('./sales_data_train.csv')
testDf = pd.read_csv('./sales_data_test.csv')

In [7]:
trainDf.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3.5,1,0,1,0,1,0,0,132717,59.99
1,4.5,0,0,0,0,1,1,0,83407,49.99
2,3.0,0,0,0,0,1,1,0,62423,49.99
3,4.5,1,0,0,0,0,0,1,69889,39.99
4,4.0,1,0,1,0,1,0,1,161382,59.99


In [8]:
# Instantiate the scaler
scaler = MinMaxScaler(feature_range=(0, 1))

In [9]:
# .fit_transform() will apply (1) feature engineering, (2) training, and (3) validation
# in one method.
train_scaled = scaler.fit_transform(trainDf)
test_scaled = scaler.fit_transform(testDf)

In [10]:
train_scaled

array([[0.5       , 1.        , 0.        , ..., 0.        , 0.37471396,
        1.        ],
       [0.83333333, 0.        , 0.        , ..., 0.        , 0.19242528,
        0.5       ],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.11485185,
        0.5       ],
       ...,
       [0.83333333, 0.        , 1.        , ..., 0.        , 0.61007375,
        1.        ],
       [0.5       , 1.        , 1.        , ..., 1.        , 0.24626902,
        0.        ],
       [0.33333333, 1.        , 0.        , ..., 1.        , 0.21633242,
        0.5       ]])

In [11]:
# Print the numbers that were used to scale the data
print(
    "Multiply by {:.10f} and Add {:.10f}".format( 
        scaler.scale_[8], 
        scaler.min_[8] 
      )
)

# Multiply by 0.0000042367 and Add -0.1534149886

Multiply by 0.0000042367 and Add -0.1534149886


In [13]:
# Export the scales data for the next notebook
trainDf_scaled = pd.DataFrame(train_scaled, columns=trainDf.columns.values)
testDf_scaled = pd.DataFrame(test_scaled, columns=testDf.columns.values)

# Set index to exclude the row number
trainDf_scaled.to_csv('./sales_data_train_scaled.csv', index=False)
testDf_scaled.to_csv('./sales_data_test_scaled.csv', index=False)