# Transforming Data into Features

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
#import data
reviews = pd.read_csv('reviews.csv')

In [3]:
#print column names
print(reviews.columns)
 
#print .info
print(reviews.info())

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null 

In [6]:
#look at the counts of recommended
print(reviews['Recommended IND'].value_counts())

1    19314
0     4172
Name: Recommended IND, dtype: int64


In [7]:
#create binary dictionary
binary_dict = {True:1, False:0}
 
#transform column
reviews['Recommended IND'] = reviews['Recommended IND'].map(binary_dict)
 
#print your transformed column
print(reviews['Recommended IND'].value_counts())


Series([], Name: Recommended IND, dtype: int64)


In [8]:
#look at the counts of rating
print(reviews['Rating'].value_counts())


5    13131
4     5077
3     2871
2     1565
1      842
Name: Rating, dtype: int64


In [9]:
#create dictionary
rating_dict = {'Loved it':5, 'Liked it':4, 'Was ok':3, 'Not great':2, 'Hated it':1}
 
#transform rating column
reviews['Rating'] = reviews['Rating'].map(rating_dict)
 
#print your transformed column values
print(reviews['Rating'].value_counts())

Series([], Name: Rating, dtype: int64)


In [10]:
#get the number of categories in a feature
print(reviews['Department Name'].value_counts()) 

Tops        10468
Dresses      6319
Bottoms      3799
Intimate     1735
Jackets      1032
Trend         119
Name: Department Name, dtype: int64


In [11]:
#perform get_dummies
one_hot = pd.get_dummies(reviews['Department Name'])
 
#join the new columns back onto the original
reviews = reviews.join(one_hot)

#print column names
print(reviews.columns)

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name', 'Bottoms', 'Dresses', 'Intimate',
       'Jackets', 'Tops', 'Trend'],
      dtype='object')


In [14]:
#get numerical columns
reviews = reviews[['Clothing ID', 'Age', 'Recommended IND', 'Rating', 'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']].copy()

 
#reset index
reviews = reviews.set_index(reviews['Clothing ID'])

#instantiate standard scaler
scaler = StandardScaler()

#fit transform data
scaler.fit_transform(reviews)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


array([[-0.74334818, -0.83054886,         nan, ..., -0.21438431,
        -0.89672592, -0.07136282],
       [ 0.79628898, -0.74911087,         nan, ..., -0.21438431,
        -0.89672592, -0.07136282],
       [ 0.78153208,  1.36827674,         nan, ..., -0.21438431,
        -0.89672592, -0.07136282],
       ...,
       [ 0.91434423, -0.99342483,         nan, ..., -0.21438431,
        -0.89672592, -0.07136282],
       [ 0.81596486, -1.23773878,         nan, ..., -0.21438431,
        -0.89672592, -0.07136282],
       [ 0.91434423,  0.71677286,         nan, ..., -0.21438431,
        -0.89672592, -0.07136282]])

In [15]:
print(reviews.columns)

Index(['Clothing ID', 'Age', 'Recommended IND', 'Rating', 'Bottoms', 'Dresses',
       'Intimate', 'Jackets', 'Tops', 'Trend'],
      dtype='object')
