# 빌트인 데이터 준비 기능 

* 참고자료: https://aws.amazon.com/ko/blogs/aws/next-generation-sagemaker-notebooks-now-with-built-in-data-preparation-real-time-collaboration-and-notebook-automation/
* dataset: https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews?resource=download


## Install or Upgrade Packages

In [10]:
# Upgrade the `sagemaker` package to the latest release and install JupyterLab
# pinned to version 3 (both are fetched with pip, not from local wheel files).
! pip install --upgrade sagemaker
! pip install jupyterlab==3

[0m

## 데이터 s3 업로드

In [3]:
# Copy the local review CSV to the root of the kaggle-dataset-innovate-demo S3 bucket.
! aws s3 cp 'Womens Clothing E-Commerce Reviews.csv' s3://'kaggle-dataset-innovate-demo'  

upload: ./Womens Clothing E-Commerce Reviews.csv to s3://kaggle-dataset-innovate-demo/Womens Clothing E-Commerce Reviews.csv


## 데이터 읽기


In [11]:
import pandas as pd

In [12]:
# Load the review CSV (relative path, so it is resolved against the working directory).
df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")

## 데이터 샘플링

In [13]:
# Fraction of rows to keep when sampling the dataset (0.2 = 20%).
sample_rate = 0.2

In [14]:
# Keep a random `sample_rate` fraction of the rows. No random_state is passed,
# so the selected rows differ from run to run.
df = df.sample(frac=sample_rate)

## 데이터 탐색 (기존 방법)

* 데이터 확인

In [None]:
df

* 히스토그램 

In [None]:
import numpy as np
import matplotlib.pyplot as plt # plotting

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot the value distribution of each low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 distinct non-null values
    are considered. A column whose first non-null value is numeric is drawn
    as a histogram; any other column is drawn as a bar chart of value counts.

    Args:
        df: pandas DataFrame to explore.
        nGraphShown: Maximum number of columns to plot.
        nGraphPerRow: Number of subplots per row of the figure grid.

    Returns:
        None. The figure is displayed with ``plt.show()``. When no column
        qualifies, or ``nGraphShown`` is less than 1, nothing is drawn.
    """
    nunique = df.nunique()
    # For displaying purposes, keep columns with more than 1 and fewer than 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    columnNames = list(df)
    nShown = min(len(columnNames), nGraphShown)
    if nShown <= 0:
        # Nothing to plot; avoid creating a zero-height figure.
        return
    # Ceiling division. This must be an int because plt.subplot rejects a
    # float row count. Rows are sized from the number of plots actually drawn
    # so the figure has no empty trailing rows.
    nGraphRow = (nShown + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nShown):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # Inspect the first non-null value: a leading NaN in a text column is a
        # float and would otherwise send string data to hist(). The nunique
        # filter above guarantees at least two non-null values per column.
        firstValue = columnDf.dropna().iloc[0]
        if not np.issubdtype(type(firstValue), np.number):
            columnDf.value_counts().plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

# Plot at most 10 of the qualifying columns, 5 subplots per row.
plotPerColumnDistribution(df, 10, 5)

* 데이터 전처리

In [None]:
# Remove the 'Unnamed: 0' column. Re-running this cell raises KeyError once
# the column is gone, because drop() is called with its default error handling.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
df

## 데이터 탐색 (빌트인 데이터 준비 기능 활용)

In [15]:
import sagemaker_datawrangler

In [None]:
# Reload the CSV and draw a fresh random sample. This restores the
# 'Unnamed: 0' column dropped earlier, which the generated code below drops again.
df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")
df = df.sample(frac=sample_rate)

* 데이터 확인

In [16]:
df

       Unnamed: 0  Clothing ID  Age  \
16605       16605         1001   46   
8123         8123          833   48   
12894       12894          863   44   
18630       18630          975   48   
7027         7027          862   69   
...           ...          ...  ...   
12658       12658          875   44   
14099       14099          686   28   
256           256          840   44   
17923       17923          936   31   
11661       11661          964   36   

                                              Title  \
16605                          Go see in the store!   
8123   Very cute but harder to wear than i expected   
12894                    Nice t, but not super soft   
18630                                    The horror   
7027                                  Not impressed   
...                                             ...   
12658                                 Pretty blouse   
14099                  Just what i was looking for!   
256                 Stylish and unde

In [17]:
# Adapted from the pandas code generated by sagemaker_datawrangler.
# Builds output_df from a deep copy of df, leaving df itself untouched.
generic_value = 'Other'  # placeholder written into missing titles

output_df = (
    df.copy(deep=True)
    # Title: replace missing values with the placeholder (warning: Missing values)
    .fillna({'Title': generic_value})
    # Unnamed: 0: drop the column (warning: ID column)
    .drop(columns=['Unnamed: 0'])
    # Review Text: drop rows with a missing value (warning: Missing values)
    .dropna(subset=['Review Text'])
)


In [None]:
# Adapted from the pandas code generated by sagemaker_datawrangler.
# Builds output_df from a deep copy of df, leaving df itself untouched.
generic_value = 'Other'  # placeholder written into missing titles

output_df = (
    df.copy(deep=True)
    # Title: replace missing values with the placeholder (warning: Missing values)
    .fillna({'Title': generic_value})
    # Unnamed: 0: drop the column (warning: ID column)
    .drop(columns=['Unnamed: 0'])
    # Review Text, Division Name: drop rows missing either value (warning: Missing values)
    .dropna(subset=['Review Text', 'Division Name'])
)


In [None]:
# Adapted from the pandas code generated by sagemaker_datawrangler.
# Builds output_df from a deep copy of df, leaving df itself untouched.
generic_value = 'Other'  # placeholder written into missing titles

output_df = (
    df.copy(deep=True)
    # Unnamed: 0: drop the column (warning: ID column)
    .drop(columns=['Unnamed: 0'])
    # Title: replace missing values with the placeholder (warning: Missing values)
    .fillna({'Title': generic_value})
    # Review Text, Division Name: drop rows missing either value (warning: Missing values)
    .dropna(subset=['Review Text', 'Division Name'])
)


In [18]:
output_df.shape

(4540, 10)

In [19]:
output_df

       Clothing ID  Age                                         Title  \
16605         1001   46                          Go see in the store!   
8123           833   48  Very cute but harder to wear than i expected   
12894          863   44                    Nice t, but not super soft   
18630          975   48                                    The horror   
7027           862   69                                 Not impressed   
...            ...  ...                                           ...   
12658          875   44                                 Pretty blouse   
14099          686   28                  Just what i was looking for!   
256            840   44               Stylish and understated elegant   
17923          936   31         Big bold and beautiful but not for me   
11661          964   36                                      Love it!   

                                             Review Text  Rating  \
16605  Online photos do not do this skirt any justice..