In [118]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

In [119]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

In [120]:
DATA = Path("../../data")

In [121]:
train_data = pd.read_csv(DATA / "train_outlier_preprocessing.csv")
test_data = pd.read_csv(DATA / "test_outlier_preprocessing.csv")

In [122]:
del train_data["id"]
del test_data["id"]

In [123]:
del train_data["veil-color"]
del train_data["stem-root"]
del train_data["spore-print-color"]
del train_data["veil-type"]

In [124]:
del test_data["veil-color"]
del test_data["stem-root"]
del test_data["spore-print-color"]
del test_data["veil-type"]

In [125]:
train_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,e,8.80,f,s,u,f,a,c,w,4.51,15.39,,w,f,f,d,a
1,p,4.51,x,,,f,a,c,n,4.79,6.48,y,o,t,z,d,w
2,e,6.94,f,s,b,f,,c,w,6.85,9.93,s,n,f,f,l,w
3,e,3.88,f,y,g,f,,,g,4.16,6.53,,w,f,f,d,u
4,e,5.85,x,,w,f,d,,w,3.37,8.36,,w,f,f,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,e,9.29,f,,n,t,,,w,12.14,18.81,,w,t,,d,u
3116941,e,10.88,s,,w,t,d,c,p,6.65,26.97,,w,f,f,d,u
3116942,p,7.82,x,,e,f,a,,w,9.51,11.06,,y,t,z,d,a
3116943,e,9.45,,,n,t,,,p,9.13,17.77,y,w,t,p,d,u


In [126]:
test_data

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,8.64,x,,n,t,,,w,11.13,17.12,,w,t,,d,a
1,6.90,,,,f,,c,y,1.27,10.75,,n,f,f,d,a
2,2.00,b,g,n,f,,c,n,6.18,3.14,,n,f,f,d,s
3,3.47,x,,n,f,,c,n,4.98,8.51,,w,t,z,d,u
4,6.17,x,,y,f,,,y,6.73,13.70,,y,t,,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,0.88,x,g,w,f,a,d,w,2.67,1.35,,e,f,f,d,u
2077960,3.12,x,s,w,f,d,c,w,2.69,7.38,,w,f,f,g,a
2077961,5.73,x,,e,f,a,,w,6.16,9.74,,y,t,z,d,a
2077962,5.03,b,g,n,f,a,d,g,6.00,3.46,s,g,f,f,d,a


In [127]:
# IQR 방법으로 아웃라이어 처리

def outlier_remove_iqr(data, column,threshold=1.5):
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3-q1
    
    lower = q1 - (threshold * iqr)
    upper = q3 + (threshold * iqr)

    data.loc[(data[column] > upper) | (data[column] < lower), column] = np.nan
    return data[column]

In [128]:
train_data["cap-diameter"] = outlier_remove_iqr(data=train_data, column="cap-diameter")
train_data["stem-width"] = outlier_remove_iqr(data=train_data, column="stem-width")
train_data["stem-height"] = outlier_remove_iqr(data=train_data, column="stem-height")

In [129]:
train_data["cap-diameter_log1p"] = np.log1p(train_data["cap-diameter"])
train_data["stem-width_log1p"] = np.log1p(train_data["stem-width"])
train_data["stem-height_log1p"] = np.log1p(train_data["stem-height"])

In [130]:
test_data["cap-diameter_log1p"] = np.log1p(test_data["cap-diameter"])
test_data["stem-width_log1p"] = np.log1p(test_data["stem-width"])
test_data["stem-height_log1p"] = np.log1p(test_data["stem-height"])

In [131]:
train_data = train_data.dropna()

In [132]:
del train_data["does-bruise-or-bleed"]
del test_data["does-bruise-or-bleed"]

In [133]:
del train_data["has-ring"]
del test_data["has-ring"]

In [134]:
del train_data["habitat"]
del test_data["habitat"]

In [135]:
del train_data["ring-type"]
del test_data["ring-type"]

In [136]:
del train_data["cap-surface"]
del test_data["cap-surface"]

In [137]:
del train_data["stem-surface"]
del test_data["stem-surface"]

In [162]:
del train_data["gill-spacing"]
del test_data["gill-spacing"]

In [139]:
train_data.loc[:, "class"] = np.where(train_data["class"]=="e", 0, 1) # 0이 독성 없음, 1이 독성 있음

In [142]:
train_data.loc[:, "cap-shape_is_x"] = np.where(train_data["cap-shape"]!="x", 0, 1)
test_data.loc[:, "cap-shape_is_x"] = np.where(test_data["cap-shape"]!="x", 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["cap-shape_is_x"] = np.where(train_data["cap-shape"]!="x", 0, 1)


In [157]:
del train_data["cap-shape"]
del test_data["cap-shape"]

In [143]:
train_data.loc[:, "gill-attachment_is_d"] = np.where(train_data["gill-attachment"]!='d', 0, 1)
test_data.loc[:, "gill-attachment_is_d"] = np.where(test_data["gill-attachment"]!='d', 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, "gill-attachment_is_d"] = np.where(train_data["gill-attachment"]!='d', 0, 1)


In [159]:
del train_data["gill-attachment"]
del test_data["gill-attachment"]

In [147]:
train_data.loc[:, "gill-color_is_n"] = np.where(train_data["gill-color"] != 'n', 0 ,1)
test_data.loc[:, "gill-color_is_n"] = np.where(test_data["gill-color"] != 'n', 0 ,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, "gill-color_is_n"] = np.where(train_data["gill-color"] != 'n', 0 ,1)


In [160]:
del train_data["gill-color"]
del test_data["gill-color"]

In [155]:
train_data.loc[:, "stem-color"] = np.where(train_data["stem-color"] != 'n', 0, 1)
test_data.loc[:, "stem-color"] = np.where(test_data["stem-color"] != 'n', 0, 1)

In [148]:
def cap_color_encoding(data):
    if data == 'n':
        return 1
    elif data == 'g':
        return 0
    elif data == 'y':
        return 2
    else:
        return -1

In [151]:
train_data["cap-color"] = train_data["cap-color"].apply(cap_color_encoding)
test_data["cap-color"] = test_data["cap-color"].apply(cap_color_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["cap-color"] = train_data["cap-color"].apply(cap_color_encoding)


In [164]:
def season_encoding(data):
    if data == 's' or data == 'w':
        return 0
    else:
        return 1

In [166]:
train_data["season"] = train_data["season"].apply(season_encoding)
test_data["season"] = test_data["season"].apply(season_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["season"] = train_data["season"].apply(season_encoding)


In [168]:
train_data

Unnamed: 0,class,cap-diameter,cap-color,stem-height,stem-width,stem-color,season,cap-diameter_log1p,stem-width_log1p,stem-height_log1p,cap-shape_is_x,gill-attachment_is_d,gill-color_is_n
12,0,2.74,1,4.33,2.57,1,0,1.319086,1.272566,1.673351,0,0,0
25,0,3.64,0,5.35,3.56,1,1,1.534714,1.517323,1.848455,0,0,0
148,0,3.62,0,4.22,3.11,1,1,1.530395,1.413423,1.652497,0,0,0
154,0,5.09,0,5.50,3.19,1,0,1.806648,1.432701,1.871802,0,0,0
169,1,1.73,2,2.58,2.81,1,1,1.004302,1.337629,1.275363,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116376,0,4.18,0,6.03,3.81,1,0,1.644805,1.570697,1.950187,0,0,0
3116423,1,1.63,1,2.33,2.69,1,1,0.966984,1.305626,1.202972,1,1,0
3116440,1,1.41,2,2.87,2.51,1,1,0.879627,1.255616,1.353255,1,1,1
3116660,0,4.38,0,4.84,2.91,0,1,1.682688,1.363537,1.764731,0,0,0


In [170]:
train_data.to_csv(DATA / "train_value_preprocessing_01.csv", index=False)
test_data.to_csv(DATA / "test_value_preprocessing_01.csv", index=False)