<a href="https://colab.research.google.com/github/doremococo/python-bootcamp/blob/main/chapter3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np


df = pd.read_csv("/content/adult_income_dataset.csv")

# 列名を取得
print(df.columns)

# 3-01 特定列のユニークな行を出力
df["occupation"].unique()

# ユニークな値がいくつあるか数える
df["occupation"].nunique()

# その列の各値がどれくらいの数出現しているかを確認 .value_counts()
df["workclass"].value_counts()

# 3-02 値の置換 .replace("検索文字", "置換後の文字")
new_df = df.copy()
unique_education = df["education"].value_counts()
print(f"old:  {unique_education}")

new_df["education"] = new_df["education"].replace("Bachelors", "Bachelor degree")
# new_unique_education = new_df["education"].value_counts()
# print(f"new:  {new_unique_education}")

# 色々な置換方法を試す
# replace(dict)
dict_rpls = new_df[["capital-loss", "capital-gain"]].replace({0: 1111, 2174: 9999})
# print(dict_rpls)


Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')
old:  education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64


In [2]:
new_df = df.copy()
new_df["capital-diff"] = df["capital-gain"] - df["capital-loss"]
new_df[["capital-gain", "capital-loss", "capital-diff"]].head()

Unnamed: 0,capital-gain,capital-loss,capital-diff
0,2174,0,2174
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [3]:
from sklearn.preprocessing import StandardScaler

df_304 = df.copy()


# 3-04 列の標準化
scaler = StandardScaler()
df_304[["scl-age", "scl-cg"]] = scaler.fit_transform(df_304[["age","capital-gain"]])
print(df_304["age"].mean())
df_304[["scl-age", "age"]].head()

38.58164675532078


Unnamed: 0,scl-age,age
0,0.030671,39
1,0.837109,50
2,-0.042642,38
3,1.057047,53
4,-0.775768,28


In [4]:
# 3-05 外れ値を検出する
# new_df の hour-per-week列で外れ値を四分位範囲で検出し、その数を出力
# 四分位範囲(IQR) = 3Q - 1Q

new_df = df.copy()

Q1 = new_df["hours-per-week"].quantile(0.25)
Q3 = new_df["hours-per-week"].quantile(0.75)

IQR = Q3 - Q1
print(IQR)

# 経験則として、1.5*IQR が外れ値までの距離
outer_Q1 = Q1 - 1.5*IQR
outer_Q3 = Q3 + 1.5*IQR

pre_count_sum = ((new_df["hours-per-week"] > outer_Q1) & (new_df["hours-per-week"] < outer_Q3)).sum()
print("pre:  ", pre_count_sum)

# 外れ値の数をかぞえる
count_outlnier = ((new_df["hours-per-week"] < outer_Q1) | (new_df["hours-per-week"] > outer_Q3)).sum()
print("pre_out:  ", count_outlnier)

# 外れ値にある値を、Q1もしくはQ3に置換する
new_df["hours-per-week"] = np.where(new_df["hours-per-week"] < outer_Q1 , Q1, new_df["hours-per-week"])
new_df["hours-per-week"] = np.where(new_df["hours-per-week"] > outer_Q3 , Q3, new_df["hours-per-week"])

# 外れ値の数をかぞえる
count_outliner = ((new_df["hours-per-week"] < outer_Q1) | (new_df["hours-per-week"] > outer_Q3)).sum()
print("fix_out:  ", count_outliner)

new_df["hours-per-week"].head()

# それをさらに、pandas の clip によって置換する
outer_Q1 = 45
outer_Q3 = 46
new_df["hours-per-week"] = new_df["hours-per-week"].clip(lower = outer_Q1, upper = outer_Q3)
new_df["hours-per-week"].head(10)


5.0
pre:   23553
pre_out:   9008
fix_out:   0


Unnamed: 0,hours-per-week
0,45.0
1,45.0
2,45.0
3,45.0
4,45.0
5,45.0
6,45.0
7,45.0
8,46.0
9,45.0


In [5]:
# 3-06 欠損値を特定の値で埋める
new_df = df.copy()

# occupation 列の欠損値の数を出力
print("pre null:  ", new_df["occupation"].isnull().sum())

# occupation列の欠損値を特定の値で埋める

# 試しに np.where でやっても出来た
new_df["occupation"] = np.where(new_df["occupation"].isnull(), "AAAAA", new_df["occupation"])
print("test np null -> AAAAA:  ", (new_df["occupation"] == "AAAAA").sum())
new_df["occupation"] = np.where(new_df["occupation"] == "AAAAA", np.nan , new_df["occupation"])

# 基本的には pandas の fillna("") で欠損値を置換する
new_df["occupation"] = new_df["occupation"].fillna("BBBBB")
print("fillna(BBBBB):  ", (new_df["occupation"] == "BBBBB").sum())

pre null:   1843
test np null -> AAAAA:   1843
fillna(BBBBB):   1843


In [6]:
# 3-07 データの正規化
# 0-1の間で正規化を行う。対象["hours-per-week"]
new_df = df.copy()

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
new_df["nom-hpw"] = scaler.fit_transform(new_df[["hours-per-week"]])
new_df["nom-hpw"]

Unnamed: 0,nom-hpw
0,0.397959
1,0.122449
2,0.397959
3,0.397959
4,0.397959
...,...
32556,0.377551
32557,0.397959
32558,0.397959
32559,0.193878
