# 5장 판다스 3편

## 주요 내용

`Series`와 `DataFrame` 객체로부터 기초 통계 자료를 추출하는 방식을 다룬다.

* 합, 누적합
* 상관관계, 공분산
* 중복값 처리

## 기본 설정

`pandas` 라이브러리는 보통 `pd` 라는 별칭으로 사용된다.

In [1]:
import pandas as pd
import numpy as np

랜덤 시드, 어레이 내부에 사용되는 부동소수점 정확도, 도표 크기 지정 옵션 등은 이전과 동일하다.

In [2]:
# Fix NumPy's global random seed so that random results are reproducible.
np.random.seed(12345)
# Print arrays with 4 digits of precision and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

import matplotlib.pyplot as plt
# Use a 10 x 6 default size for all figures.
plt.rc('figure', figsize=(10, 6))

`Series`와 `DataFrame`을 표로 보여줄 때 출력되는 최대 행의 수를 20으로 지정한다. 
기본 값은 60이다.

In [3]:
# Remember the current row limit for displayed tables (60 by default) ...
PREVIOUS_MAX_ROWS = pd.get_option('display.max_rows')
# ... and show at most 20 rows from now on.
pd.set_option('display.max_rows', 20)

## 5.3 기초 통계 함수

In [4]:
# Small example frame with missing values: rows 'a'-'d', columns 'one'/'two'.
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [5]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [6]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [7]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [8]:
df.idxmax()

one    b
two    d
dtype: object

In [9]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [10]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [11]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

### 5.3.1 상관관계와 공분산

아래 예제에서 주가 데이터를 인터넷에서 직접 내려받으려면 `pandas-datareader` 패키지가 필요하며, 다음 명령으로 설치할 수 있다.

```
conda install pandas-datareader
```

In [12]:
# Load previously saved price and volume tables from local pickle files.
# NOTE(review): read_pickle should only ever be used on trusted files.
price = pd.read_pickle('examples/yahoo_price.pkl')
volume = pd.read_pickle('examples/yahoo_volume.pkl')

# NOTE(review): everything below rebinds `price` and `volume`, replacing the
# tables loaded from the pickle files above. Presumably this is meant as an
# alternative way of obtaining the data (it needs pandas-datareader and
# network access) — confirm which of the two variants should actually run.
import pandas_datareader.data as web
# One table per ticker symbol, fetched through pandas-datareader.
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

# Collect the 'Adj Close' column of every ticker into one frame ...
price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})
# ... and likewise the 'Volume' column.
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})

In [13]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [14]:
returns['MSFT'].corr(returns['IBM'])
returns['MSFT'].cov(returns['IBM'])

8.870655479703549e-05

In [15]:
returns.MSFT.corr(returns.IBM)

0.4997636114415116

In [16]:
returns.corr()
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [17]:
returns.corrwith(returns.IBM)

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [18]:
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

### 5.3.2 중복값 처리

In [19]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [20]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [21]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [22]:
# Count the occurrences of each value without sorting the result by frequency.
# The top-level function `pd.value_counts` is deprecated in recent pandas
# releases, so the raw values are wrapped in a Series and its `value_counts`
# method is called instead.
pd.Series(obj.values).value_counts(sort=False)

b    2
a    3
d    1
c    3
dtype: int64

In [23]:
obj
mask = obj.isin(['b', 'c'])
mask
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [24]:
# Values to look up, and the distinct values that define the positions.
to_match = pd.Series(list('cabbca'))
unique_vals = pd.Series(list('cba'))
# Position of each `to_match` element within `unique_vals`.
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [25]:
# Small integer table with the columns Qu1-Qu3 and five rows.
data = pd.DataFrame(
    [[1, 2, 1],
     [3, 3, 5],
     [4, 1, 2],
     [3, 2, 4],
     [4, 3, 4]],
    columns=['Qu1', 'Qu2', 'Qu3'],
)
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [26]:
# For every column, count how often each value occurs; a value that never
# appears in a column comes out as NaN and is replaced by 0.
# `pd.Series.value_counts` is used because the top-level function
# `pd.value_counts` is deprecated in recent pandas releases.
result = data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


## 연습문제

아래 코드는 인터넷 데이터 저장소로부터 아이리스(붓꽃) 데이터(`iris.data`)를 
2차원 넘파이 어레이로 불러온다.

In [27]:
# Load the iris data set from the UCI repository.
# dtype='str' reads every field, numeric or not, as a string so that the
# measurements and the species name fit into one 2D array.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='str')

`iris.data` 파일에는 아래 형식의 데이터가 150개 들어 있다. 

```python
5.1,3.5,1.4,0.2,Iris-setosa
```

하나의 데이터에 사용된 값들은 하나의 아이리스(붓꽃)에 대한 꽃잎, 꽃받침과 관련된 특성(features)과 품종을 나타내며,
보다 구체적으로 아래 순서를 따른다.

```
꽃받침 길이, 꽃받침 너비, 꽃잎 길이, 꽃잎 너비, 품종
```

In [28]:
type(iris)

numpy.ndarray

In [29]:
iris.shape

(150, 5)

길이와 너비를 저장하는 특성들은 숫자로 저장되어 있었지만 위 코드는 문자열로 저장된 품종 특성과의 자료형을 통일시키기 위해
모두 문자열 자료형으로 불러왔다.
처음 5개 데이터를 확인하면 다음과 같다.

__참고:__ `'<U15'`는 길이가 최대 15인 유니코드 문자열 자료형을 나타낸다.

In [30]:
iris[:5]

array([['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
       ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
       ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
       ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
       ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa']], dtype='<U15')

수치형 데이터와 품종 데이터를 분리해서 각각 (150,4), (150,) 모양의 어레이를 생성하자.
이때 수치형 데이터는 `'f8'`, 즉 `'float64'` 자료형을 사용하도록 한다.

In [31]:
iris_features = iris[:,:4].astype('f8')
iris_labels = iris[:, 4]

판다스의 데이터프레임으로 형변환한다.
이때 각 열의 이름은 기존 데이터의 특성을 반영하도록 지정한다.

In [32]:
columns = ['꽃받침길이', '꽃받침너비', '꽃잎길이', '꽃잎너비']
iris_features = pd.DataFrame(iris_features, columns=columns)
iris_features[:5]

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


레이블은 판다스의 시리즈로 변환한다.

In [33]:
iris_labels = pd.Series(iris_labels)
iris_labels

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Length: 150, dtype: object

150개의 데이터는 아래 세 개의 품종으로 구분되며, 각각 50개씩 아래 언급된 순서대로 구분되어 있다.

```
'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'
```

즉, 0번, 50번, 100번부터 각 품종의 데이터가 시작된다.
시리즈는 넘파이 어레이와는 달리 인덱스를 항상 함께 보여준다.

In [34]:
iris_labels[::50]

0          Iris-setosa
50     Iris-versicolor
100     Iris-virginica
dtype: object

In [35]:
iris_labels[:5]

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
dtype: object

In [36]:
iris_labels[50:55]

50    Iris-versicolor
51    Iris-versicolor
52    Iris-versicolor
53    Iris-versicolor
54    Iris-versicolor
dtype: object

In [37]:
iris_labels[100:105]

100    Iris-virginica
101    Iris-virginica
102    Iris-virginica
103    Iris-virginica
104    Iris-virginica
dtype: object

__연습 1.__ 꽃잎 길이(2번 열)가 1.5보다 크거나 꽃받침 길이(0번 열)가 5.0보다 작은 데이터만 추출하라.

In [38]:
mask = (iris_features.꽃잎길이>1.5) | (iris_features.꽃받침길이<5.0)
mask

0      False
1       True
2       True
3       True
4      False
       ...  
145     True
146     True
147     True
148     True
149     True
Length: 150, dtype: bool

조건을 만족하는 샘플의 수는 아래와 같다.

In [39]:
mask.sum()

129

조건을 만족하는 샘플들은 다음과 같다.

In [40]:
iris_features[mask]

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


__연습 2.__ 꽃받침 길이(0번 열)와 꽃잎 길이(2번 열) 사이의 피어슨 상관계수를 계산하라.

힌트: 넘파이의 적절한 함수를 활용한다. 상관계수에 대한 설명은 [위키백과: 상관분석](https://ko.wikipedia.org/wiki/상관_분석)을 참고한다.

데이터프레임의 `corr()` 메서드는 모든 특성들 사이의 피어슨 상관계수로 이루어진 데이터프레임을 반환한다.

In [41]:
iris_corr = iris_features.corr()
iris_corr

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비
꽃받침길이,1.0,-0.109369,0.871754,0.817954
꽃받침너비,-0.109369,1.0,-0.420516,-0.356544
꽃잎길이,0.871754,-0.420516,1.0,0.962757
꽃잎너비,0.817954,-0.356544,0.962757,1.0


따라서 '꽃받침길이'와 다른 특성들 사이의 상관계수를 내림차순으로 정렬하면 다음과 같다.

In [42]:
iris_corr['꽃받침길이'].sort_values(ascending=False)

꽃받침길이    1.000000
꽃잎길이     0.871754
꽃잎너비     0.817954
꽃받침너비   -0.109369
Name: 꽃받침길이, dtype: float64

따라서 '꽃받침길이'와 '꽃잎길이' 사이의 상관계수는 다음과 같다.

In [43]:
iris_corr['꽃받침길이']['꽃잎길이']

0.8717541573048719

__연습 3.__ 아래 식으로 계산된 값을 갖는 새로운 열(column)이 추가된 데이터프레임 `iris_features_added`를 생성하라.
열의 이름은 '길이특성1'로 지정한다.

$$\frac{\text{원주율} \times \text{꽃잎길이} \times \text{꽃받침길이}^2}{3} $$

In [44]:
# Exercise 3: (pi * petal length * sepal length**2) / 3 for every sample.
# NOTE: 3.14 is used here as an approximation of pi.

scaled = (3.14 * iris_features['꽃잎길이'] * iris_features['꽃받침길이']**2) / 3
# Give the resulting Series the name '길이특성1'.
length_property1 = pd.Series(scaled, name='길이특성1')

In [45]:
length_property1

0       38.113320
1       35.182653
2       30.057127
3       33.221200
4       36.633333
          ...    
145    244.321307
146    207.711000
147    229.952667
148    217.262880
149    185.815780
Name: 길이특성1, Length: 150, dtype: float64

In [46]:
iris_features_added = pd.concat([iris_features, length_property1], axis=1)

assert iris_features_added.shape == (150, 5)
iris_features_added

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비,길이특성1
0,5.1,3.5,1.4,0.2,38.113320
1,4.9,3.0,1.4,0.2,35.182653
2,4.7,3.2,1.3,0.2,30.057127
3,4.6,3.1,1.5,0.2,33.221200
4,5.0,3.6,1.4,0.2,36.633333
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,244.321307
146,6.3,2.5,5.0,1.9,207.711000
147,6.5,3.0,5.2,2.0,229.952667
148,6.2,3.4,5.4,2.3,217.262880


__연습 4.__ `Iris-versicolor` 품종에 해당하는 데이터만 `iris_features`로부터 추출하라. 

In [47]:
# Boolean mask that is True exactly for the samples labeled 'Iris-versicolor'.

mask = iris_labels == 'Iris-versicolor'
mask

0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146    False
147    False
148    False
149    False
Length: 150, dtype: bool

In [48]:
mask.sum()

50

In [49]:
iris_versicolor = iris_features[mask]
iris_versicolor.head()

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비
50,7.0,3.2,4.7,1.4
51,6.4,3.2,4.5,1.5
52,6.9,3.1,4.9,1.5
53,5.5,2.3,4.0,1.3
54,6.5,2.8,4.6,1.5


In [50]:
iris_versicolor.tail()

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비
95,5.7,3.0,4.2,1.2
96,5.7,2.9,4.2,1.3
97,6.2,2.9,4.3,1.3
98,5.1,2.5,3.0,1.1
99,5.7,2.8,4.1,1.3


__연습 5.__ 꽃받침 길이(0번 열)의 평균값(mean), 중앙값(median), 표준편차(standard deviation)를 구하라.

__참고:__ 데이터프레임의 메서드는 기본적으로 열(columns)에 대한 속성을 다룬다.
즉, `axis=0`을 기본 축으로 사용한다.

In [51]:
iris_mean = iris_features.mean()
iris_mean

꽃받침길이    5.843333
꽃받침너비    3.054000
꽃잎길이     3.758667
꽃잎너비     1.198667
dtype: float64

In [52]:
iris_mean = iris_features.mean(axis=0)
iris_mean

꽃받침길이    5.843333
꽃받침너비    3.054000
꽃잎길이     3.758667
꽃잎너비     1.198667
dtype: float64

In [53]:
iris_median = iris_features.median()
iris_median

꽃받침길이    5.80
꽃받침너비    3.00
꽃잎길이     4.35
꽃잎너비     1.30
dtype: float64

In [54]:
iris_std = iris_features.std()
iris_std

꽃받침길이    0.828066
꽃받침너비    0.433594
꽃잎길이     1.764420
꽃잎너비     0.763161
dtype: float64

__연습 6.__ 세 개의 품종 별 꽃받침 너비(1번 열)의 평균값을 계산하여 아래 어레이와 동일한 모양을 갖는 
데이터프레임 `iris_kind_sepal_length`를 생성하라.

```
[['Iris-setosa', 3.418],
 ['Iris-versicolor', 2.770],
 ['Iris-virginica', 2.974]]
```

In [55]:
# One Boolean mask per species, then the mean of the sepal width column
# ('꽃받침너비') over the rows selected by each mask.

mask1 = iris_labels == 'Iris-setosa'
mask2 = iris_labels == 'Iris-versicolor'
mask3 = iris_labels == 'Iris-virginica'
mean1 = iris_features[mask1].mean()['꽃받침너비']
mean2 = iris_features[mask2].mean()['꽃받침너비']
mean3 = iris_features[mask3].mean()['꽃받침너비']
# Display the versicolor mean as a spot check.
mean2

2.7700000000000005

In [56]:
iris_kind_sepal_length = pd.DataFrame({'kinds': ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], 
                                       'mean_sepal_width':[mean1, mean2, mean3]})

iris_kind_sepal_length

Unnamed: 0,kinds,mean_sepal_width
0,Iris-setosa,3.418
1,Iris-versicolor,2.77
2,Iris-virginica,2.974


아래와 같이 모든 과정을 자동화할 수도 있다.

In [57]:
# Automated version: one [species, mean sepal width] row per distinct label,
# in alphabetical order of the species names.
kinds = sorted(set(iris_labels))

rows = [[kind, iris_features[iris_labels == kind].mean()['꽃받침너비']]
        for kind in kinds]

# Bind the result to the name the exercise asks for, so that
# `iris_kind_sepal_length` ends up as a DataFrame rather than a list of rows.
# NOTE(review): the second column holds the mean sepal *width* ('꽃받침너비')
# although its label reads '꽃받침길이_평균' (sepal length); the label is kept
# unchanged to match the recorded output — consider renaming it.
iris_kind_sepal_length = pd.DataFrame(rows, columns=['아이리스_품종', '꽃받침길이_평균'])
iris_kind_sepal_length

Unnamed: 0,아이리스_품종,꽃받침길이_평균
0,Iris-setosa,3.418
1,Iris-versicolor,2.77
2,Iris-virginica,2.974


__연습 7.__ 꽃잎 너비(3번 열)에 사용된 값을 모두 0과 1사이의 값으로 변환하라. 

힌트: 하나의 특성(여기서는 꽃잎 너비)에 속하는 값을 모두 0과 1 사이의 값으로 변환하는 작업을 정규화(normalization)라 한다.
정규화에 대한 설명은 [정규화/표준화](https://rucrazia.tistory.com/90)를 참고하라.

In [58]:
iris_features[:5]

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


데이터프레임의 메서드는 기본적으로 축을 0으로 지정해서 열 단위로 작동한다.

In [59]:
iris_features.min()

꽃받침길이    4.3
꽃받침너비    2.0
꽃잎길이     1.0
꽃잎너비     0.1
dtype: float64

In [60]:
iris_features.min(axis=0)

꽃받침길이    4.3
꽃받침너비    2.0
꽃잎길이     1.0
꽃잎너비     0.1
dtype: float64

In [61]:
# Min-max scaling per column: (value - column min) / (column max - column min).
iris_features_normalized = (
    iris_features
    .sub(iris_features.min())
    .div(iris_features.max() - iris_features.min())
)

iris_features_normalized

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


__연습 8.__ `iris_features`에 사용된 모든 값을 특성 별로 표준화(standardization)하라. 

힌트: 표준화에 대한 설명은 [정규화/표준화](https://rucrazia.tistory.com/90)를 참고하라.

In [62]:
iris_features.mean()

꽃받침길이    5.843333
꽃받침너비    3.054000
꽃잎길이     3.758667
꽃잎너비     1.198667
dtype: float64

In [63]:
iris_features.std()

꽃받침길이    0.828066
꽃받침너비    0.433594
꽃잎길이     1.764420
꽃잎너비     0.763161
dtype: float64

In [64]:
# Standardize every feature: subtract the column mean and divide by the
# column standard deviation.

iris_features_standardized = (iris_features - iris_features.mean()) / iris_features.std()

iris_features_standardized[:5]

Unnamed: 0,꽃받침길이,꽃받침너비,꽃잎길이,꽃잎너비
0,-0.897674,1.028611,-1.336794,-1.308593
1,-1.1392,-0.12454,-1.336794,-1.308593
2,-1.380727,0.33672,-1.39347,-1.308593
3,-1.50149,0.10609,-1.280118,-1.308593
4,-1.018437,1.259242,-1.336794,-1.308593


## 과제

In [65]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import only works with older scikit-learn versions — confirm
# the target environment.
from sklearn.datasets import load_boston
# X is the whole dataset container (data, target, feature names, ...),
# not just the feature matrix.
X = load_boston()

In [66]:
type(X)

sklearn.utils.Bunch

In [17]:
X.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [21]:
X.data

array([[  0.0063,  18.    ,   2.31  , ...,  15.3   , 396.9   ,   4.98  ],
       [  0.0273,   0.    ,   7.07  , ...,  17.8   , 396.9   ,   9.14  ],
       [  0.0273,   0.    ,   7.07  , ...,  17.8   , 392.83  ,   4.03  ],
       ...,
       [  0.0608,   0.    ,  11.93  , ...,  21.    , 396.9   ,   5.64  ],
       [  0.1096,   0.    ,  11.93  , ...,  21.    , 393.45  ,   6.48  ],
       [  0.0474,   0.    ,  11.93  , ...,  21.    , 396.9   ,   7.88  ]])

In [22]:
X.target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [25]:
print(X.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

| 특성 | 의미 |
|:------|:---------|
| CRIM  | 지역별 1인당 범죄율 |
| ZN    | 25,000 평방 피트 이상의 주거 지역 비율 |
| INDUS | 지역별 비 소매 사업 에이커(acre) 비율 |
| CHAS  | Charles River 더미 변수(지역이 강 경계에 닿으면 1, 아니면 0) |
| NOX   | 산화 질소 농도(1000만분 율) |
| RM    | 주택 당 평균 방 수 |
| AGE   | 소유주가 살고 있는 1940년 이전에 지어진 건물 비율 |
| DIS   | 보스턴 고용 센터 다섯 곳 까지의 가중 거리 |
| RAD   | 방사형 고속도로 접근성 지수 |
| TAX   | 1만달러당 전체 가치 재산 세율 |
| PTRATIO | 지역별 학생-교사 비율 |
| B     | 1000(Bk - 0.63)^2 (Bk 지역별 흑인 비율) |
| LSTAT | 지역별 낮은 지위 인구 비율 |
| MEDV  | 소유주 거주 주택의 중간 가격(단위: 1,000달러) |

In [27]:
X.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [29]:
X.filename

'C:\\Users\\gslee\\anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'

In [31]:
boston = X.data
boston

array([[  0.0063,  18.    ,   2.31  , ...,  15.3   , 396.9   ,   4.98  ],
       [  0.0273,   0.    ,   7.07  , ...,  17.8   , 396.9   ,   9.14  ],
       [  0.0273,   0.    ,   7.07  , ...,  17.8   , 392.83  ,   4.03  ],
       ...,
       [  0.0608,   0.    ,  11.93  , ...,  21.    , 396.9   ,   5.64  ],
       [  0.1096,   0.    ,  11.93  , ...,  21.    , 393.45  ,   6.48  ],
       [  0.0474,   0.    ,  11.93  , ...,  21.    , 396.9   ,   7.88  ]])

In [33]:
boston = pd.DataFrame(X.data, columns=X.feature_names)
boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [34]:
boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


In [35]:
boston.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [79]:
# Mark the 'RM' values of rows 100-104 as missing.
# A single .loc call writes into `boston` itself. The chained form
# boston['RM'][100:105] = ... assigns through an intermediate Series, which
# triggers SettingWithCopyWarning and no longer modifies the frame when
# pandas Copy-on-Write is enabled.
# .loc slices by label and includes the end label, hence 100:104.
boston.loc[100:104, 'RM'] = np.nan

In [80]:
boston[97:107]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
97,0.12083,0.0,2.89,0.0,0.445,8.069,76.0,3.4952,2.0,276.0,18.0,396.9,4.21
98,0.08187,0.0,2.89,0.0,0.445,7.82,36.9,3.4952,2.0,276.0,18.0,393.53,3.57
99,0.0686,0.0,2.89,0.0,0.445,7.416,62.5,3.4952,2.0,276.0,18.0,396.9,6.19
100,0.14866,0.0,8.56,0.0,0.52,,79.9,2.7778,5.0,384.0,20.9,394.76,9.42
101,0.11432,0.0,8.56,0.0,0.52,,71.3,2.8561,5.0,384.0,20.9,395.58,7.67
102,0.22876,0.0,8.56,0.0,0.52,,85.4,2.7147,5.0,384.0,20.9,70.8,10.63
103,0.21161,0.0,8.56,0.0,0.52,,87.4,2.7147,5.0,384.0,20.9,394.47,13.44
104,0.1396,0.0,8.56,0.0,0.52,,90.0,2.421,5.0,384.0,20.9,392.69,12.33
105,0.13262,0.0,8.56,0.0,0.52,5.851,96.7,2.1069,5.0,384.0,20.9,394.05,16.47
106,0.1712,0.0,8.56,0.0,0.52,5.836,91.9,2.211,5.0,384.0,20.9,395.67,18.66


In [81]:
boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


In [82]:
boston.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,501.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.28305,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.705422,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.884,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.208,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.619,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [83]:
boston.shape

(506, 13)