In [1]:
import numpy as np
import pandas as pd

# 정렬

In [2]:
# Small integer Series whose labels are deliberately out of alphabetical order.
obj = pd.Series(data=range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [3]:
# Sort the Series by its index labels.
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [4]:
# 2x4 frame with unsorted row and column labels, used to demonstrate index sorting.
df = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)
df

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [5]:
print(df.sort_index()) # sort by row labels (axis=0 is the default)
print(df.sort_index(axis=1)) # sort by column labels
print(df.sort_index(axis=1, ascending=False)) # sort by column labels in descending order

       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
       d  c  b  a
three  0  3  2  1
one    4  7  6  5


In [6]:
obj = pd.Series([3, 6, -2, 1])
# One print call, newline-separated: the Series as created, then its values
# sorted ascending (the default) and sorted descending.
print(
    obj,
    obj.sort_values(),
    obj.sort_values(ascending=False),
    sep="\n",
)

0    3
1    6
2   -2
3    1
dtype: int64
2   -2
3    1
0    3
1    6
dtype: int64
1    6
0    3
3    1
2   -2
dtype: int64


In [7]:
obj = pd.Series([3,np.nan,6,np.nan,-2,1])
print(obj.sort_values()) 
print(obj.sort_values(ascending=False))
# NaN values are placed last whether the sort is ascending or descending.

4   -2.0
5    1.0
0    3.0
2    6.0
1    NaN
3    NaN
dtype: float64
2    6.0
0    3.0
5    1.0
4   -2.0
1    NaN
3    NaN
dtype: float64


In [8]:
# Two-column frame built row by row: each tuple is one (b, a) record.
df = pd.DataFrame(
    data=list(zip([4, 7, -2, 2], [0, 1, 0, 1])),
    columns=['b', 'a'],
)
df

Unnamed: 0,b,a
0,4,0
1,7,1
2,-2,0
3,2,1


In [9]:
# With several columns, the `by=` argument selects which column(s) to sort on.
print(df.sort_values(by='b')) # ascending by column 'b'
print(df.sort_values(by='a')) # ascending by column 'a'
print(df.sort_values(by=['a', 'b'])) # sort by 'a', then order rows with equal 'a' by 'b'
print(df.sort_values(by=['a', 'b'], ascending=False)) # same two keys, both descending

   b  a
2 -2  0
3  2  1
0  4  0
1  7  1
   b  a
0  4  0
2 -2  0
1  7  1
3  2  1
   b  a
2 -2  0
0  4  0
3  2  1
1  7  1
   b  a
1  7  1
3  2  1
0  4  0
2 -2  0


# rank()

In [10]:
obj = pd.Series([7,-3,7,4,2,0,4])
obj.rank() # ranks in ascending order; tied values share the average of the ranks they span (n and n+1 -> n.5)

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [11]:
obj.rank(method="first") # method="first": ties are broken by order of appearance; the earlier occurrence gets the smaller rank number

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [12]:
print(obj.rank(ascending=False)) # rank in descending order (the largest values get the smallest rank numbers)
print(obj.rank(ascending=False, method='first')) # descending ranks, ties broken by order of appearance

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64
0    1.0
1    7.0
2    2.0
3    3.0
4    5.0
5    6.0
6    4.0
dtype: float64


# 인덱스 중복

In [13]:
# Series whose index contains repeated labels ('a' and 'b' each appear twice).
obj = pd.Series(data=range(5), index=list('aabbc'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [14]:
obj.index # returns the Index of the Series `obj`

Index(['a', 'a', 'b', 'b', 'c'], dtype='object')

In [15]:
obj.index.is_unique # checks whether the index labels of obj are all unique

# Common naming prefixes:
# get~ (generally retrieves data)
# set~ (stores data)
# is~  (yields a True/False result)

False

In [16]:
print(type(obj['a']))  # a label that appears more than once selects a Series
obj['a']

<class 'pandas.core.series.Series'>


a    0
a    1
dtype: int64

In [17]:
print(type(obj['c']))  # a label that appears only once selects a scalar
obj['c']

<class 'numpy.int64'>


4

In [18]:
# Random 4x3 frame whose row index deliberately repeats the labels 'a' and 'b'.
# The values come from a locally seeded generator so that Restart & Run All
# reproduces the same numbers every time (global NumPy random state is untouched).
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((4, 3)),
    index=['a', 'a', 'b', 'b'],
)
df

Unnamed: 0,0,1,2
a,0.084414,0.868653,0.353042
a,0.518431,0.407827,-0.549682
b,-0.432023,-0.268414,0.711681
b,0.243372,-2.150253,0.501548


In [19]:
df.loc['b'] # a row label that appears more than once selects a DataFrame

Unnamed: 0,0,1,2
b,-0.432023,-0.268414,0.711681
b,0.243372,-2.150253,0.501548


# 수학/통계 메소드

## sum(), mean()

In [20]:
# Frame with scattered NaN values, used to show how reductions treat missing data.
df = pd.DataFrame(
    data={
        'one': [1.5, 7.0, np.nan, 0.7],
        'two': [np.nan, 4.5, np.nan, -1.5],
    },
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.5,
b,7.0,4.5
c,,
d,0.7,-1.5


In [21]:
df.sum() # per-column sums with NaN values skipped
         # the result is a Series

one    9.2
two    3.0
dtype: float64

In [22]:
df.sum(axis=1) # per-row sums with NaN values skipped (the all-NaN row 'c' sums to 0.0)
               # the result is a Series

a     1.5
b    11.5
c     0.0
d    -0.8
dtype: float64

In [23]:
df.sum(axis=1, skipna=False) # skipna defaults to True
                             # with skipna=False, any NaN in a row makes that row's sum NaN

a     NaN
b    11.5
c     NaN
d    -0.8
dtype: float64

In [24]:
print(df.mean(axis=1)) # even with skipna=True, a row whose values are all NaN yields NaN
print(df.mean(axis=1, skipna=False)) # with skipna=False, any NaN in a row yields NaN for that row

a    1.50
b    5.75
c     NaN
d   -0.40
dtype: float64
a     NaN
b    5.75
c     NaN
d   -0.40
dtype: float64


## idxmax(), idxmin()

In [25]:
df.idxmax() # index label of the maximum value in each column

one    b
two    b
dtype: object

In [26]:
df.idxmin() # index label of the minimum value in each column

one    d
two    d
dtype: object

## cumsum()

In [27]:
df.cumsum() # cumulative sum down each column; NaN entries stay NaN and are skipped in the running total

Unnamed: 0,one,two
a,1.5,
b,8.5,4.5
c,,
d,9.2,3.0


## describe()

In [28]:
# describe: descriptive statistics for the numeric columns, all in one call
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.066667,1.5
std,3.429772,4.242641
min,0.7,-1.5
25%,1.1,0.0
50%,1.5,1.5
75%,4.25,3.0
max,7.0,4.5


In [29]:
# 16 string values: the pattern a, a, b, c repeated four times.
obj = pd.Series(data=4 * list('aabc'))
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [30]:
# describe() also works on string data, but reports a different set of statistics than for numeric data
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## 상관관계, 공분산

In [31]:
#금융,주식(주식가격/시가총액 등) 데이터를 받아올 수 있도록 도와주는 모듈
import pandas_datareader.data as web

In [33]:
# Download daily price history for each ticker from Yahoo Finance (network I/O).
# The date range is pinned to the window shown in the recorded outputs
# (2015-08-24 .. 2020-08-20); without start/end the fetched window depends on
# the day the cell is run, so later cells would not reproduce.
allData = {ticker: web.get_data_yahoo(ticker, start='2015-08-24', end='2020-08-20')
           for ticker in ['AAPL', 'IBM', 'MSFT', "GOOG"]}

In [35]:
allData.keys()

dict_keys(['AAPL', 'IBM', 'MSFT', 'GOOG'])

In [38]:
allData.items()

dict_items([('AAPL',                   High         Low        Open       Close       Volume  \
Date                                                                      
2015-08-24  108.800003   92.000000   94.870003  103.120003  162206300.0   
2015-08-25  111.110001  103.500000  111.110001  103.739998  103601600.0   
2015-08-26  109.889999  105.050003  107.089996  109.690002   96774600.0   
2015-08-27  113.239998  110.019997  112.230003  112.919998   84616100.0   
2015-08-28  113.309998  111.540001  112.169998  113.290001   53164400.0   
...                ...         ...         ...         ...          ...   
2020-08-14  460.000000  452.179993  459.320007  459.630005   41391300.0   
2020-08-17  464.350006  455.850006  464.250000  458.429993   29890400.0   
2020-08-18  464.000000  456.029999  457.410004  462.250000   26408400.0   
2020-08-19  468.649994  462.440002  463.929993  462.829987   36283800.0   
2020-08-20  473.562012  462.933502  463.000000  473.100006   31726797.0   

   

In [45]:
# Collect each company's adjusted close ('Adj Close') into one DataFrame,
# one column per ticker.
price = pd.DataFrame({ticker: data['Adj Close'] for ticker, data in allData.items()})
# info() prints its report itself and returns None, so it is called bare;
# wrapping it in print() only appends a stray "None" line to the output.
price.info()
print(price.describe())

price

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2015-08-24 to 2020-08-20
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    1258 non-null   float64
 1   IBM     1258 non-null   float64
 2   MSFT    1258 non-null   float64
 3   GOOG    1258 non-null   float64
dtypes: float64(4)
memory usage: 89.1 KB
None
              AAPL          IBM         MSFT         GOOG
count  1258.000000  1258.000000  1258.000000  1258.000000
mean    175.499864   127.927212    95.589210  1020.496068
std      74.781649    11.173484    44.544198   236.794986
min      84.809998    92.307220    36.778946   582.059998
25%     110.200016   122.054560    56.620014   784.899979
50%     164.523193   129.362015    88.452236  1042.159973
75%     205.388294   134.315952   125.749487  1184.642517
max     473.100006   155.360657   216.017807  1581.750000


Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-08-24,95.294647,114.890862,37.878582,589.609985
2015-08-25,95.867607,112.880898,36.778946,582.059998
2015-08-26,101.366074,117.477425,38.814651,628.619995
2015-08-27,104.350983,118.950912,39.896111,637.609985
2015-08-28,104.692871,118.502495,39.923386,630.380005
...,...,...,...,...
2020-08-14,459.630005,125.269997,208.396240,1507.729980
2020-08-17,458.429993,124.440002,209.772919,1517.979980
2020-08-18,462.250000,124.919998,210.979996,1558.599976
2020-08-19,462.829987,123.839996,209.699997,1547.530029


In [46]:
# Collect each company's traded volume ('Volume') into one DataFrame,
# one column per ticker.
volume = pd.DataFrame({ticker: data['Volume'] for ticker, data in allData.items()})
# info() prints its report itself and returns None, so it is called bare;
# wrapping it in print() only appends a stray "None" line to the output.
volume.info()
print(volume.describe())

volume

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2015-08-24 to 2020-08-20
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    1258 non-null   float64
 1   IBM     1258 non-null   float64
 2   MSFT    1258 non-null   float64
 3   GOOG    1258 non-null   int64  
dtypes: float64(3), int64(1)
memory usage: 89.1 KB
None
               AAPL           IBM          MSFT          GOOG
count  1.258000e+03  1.258000e+03  1.258000e+03  1.258000e+03
mean   3.449670e+07  4.575614e+06  2.990448e+07  1.700609e+06
std    1.643824e+07  2.675079e+06  1.464148e+07  7.968964e+05
min    1.136200e+07  1.193000e+06  7.425600e+06  3.475000e+05
25%    2.362235e+07  3.093900e+06  2.068352e+07  1.213150e+06
50%    3.015910e+07  3.870150e+06  2.610305e+07  1.488950e+06
75%    4.080385e+07  5.091775e+06  3.409328e+07  1.948650e+06
max    1.622063e+08  3.049020e+07  1.352271e+08  6.653900e+06


Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-08-24,162206300.0,10189700.0,88753700.0,5770300
2015-08-25,103601600.0,7073200.0,70616600.0,3538000
2015-08-26,96774600.0,6221800.0,63408000.0,4235900
2015-08-27,84616100.0,4976600.0,50943200.0,3491300
2015-08-28,53164400.0,4076300.0,28246700.0,1978700
...,...,...,...,...
2020-08-14,41391300.0,2963400.0,17958900.0,1354800
2020-08-17,29890400.0,3360100.0,20184800.0,1378300
2020-08-18,26408400.0,2882400.0,21336200.0,2027100
2020-08-19,36283800.0,3741700.0,27600900.0,1660000


### 수익률

In [None]:
# GOOG 어제 종가 1만원, 오늘 1만 2천원
# 하루동안 수익률? 20%

# 1.2만 - 1만
# ------------- = 수익률
#     1만원  

# 60일 전에 매수, 금일 종가에 매도, 수익률
# 오늘 종가 - 60일전 매수가
# -----------------------
#     60일전 매수가

In [47]:
# Five consecutive closing prices for two stocks (toy data for pct_change).
df = pd.DataFrame(
    data={
        "삼성전자": [52200, 52300, 52900, 52000, 51700],
        "LG전자": [68200, 67800, 68800, 67500, 66300],
    }
)
df

Unnamed: 0,삼성전자,LG전자
0,52200,68200
1,52300,67800
2,52900,68800
3,52000,67500
4,51700,66300


In [49]:
# pct_change: fractional change of each row relative to an earlier row (by default the row directly above)
df.pct_change()*100 # expressed in percent (%)

Unnamed: 0,삼성전자,LG전자
0,,
1,0.191571,-0.58651
2,1.147228,1.474926
3,-1.701323,-1.889535
4,-0.576923,-1.777778


In [50]:
df.pct_change(periods=2)*100 # periods sets how many rows back the comparison row is (here: two rows above)

Unnamed: 0,삼성전자,LG전자
0,,
1,,
2,1.340996,0.879765
3,-0.573614,-0.442478
4,-2.268431,-3.633721


In [52]:
price.pct_change()*100 # return (%) relative to the previous row's close

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-08-24,,,,
2015-08-25,0.601251,-1.749455,-2.903055,-1.280505
2015-08-26,5.735479,4.072015,5.534975,7.999175
2015-08-27,2.944683,1.254273,2.786213,1.430115
2015-08-28,0.327633,-0.376977,0.068365,-1.133919
...,...,...,...,...
2020-08-14,-0.089123,0.191952,0.095834,-0.705981
2020-08-17,-0.261082,-0.662564,0.660606,0.679830
2020-08-18,0.833280,0.385725,0.575421,2.675924
2020-08-19,0.125470,-0.864555,-0.606692,-0.710249


In [None]:
#연습문제1.
#4개 features에 대해 상관계수 및 공분산 구하기
#공분산, 상관계수
#Google과 양의 상관관계, 음의 상관관계가 가장 큰 회사 출력

# 결측치

In [66]:
# 1. Missing values (NaN) encountered when merging tables
visited = pd.read_csv("../etc/data/survey_visited.csv")
survey = pd.read_csv("../etc/data/survey_survey.csv")

In [67]:
visited.head()

Unnamed: 0,ident,site,dated
0,619,DR-1,1927-02-08
1,622,DR-1,1927-02-10
2,734,DR-3,1939-01-07
3,735,DR-3,1930-01-12
4,751,DR-3,1930-02-26


In [68]:
survey.head()

Unnamed: 0,taken,person,quant,reading
0,619,dyer,rad,9.82
1,619,dyer,sal,0.13
2,622,dyer,rad,7.8
3,622,dyer,sal,0.09
4,734,pb,rad,8.41


In [75]:
# Join visits to survey readings: visited.ident is matched against survey.taken.
vs = visited.merge(survey, left_on='ident', right_on='taken')
vs

Unnamed: 0,ident,site,dated,taken,person,quant,reading
0,619,DR-1,1927-02-08,619,dyer,rad,9.82
1,619,DR-1,1927-02-08,619,dyer,sal,0.13
2,622,DR-1,1927-02-10,622,dyer,rad,7.8
3,622,DR-1,1927-02-10,622,dyer,sal,0.09
4,734,DR-3,1939-01-07,734,pb,rad,8.41
5,734,DR-3,1939-01-07,734,lake,sal,0.05
6,734,DR-3,1939-01-07,734,pb,temp,-21.5
7,735,DR-3,1930-01-12,735,pb,rad,7.22
8,735,DR-3,1930-01-12,735,,sal,0.06
9,735,DR-3,1930-01-12,735,,temp,-26.0


In [77]:
# Case: the value never existed in the first place.
n = pd.Series(data=[4, np.nan], index=['goat', 'amoeba'])
n

goat      4.0
amoeba    NaN
dtype: float64

In [78]:
# Case: the value exists in principle but was never collected.
# NOTE(review): the column name 'mssing' looks like a typo for 'missing'; it is
# kept unchanged because it is a runtime column label other cells may rely on.
sci = pd.DataFrame(
    data={
        'name': ['Rosa', 'Will'],
        'occu': ['Scientist', 'Chemist'],
        'mssing': [np.nan] * 2,
    }
)
sci

Unnamed: 0,name,occu,mssing
0,Rosa,Scientist,
1,Will,Chemist,


In [87]:
gap = pd.read_csv("../etc/data/gapminder.tsv", sep="\t")

In [85]:
gap.info() # no column contains any null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [86]:
gap.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165877
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846989
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


In [88]:
gap.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [102]:
# Mean life expectancy per year; selecting the single 'lifeExp' column makes the result a Series.
life_exp = gap.groupby(["year"])['lifeExp'].mean()
print(type(life_exp))
life_exp

<class 'pandas.core.series.Series'>


year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [109]:
# Three equivalent ways to extract the mean life expectancy for 1952
print(life_exp.iloc[0]) # by position (first entry)
print(life_exp[1952]) # by index label via []
print(life_exp.loc[1952]) # by index label via .loc

49.05761971830987
49.05761971830987
49.05761971830987


In [117]:
# Load the Ebola country time series used in the missing-value examples.
ebola = pd.read_csv("../etc/data/country_timeseries.csv")
# info() prints its report itself and returns None, so it is called bare;
# wrapping it in print() only appends a stray "None" line to the output.
ebola.info()
ebola

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 122 non-null    object 
 1   Day                  122 non-null    int64  
 2   Cases_Guinea         93 non-null     float64
 3   Cases_Liberia        83 non-null     float64
 4   Cases_SierraLeone    87 non-null     float64
 5   Cases_Nigeria        38 non-null     float64
 6   Cases_Senegal        25 non-null     float64
 7   Cases_UnitedStates   18 non-null     float64
 8   Cases_Spain          16 non-null     float64
 9   Cases_Mali           12 non-null     float64
 10  Deaths_Guinea        92 non-null     float64
 11  Deaths_Liberia       81 non-null     float64
 12  Deaths_SierraLeone   87 non-null     float64
 13  Deaths_Nigeria       38 non-null     float64
 14  Deaths_Senegal       22 non-null     float64
 15  Deaths_UnitedStates  18 non-null     flo

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,,,,,,66.0,6.0,5.0,,,,,
118,3/26/2014,4,86.0,,,,,,,,62.0,,,,,,,
119,3/25/2014,3,86.0,,,,,,,,60.0,,,,,,,
120,3/24/2014,2,86.0,,,,,,,,59.0,,,,,,,


In [119]:
# count: returns the number of non-missing values in each column
ebola.count()

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64

In [124]:
numRows = ebola.shape[0]
print(ebola.count()) # a Series
print(numRows) # a scalar
numMissing = numRows-ebola.count() # broadcasting (scalar - Series) -> number of missing values per column
print(numMissing)

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64
122
Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int6

## nonzero

In [130]:
# False counts as zero and True as nonzero, so count_nonzero counts the True (null) entries
np.count_nonzero(ebola['Cases_Guinea'].isnull())
# 29 null values

29

In [131]:
np.count_nonzero(ebola.isnull())
# total number of null values in the whole ebola frame

1214

## value_counts()

In [133]:
# value_counts()
# a very frequently used method
# counts how many times each distinct value occurs
ebola.Cases_Guinea.value_counts()

86.0      3
112.0     2
390.0     2
495.0     2
2597.0    1
         ..
235.0     1
231.0     1
226.0     1
224.0     1
2776.0    1
Name: Cases_Guinea, Length: 88, dtype: int64

In [134]:
# dropna=False also counts the missing values (NaN)
ebola.Cases_Guinea.value_counts(dropna=False)

NaN       29
86.0       3
495.0      2
112.0      2
390.0      2
          ..
235.0      1
231.0      1
226.0      1
224.0      1
2776.0     1
Name: Cases_Guinea, Length: 89, dtype: int64

In [136]:
ebola.fillna(0) # fill every missing value with 0

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,0.0,10030.0,0.0,0.0,0.0,0.0,0.0,1786.0,0.0,2977.0,0.0,0.0,0.0,0.0,0.0
1,1/4/2015,288,2775.0,0.0,9780.0,0.0,0.0,0.0,0.0,0.0,1781.0,0.0,2943.0,0.0,0.0,0.0,0.0,0.0
2,1/3/2015,287,2769.0,8166.0,9722.0,0.0,0.0,0.0,0.0,0.0,1767.0,3496.0,2915.0,0.0,0.0,0.0,0.0,0.0
3,1/2/2015,286,0.0,8157.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3496.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12/31/2014,284,2730.0,8115.0,9633.0,0.0,0.0,0.0,0.0,0.0,1739.0,3471.0,2827.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,0.0,0.0,0.0,0.0,0.0,66.0,6.0,5.0,0.0,0.0,0.0,0.0,0.0
118,3/26/2014,4,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,3/25/2014,3,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,3/24/2014,2,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
# show only the first 5 rows and the first 5 columns
ebola.fillna(0).iloc[:5, :5]

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone
0,1/5/2015,289,2776.0,0.0,10030.0
1,1/4/2015,288,2775.0,0.0,9780.0
2,1/3/2015,287,2769.0,8166.0,9722.0
3,1/2/2015,286,0.0,8157.0,0.0
4,12/31/2014,284,2730.0,8115.0,9633.0


In [139]:
# Forward fill: each NaN takes the value of the nearest non-missing row above it;
# when no such row exists (NaNs at the top of a column) the NaN is kept.
# .ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in pandas 2.1+.
ebola.ffill()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,2769.0,8157.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,66.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
118,3/26/2014,4,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,62.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
119,3/25/2014,3,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,60.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
120,3/24/2014,2,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,59.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0


In [140]:
# Backward fill: each NaN takes the value of the nearest non-missing row below it;
# when no such row exists (NaNs at the bottom of a column) the NaN is kept.
# .bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in pandas 2.1+.
ebola.bfill()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,8166.0,10030.0,20.0,1.0,4.0,1.0,7.0,1786.0,3496.0,2977.0,8.0,0.0,1.0,0.0,6.0
1,1/4/2015,288,2775.0,8166.0,9780.0,20.0,1.0,4.0,1.0,7.0,1781.0,3496.0,2943.0,8.0,0.0,1.0,0.0,6.0
2,1/3/2015,287,2769.0,8166.0,9722.0,20.0,1.0,4.0,1.0,7.0,1767.0,3496.0,2915.0,8.0,0.0,1.0,0.0,6.0
3,1/2/2015,286,2730.0,8157.0,9633.0,20.0,1.0,4.0,1.0,7.0,1739.0,3496.0,2827.0,8.0,0.0,1.0,0.0,6.0
4,12/31/2014,284,2730.0,8115.0,9633.0,20.0,1.0,4.0,1.0,7.0,1739.0,3471.0,2827.0,8.0,0.0,1.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,,,,,,66.0,6.0,5.0,,,,,
118,3/26/2014,4,86.0,,,,,,,,62.0,,,,,,,
119,3/25/2014,3,86.0,,,,,,,,60.0,,,,,,,
120,3/24/2014,2,86.0,,,,,,,,59.0,,,,,,,


In [141]:
# Fill NaN values by interpolating between neighbouring rows (linear by default in pandas):
# a single-row gap gets the mean of the rows directly above and below it.
# As the output shows, NaNs at the top of a column stay NaN, while NaNs at the
# bottom repeat the last valid value.
ebola.interpolate()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,2749.5,8157.0,9677.5,,,,,,1753.0,3496.0,2871.0,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,66.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
118,3/26/2014,4,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,62.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
119,3/25/2014,3,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,60.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
120,3/24/2014,2,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,59.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0


In [142]:
ebola.shape

(122, 18)

In [146]:
# Drop every row that contains at least one NaN.
ebolaDropna = ebola.dropna()
# A bare expression in the middle of a cell is evaluated but never displayed,
# so the shape is printed explicitly.
print(ebolaDropna.shape) # (1, 18): only one row survives, which makes analysis difficult;
                         # dropping every row with a missing value is rarely a good strategy
ebolaDropna

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
19,11/18/2014,241,2047.0,7082.0,6190.0,20.0,1.0,4.0,1.0,6.0,1214.0,2963.0,1267.0,8.0,0.0,1.0,0.0,6.0


In [148]:
#기니, 리베리아, 시에라리온 세 국가의 발병자수를 모두 더한 다음
#새롭게 Cases_Multiple이라는 컬럼을 생성하고 저장하시오.
# => feature engineering(특성공학) : 기존의 변수로 파생변수를 생성

# Adds a derived column to ebola in place. `+` propagates NaN, so a row gets a
# total only when all three countries have a value for that day.
ebola['Cases_Multiple'] = ebola['Cases_Guinea']+ebola['Cases_Liberia']+ebola['Cases_SierraLeone']
ebola

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali,Cases_Multiple
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,,20657.0
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,,20478.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,,,,,,66.0,6.0,5.0,,,,,,117.0
118,3/26/2014,4,86.0,,,,,,,,62.0,,,,,,,,
119,3/25/2014,3,86.0,,,,,,,,60.0,,,,,,,,
120,3/24/2014,2,86.0,,,,,,,,59.0,,,,,,,,


In [161]:
# Exercise: extract only the Guinea, Liberia, Sierra Leone and Cases_Multiple
# columns from ebola into a new DataFrame named ebola_subset.

ebola_subset = ebola.loc[:,['Cases_Guinea', 'Cases_Liberia', 'Cases_SierraLeone', 'Cases_Multiple']]
ebola_subset

Unnamed: 0,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Multiple
0,2776.0,,10030.0,
1,2775.0,,9780.0,
2,2769.0,8166.0,9722.0,20657.0
3,,8157.0,,
4,2730.0,8115.0,9633.0,20478.0
...,...,...,...,...
117,103.0,8.0,6.0,117.0
118,86.0,,,
119,86.0,,,
120,86.0,,,


In [163]:
# Show the total of ebola.Cases_Guinea
ebola.Cases_Guinea.sum() # skipna=True is the default, so NaN values are ignored

84729.0