In [6]:
import numpy as np
import pandas as pd

In [7]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)

# 누락된 데이터missing data 다루기
* null, NaN, NA, None
* 파이썬/pandas에서는 NaN(float) 또는 None(object) 으로 취급
* 단, 정수형 누락값인 NA는 pandas에서는 취급불가
* 누락된 데이터는 numpy 모듈이 지원(np.nan)

In [3]:
# 결측치 다루기
a = np.array([1,2,None,4,5])
print(a)   # 숫자로 구성되어 있지만 데이터형식은 none 때문에 object로 취급

[1 2 None 4 5]


In [4]:
print(a.dtype)

object


In [5]:
a * 100   # 결측치(none) 때문에 산술연살 실패

TypeError: unsupported operand type(s) for *: 'NoneType' and 'int'

In [9]:
a.sum()   # 결측치(none) 때문에 산술연살 실패

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [10]:
b = np.array([1,2,np.nan,4,5])
print(b)

[ 1.  2. nan  4.  5.]


In [11]:
print(b.dtype)

float64


In [12]:
print(b * 100)

[100. 200.  nan 400. 500.]


In [13]:
b.sum()

nan

In [14]:
c = pd.Series([1,2,np.nan,4,5])
print(c)

0    1.0
1    2.0
2    NaN
3    4.0
4    5.0
dtype: float64


In [15]:
c * 100

0    100.0
1    200.0
2      NaN
3    400.0
4    500.0
dtype: float64

In [16]:
c.sum()

12.0

# null값 다루기
* pandas 자료구조에서는 null 값을 감지하고 삭제하는 기능 제공 
  + isna(isnull, notnull), dropna, fillna

In [17]:
c.isnull()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [18]:
c.notnull()

0     True
1     True
2    False
3     True
4     True
dtype: bool

In [19]:
# isna (pandas 0.21이상 지원)
c.isna()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [20]:
# null 갯수 확인 : 
c.isna().sum()

1

In [21]:
# null 제거 : dropna
c.dropna()

0    1.0
1    2.0
3    4.0
4    5.0
dtype: float64

In [22]:
# null값 대체 : 평균, 최소/최대, 최빈값, 중앙값
c.fillna(0)

0    1.0
1    2.0
2    0.0
3    4.0
4    5.0
dtype: float64

In [23]:
# 우편번호 데이터프레임의 결측치를 적절히 처리하고 출력
zip = pd.read_csv('data/zipcode2013.csv', sep='\t')
zip

Unnamed: 0,ZIPCODE,SIDO,GUGUN,DONG,RI,BUNJI,SEQ
0,135-806,서울,강남구,개포1동,경남아파트,,1
1,135-807,서울,강남구,개포1동,우성3차아파트,(1∼6동),2
2,135-806,서울,강남구,개포1동,우성9차아파트,(901∼902동),3
3,135-770,서울,강남구,개포1동,주공아파트,(1∼16동),4
4,135-805,서울,강남구,개포1동,주공아파트,(17∼40동),5
...,...,...,...,...,...,...,...
52139,363-823,충북,청원군,현도면,죽전리,,52140
52140,363-823,충북,청원군,현도면,중삼리,,52141
52141,363-822,충북,청원군,현도면,중척리,,52142
52142,363-821,충북,청원군,현도면,하석리,,52143


In [24]:
zip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52144 entries, 0 to 52143
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ZIPCODE  52144 non-null  object
 1   SIDO     52144 non-null  object
 2   GUGUN    51958 non-null  object
 3   DONG     52144 non-null  object
 4   RI       31655 non-null  object
 5   BUNJI    20603 non-null  object
 6   SEQ      52144 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 2.8+ MB


In [25]:
zip.isna().sum()

ZIPCODE        0
SIDO           0
GUGUN        186
DONG           0
RI         20489
BUNJI      31541
SEQ            0
dtype: int64

In [26]:
#결측치 처리는 공백으로 대체
# zip = zip.fillna(' ')
zip.fillna('', inplace=True)  # inplace 옵션으로 변경사항을 바로 변수에 저장
zip

Unnamed: 0,ZIPCODE,SIDO,GUGUN,DONG,RI,BUNJI,SEQ
0,135-806,서울,강남구,개포1동,경남아파트,,1
1,135-807,서울,강남구,개포1동,우성3차아파트,(1∼6동),2
2,135-806,서울,강남구,개포1동,우성9차아파트,(901∼902동),3
3,135-770,서울,강남구,개포1동,주공아파트,(1∼16동),4
4,135-805,서울,강남구,개포1동,주공아파트,(17∼40동),5
...,...,...,...,...,...,...,...
52139,363-823,충북,청원군,현도면,죽전리,,52140
52140,363-823,충북,청원군,현도면,중삼리,,52141
52141,363-822,충북,청원군,현도면,중척리,,52142
52142,363-821,충북,청원군,현도면,하석리,,52143


In [27]:
# titanic 데이터프레임의 결측치를 적절히 처리하고 출력
titanic = pd.read_csv('data/titanic.csv')
titanic.head(16)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C


In [28]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   name      1309 non-null   object 
 3   sex       1309 non-null   object 
 4   age       1046 non-null   float64
 5   sibsp     1309 non-null   int64  
 6   parch     1309 non-null   int64  
 7   ticket    1309 non-null   object 
 8   fare      1308 non-null   float64
 9   cabin     295 non-null    object 
 10  embarked  1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


In [29]:
titanic.isna().sum()

pclass         0
survived       0
name           0
sex            0
age          263
sibsp          0
parch          0
ticket         0
fare           1
cabin       1014
embarked       2
dtype: int64

In [30]:
# age는 평균값으로 대체
mean = np.mean(titanic.age)
titanic.age.fillna(mean, inplace=True)

In [31]:
# cabin 컬럼을 기준으로 dropna하는 것은 비효율적 - 1014개의 행이 제거
# 따라서, drop메서드를 이용해서 해당컬럼을 제거
# 객체명.drop('컬럼명', axis=1)
titanic.drop('cabin', axis=1, inplace=True)

In [32]:
titanic.isna().sum()

pclass      0
survived    0
name        0
sex         0
age         0
sibsp       0
parch       0
ticket      0
fare        1
embarked    2
dtype: int64

In [33]:
# embarked, fare 컬럼의 결측치 제거(행기준)
titanic.dropna(inplace=True)

In [34]:
titanic.isna().sum()

pclass      0
survived    0
name        0
sex         0
age         0
sibsp       0
parch       0
ticket      0
fare        0
embarked    0
dtype: int64

# dataframe에 새로운 항목 추가
+ 기존 컬럼의 값을 기준으로 새로운 컬럼을 생성할 수 있음 - 파생변수
+ 기존 변수를 조합해서 새로운 변수를 만들어 냄
+ 데이터를 분석하기 좋은 형태로 만들기 위해 수행
+ 객채명['새로운컬럼명'] = 값들

In [35]:
# 문자형 변수를 숫자형 변수로 인코딩 -> 컴퓨터는 숫자를 더 좋아하기 때문
# 성별 컬럼의 female, male값을 0, 1로 변환
# 승선위치 컬럼의 C, Q, S값을 0, 1, 2로 변환
def getGender(x):   # 성별변환함수
    gender = 0
    if x == 'male' : gender = 1
    return gender
    
def getPort(x):
    port = 0
    if x == 'Q' : port = 1
    elif x == 'S' : port = 2
    return port

In [36]:
# 새로운 성별 컬럼 생성
titanic['gender'] = titanic['sex'].apply(lambda x: getGender(x) )
titanic[['sex','gender']].head()

Unnamed: 0,sex,gender
0,female,0
1,male,1
2,female,0
3,male,1
4,female,0


In [37]:
titanic['port'] = titanic['embarked'].apply(lambda x: getPort(x) )
titanic[['embarked','port']]

Unnamed: 0,embarked,port
0,S,2
1,S,2
2,S,2
3,S,2
4,S,2
...,...,...
1304,C,0
1305,C,0
1306,C,0
1307,C,0


In [38]:
# 숫자형 변수를 문자형 변수로 인코딩
# pclass 컬럼을 기준으로 seat컬럼을 생성(1st, 2nd, 3rd)
def getSeat(x):   
    seat = '1st'
    if x == 2 : seat = '2nd'
    elif x == 3 : seat = '3rd'
    return seat

In [39]:
titanic['seat'] = titanic['pclass'].apply(lambda x: getSeat(x) )
titanic[['pclass','seat']]

Unnamed: 0,pclass,seat
0,1,1st
1,1,1st
2,1,1st
3,1,1st
4,1,1st
...,...,...
1304,3,3rd
1305,3,3rd
1306,3,3rd
1307,3,3rd


In [40]:
# embarked 컬럼을 기준오르 port2컬럼을 생성(cherbourg, queenstown, southamptond)
def getPort2(x):   # 
    port2 = 'cherbourg'
    if x == 'Q' : port2 = 'queenstown'
    elif x == 'S' : port2 = 'southamptond'
    return port2

In [41]:
titanic['port2'] = titanic['embarked'].apply(lambda x: getPort2(x) )
titanic[['embarked','port2']]

Unnamed: 0,embarked,port2
0,S,southamptond
1,S,southamptond
2,S,southamptond
3,S,southamptond
4,S,southamptond
...,...,...
1304,C,cherbourg
1305,C,cherbourg
1306,C,cherbourg
1307,C,cherbourg


In [42]:
# 문자열 데이터를 범주형 데이터로 변환
# 승객들의 직책을 추출 : getTitle
titanic.name.head()

0                      Allen, Miss. Elisabeth Walton
1                     Allison, Master. Hudson Trevor
2                       Allison, Miss. Helen Loraine
3               Allison, Mr. Hudson Joshua Creighton
4    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
Name: name, dtype: object

In [43]:
titanic.name.tail()

1304         Zabour, Miss. Hileni
1305        Zabour, Miss. Thamine
1306    Zakarian, Mr. Mapriededer
1307          Zakarian, Mr. Ortin
1308           Zimmerman, Mr. Leo
Name: name, dtype: object

In [44]:
fmt = ' ([A-Za-z]+)'    # []문자의 범위, + 문자수, () 문자조합 \.특수기호. 의미
titanic['title'] = titanic.name.str.extract(fmt)    # extract(형식) 지정한 형식과 일치하는 문자를 추출함
titanic['title'].head()

0      Miss
1    Master
2      Miss
3        Mr
4       Mrs
Name: title, dtype: object

In [46]:
# 직책별 빈도
titanic.title.value_counts()

Mr             735
Miss           255
Mrs            190
Master          59
y                8
Rev              8
Dr               8
Planke           4
Col              4
Impe             3
Billiard         3
Messemaeker      2
Mlle             2
Major            2
Carlo            2
Ms               2
Gordon           2
Shawah           1
Cruyssen         1
Steen            1
Walle            1
Velde            1
Melkebeke        1
Palmquist        1
Capt             1
Pelsmaeker       1
Mulder           1
Khalil           1
Mme              1
der              1
Don              1
the              1
Jonkheer         1
Brito            1
Name: title, dtype: int64

In [47]:
# 연속형 데이터를 범주형으로 변환 : getAges
# 승객 나이를 유아, 10대, 20대~ 80대
# 신생아 : ~ 1까지
# 유아 :  2 ~ 6까지
# 어린이 : 7 ~ 12까지
# 청소년 : 13 ~ 19까지
# 성인 : 20대, 30대, ~ 80대
def getAge(x):   # 
    age = '신생아'
    if 2<= x <7 : age = '유아'
    elif x < 13 : age = '어린이'
    elif x < 20 : age = '청소년'
    elif x < 30 : age = '20대'
    elif x < 40 : age = '30대'
    elif x < 50 : age = '40대'
    elif x < 60 : age = '50대'
    elif x < 70 : age = '60대'
    elif x < 80 : age = '70대'
    elif 80 <= x : age = '80대'
    return age

titanic['ages'] = titanic['age'].apply(lambda x: getAge(x) )
titanic[['age','ages']].head(50)

Unnamed: 0,age,ages
0,29.0,20대
1,0.9167,어린이
2,2.0,유아
3,30.0,30대
4,25.0,20대
5,48.0,40대
6,63.0,60대
7,39.0,30대
8,53.0,50대
9,71.0,70대


In [49]:
titanic.ages.value_counts()

20대    607
30대    231
40대    135
청소년    131
50대     70
어린이     54
유아      40
60대     30
70대      7
80대      1
Name: ages, dtype: int64

In [102]:
#titanic2로 저장
titanic.to_csv('titanic2')
titanic2 = pd.read_csv('titanic2')

In [74]:
# pandas 기술통계 다루기
name = ['Tom','James','Ricky','Vin','Steve','Smith','Jack','Lee','David','Gasper','Betina','Andres']
age = [25,26,25,23,30,29,23,34,40,30,51,46]
rating = [4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65]
data = {'name':name, 'age':age, 'rating':rating}

df = pd.DataFrame(data)
df.head(12)

Unnamed: 0,name,age,rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8
7,Lee,34,3.78
8,David,40,2.98
9,Gasper,30,4.8


In [53]:
# 데이터 컬럼별 행수 : count
df.count()

name      12
age       12
rating    12
dtype: int64

In [61]:
# 전체합 : sum()
# numeric_only=True 숫자 데이터만 대상으로 전체합을 구함
df.sum(numeric_only=True)   #열기반

age       382.00
rating     44.92
dtype: float64

In [60]:
df.sum(axis=1, numeric_only=True)   #행기반

0     29.23
1     29.24
2     28.98
3     25.56
4     33.20
5     33.60
6     26.80
7     37.78
8     42.98
9     34.80
10    55.10
11    49.65
dtype: float64

In [63]:
df.mean(numeric_only=True)

age       31.833333
rating     3.743333
dtype: float64

In [82]:
# 최빈값 : mode
df.mode()
# df.age.mode()  
# df.rating.mode()  

Unnamed: 0,name,age,rating
0,Andres,23.0,2.56
1,Betina,25.0,2.98
2,David,30.0,3.2
3,Gasper,,3.24
4,Jack,,3.65
5,James,,3.78
6,Lee,,3.8
7,Ricky,,3.98
8,Smith,,4.1
9,Steve,,4.23


In [67]:
# 중앙값 : median
df.median(numeric_only=True)

age       29.50
rating     3.79
dtype: float64

In [68]:
# 최소값 : min
df.min(numeric_only=True)

age       23.00
rating     2.56
dtype: float64

In [69]:
# 최대값 : max
df.max(numeric_only=True)

age       51.0
rating     4.8
dtype: float64

In [88]:
# 누적합 : cumsum
df.loc[:, ['age','rating']].cumsum().head()

Unnamed: 0,age,rating
0,25,4.23
1,51,7.47
2,76,11.45
3,99,14.01
4,129,17.21


In [None]:
# 누적곱 : cumprod

In [89]:
df.loc[:, ['age','rating']].cumprod().head()

Unnamed: 0,age,rating
0,25,4.23
1,650,13.7052
2,16250,54.546696
3,373750,139.639542
4,11212500,446.846534


In [90]:
# 표준편차 : std
df.loc[:, ['age','rating']].std().head()

age       9.232682
rating    0.661628
dtype: float64

In [96]:
# 분산 : var
df.var(numeric_only=True)

age       85.242424
rating     0.437752
dtype: float64

In [99]:
# 공분산 : cov
df.cov().head()

Unnamed: 0,age,rating
age,85.242424,0.357879
rating,0.357879,0.437752


In [100]:
# 상관계수 : corr
df.corr().head()

Unnamed: 0,age,rating
age,1.0,0.058586
rating,0.058586,1.0


In [107]:
# 타이타닉 데이터에 대해 기술통계
titanic2.head()

Unnamed: 0.1,Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,gender,port,seat,port2,title,ages
0,0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,S,0,2,1st,southamptond,Miss,20대
1,1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,S,1,2,1st,southamptond,Master,어린이
2,2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,S,0,2,1st,southamptond,Miss,유아
3,3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,S,1,2,1st,southamptond,Mr,30대
4,4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,S,0,2,1st,southamptond,Mrs,20대


In [109]:
titanic2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306 entries, 0 to 1305
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1306 non-null   int64  
 1   pclass      1306 non-null   int64  
 2   survived    1306 non-null   int64  
 3   name        1306 non-null   object 
 4   sex         1306 non-null   object 
 5   age         1306 non-null   float64
 6   sibsp       1306 non-null   int64  
 7   parch       1306 non-null   int64  
 8   ticket      1306 non-null   object 
 9   fare        1306 non-null   float64
 10  embarked    1306 non-null   object 
 11  gender      1306 non-null   int64  
 12  port        1306 non-null   int64  
 13  seat        1306 non-null   object 
 14  port2       1306 non-null   object 
 15  title       1306 non-null   object 
 16  ages        1306 non-null   object 
dtypes: float64(2), int64(7), object(8)
memory usage: 173.6+ KB


In [118]:
titanic2.pclass = titanic2.pclass.astype(str)
titanic2.survived = titanic2.survived.astype(str)
titanic2.sibsp = titanic2.sibsp.astype(str)
titanic2.gender = titanic2.gender.astype(str)
titanic2.parch = titanic2.parch.astype(str)
titanic2.port = titanic2.port.astype(str)

In [119]:
titanic2.count()

Unnamed: 0    1306
pclass        1306
survived      1306
name          1306
sex           1306
age           1306
sibsp         1306
parch         1306
ticket        1306
fare          1306
embarked      1306
gender        1306
port          1306
seat          1306
port2         1306
title         1306
ages          1306
dtype: int64

In [120]:
titanic2.median(numeric_only=True)

Unnamed: 0    654.500000
age            29.881135
fare           14.454200
dtype: float64

In [121]:
titanic2.mean(numeric_only=True)

Unnamed: 0    654.218224
age            29.826880
fare           33.223956
dtype: float64

In [122]:
titanic2.min(numeric_only=True)

Unnamed: 0    0.0000
age           0.1667
fare          0.0000
dtype: float64

In [123]:
titanic2.corr()

Unnamed: 0.1,Unnamed: 0,age,fare
Unnamed: 0,1.0,-0.298731,-0.48043
age,-0.298731,1.0,0.170266
fare,-0.48043,0.170266,1.0


In [124]:
titanic2.sum(numeric_only=True)

Unnamed: 0    854409.000000
age            38953.905077
fare           43390.486900
dtype: float64

In [125]:
titanic2.std(numeric_only=True)

Unnamed: 0    377.745795
age            12.837289
fare           51.765986
dtype: float64