# 10-1 간단한 함수 만들기

In [5]:
def my_sq(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

def my_exp(x, n):
    """Return ``x`` raised to the power ``n``."""
    return pow(x, n)

In [6]:
# Both calls compute 4 to the power 2, so each line prints 16.
print(my_sq(4))
print(my_exp(4,2))

16
16


# 10-2 apply 메서드 사용하기 - 기초

In [7]:
import pandas as pd

# Small two-column frame (columns 'a' and 'b', three rows) used as example data.
df = pd.DataFrame({'a':[10,20,30], 'b':[20,30,40]})
# A bare expression on the last line of a notebook cell displays its value.
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [8]:
# Vectorized arithmetic: the power operator is applied element-wise to column 'a'.
df['a']**2

0    100
1    400
2    900
Name: a, dtype: int64

In [9]:
# Series.apply calls my_sq on each element of column 'a' and collects the results.
df['a'].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [10]:
# Extra keyword arguments given to apply (here n=2) are forwarded to my_exp.
df['a'].apply(my_exp, n=2)

0    100
1    400
2    900
Name: a, dtype: int64

In [11]:
# Same call with n=3: every element of column 'a' is cubed.
df['a'].apply(my_exp, n=3)

0     1000
1     8000
2    27000
Name: a, dtype: int64

In [12]:
# Compute the mean of a column
def avg_column(col):
    """Return the arithmetic mean of the values in ``col``.

    Intended for ``DataFrame.apply(avg_column)``, which hands each column
    to this function, but any iterable of numbers works.

    The divisor is the number of items actually iterated rather than a
    dimension of the module-level ``df``, so the result is correct for
    input of any length and the function does not rely on global state.

    Returns ``float("nan")`` for empty input, because the mean of no
    values is undefined.
    """
    total = 0  # named ``total`` so the built-in ``sum`` is not shadowed
    count = 0
    for item in col:
        total += item
        count += 1
    if count == 0:
        return float("nan")
    return total / count

# Compute the mean of a row
def avg_row(row):
    """Return the arithmetic mean of the values in ``row``.

    Intended for ``DataFrame.apply(avg_row, axis=1)``, which hands each
    row to this function, but any iterable of numbers works.

    The divisor is the number of items actually iterated rather than a
    dimension of the module-level ``df``, so the result is correct for
    input of any width and the function does not rely on global state.

    Returns ``float("nan")`` for empty input, because the mean of no
    values is undefined.
    """
    total = 0  # named ``total`` so the built-in ``sum`` is not shadowed
    count = 0
    for item in row:
        total += item
        count += 1
    if count == 0:
        return float("nan")
    return total / count

In [13]:
# Display df again for reference before applying functions to the whole frame.
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [14]:
# With the default axis, DataFrame.apply passes each column to avg_column;
# the result is indexed by column name.
df.apply(avg_column)

a    20.0
b    30.0
dtype: float64

In [15]:
# axis=1 makes DataFrame.apply pass each row to avg_row instead of each column;
# the result is indexed by row label.
df.apply(avg_row, axis=1)

0    15.0
1    25.0
2    35.0
dtype: float64

# 10-3 apply 메서드 사용하기 - 고급

In [16]:
import seaborn as sns

# Load the "titanic" example dataset through seaborn's dataset loader.
titanic = sns.load_dataset("titanic")

In [17]:
# Print a summary of the frame: column names, non-null counts and dtypes.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


### 누락값의 개수를 알려주는 함수 만들기

In [18]:
import numpy as np

def count_missing(vec):
    """Return the number of missing values in ``vec``.

    ``pd.isnull`` builds a boolean mask that is True where a value is
    missing; summing the mask with ``np.sum`` counts the True entries.
    """
    return np.sum(pd.isnull(vec))

In [19]:
# Column-wise apply: count_missing is called on each column, giving the
# number of missing values per column.
cmis_col = titanic.apply(count_missing)
cmis_col

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### 과정을 알아보자

In [32]:
# pd.isnull returns a boolean Series: True where the value is missing, False otherwise.
pd.isnull(titanic['age'])

0      False
1      False
2      False
3      False
4      False
5       True
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17      True
18     False
19      True
20     False
21     False
22     False
23     False
24     False
25     False
26      True
27     False
28      True
29      True
       ...  
861    False
862    False
863     True
864    False
865    False
866    False
867    False
868     True
869    False
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878     True
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888     True
889    False
890    False
Name: age, Length: 891, dtype: bool

In [33]:
# Summing the boolean mask counts its True entries, i.e. the missing ages.
np.sum(pd.isnull(titanic['age']))

177

In [20]:
def prop_missing(vec):
    """Return the fraction of entries in ``vec`` that are missing.

    The numerator is ``count_missing(vec)`` and the denominator is
    ``vec.size``, the total number of entries.
    """
    return count_missing(vec) / vec.size

In [21]:
# Column-wise apply: prop_missing is called on each column, giving the
# proportion of missing values per column.
pmis_col = titanic.apply(prop_missing)
pmis_col

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

In [34]:
# The function can also be called directly on a single column, without apply.
count_missing(titanic['age'])

177

In [35]:
# Built-in alternative: yields the same per-column missing counts as
# titanic.apply(count_missing).
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

## 행 방향으로 처리하기

In [36]:
# Preview the first rows of the frame.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [39]:
# Row-wise apply (axis=1): count the missing values in each row and store
# the count in a new 'num_missing' column.
titanic['num_missing'] = titanic.apply(count_missing, axis=1)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,1


### 누락값이 2개 이상인 행들만 추출

In [44]:
# Boolean-mask selection: keep rows whose num_missing is greater than 1
# (two or more missing values), all columns, and show the first few.
titanic.loc[titanic.num_missing > 1, :].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True,2
17,1,2,male,,0,0,13.0,S,Second,man,True,,Southampton,yes,True,2
19,1,3,female,,0,0,7.225,C,Third,woman,False,,Cherbourg,yes,True,2
26,0,3,male,,0,0,7.225,C,Third,man,True,,Cherbourg,no,True,2
28,1,3,female,,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True,2
