In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the 'titanic' dataset through seaborn and preview the first five rows.
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## describe()
- 통계량 요약, 기본적으로 숫자형 컬럼만 대상 (`include` 옵션으로 다른 타입의 컬럼도 요약 가능)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# include='object' summarizes the string columns instead: count, unique, top, freq.
df.describe(include='object')

Unnamed: 0,sex,embarked,who,embark_town,alive
count,891,889,891,889,891
unique,2,3,3,3,2
top,male,S,man,Southampton,no
freq,577,644,537,644,549


## count()
- 데이터의 개수, Null값은 제외

In [5]:
# Number of non-null values in each column.
df.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [6]:
# count() can also be applied directly to a single selected column.
df['age'].count()

np.int64(714)

## mean()

In [7]:
# Mean of the 'age' column.
df['age'].mean()

np.float64(29.69911764705882)

In [8]:
# DataFrame.mean() without numeric_only=True fails on this frame because it
# also holds columns for which a mean cannot be computed (strings such as
# 'sex' or 'embarked'). The error is the point of this cell, so it is caught
# and displayed instead of aborting a top-to-bottom run of the notebook.
try:
    df.mean()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: can only concatenate str (not "int") to str

In [14]:
# numeric_only=True averages only the numeric/boolean columns and leaves out
# the columns for which a mean cannot be computed.
df.mean(numeric_only=True)

survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [15]:
# skipna=False means missing values are NOT skipped: any column that contains
# a NaN (here 'age') gets NaN as its mean instead of the mean of the rest.
df.mean(numeric_only=True, skipna=False)

survived       0.383838
pclass         2.308642
age                 NaN
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [10]:
# Preview the first five rows again before filtering by condition.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [11]:
# Mean fare paid by adult males.
# 'adult_male' already holds True/False values, so it is used directly as the
# boolean mask; comparing it with `== True` is redundant.
cond = df['adult_male']
df.loc[cond, 'fare'].mean()

np.float64(24.864181750465548)

In [12]:
# Mean age of first-class passengers who paid a fare between 30 and 40 (inclusive).
cond1 = df['fare'].ge(30)        # fare >= 30
cond2 = df['fare'].le(40)        # fare <= 40
cond3 = df['class'].eq('First')  # first-class ticket

df.loc[cond1 & cond2 & cond3, 'age'].mean()

np.float64(44.095238095238095)

## median()

In [20]:
# Small sample with very large values at the end, used to contrast mean and median.
numbers = [1, 2, 10, 11, 100, 1000]
pd.Series(numbers).mean()

np.float64(187.33333333333334)

In [21]:
pd.Series(numbers).median()
# Useful when extreme outliers distort the mean.
# Takes the value at the middle position of the ordered data.
# With an even number of values, the average of the two middle values is returned.

np.float64(10.5)

In [23]:
# Compare the mean and the median of 'age' side by side.
mean_value, median_value = df['age'].mean(), df['age'].median()
print(mean_value, median_value)

29.69911764705882 28.0


## sum()

In [26]:
# Column-wise sums of the numeric/boolean columns.
df.sum(numeric_only=True)

survived        342.0000
pclass         2057.0000
age           21205.1700
sibsp           466.0000
parch           340.0000
fare          28693.9493
adult_male      537.0000
alone           537.0000
dtype: float64

In [27]:
# Total of the 'fare' column.
df['fare'].sum()

np.float64(28693.9493)

In [28]:
# Running (cumulative) sum of 'fare'; the last value equals df['fare'].sum().
df['fare'].cumsum()

0          7.2500
1         78.5333
2         86.4583
3        139.5583
4        147.6083
          ...    
886    28602.7493
887    28632.7493
888    28656.1993
889    28686.1993
890    28693.9493
Name: fare, Length: 891, dtype: float64

In [30]:
# Cumulative product of 'age'. The values grow so quickly that the later rows
# show up as inf in the output, and rows with a missing age stay NaN.
df['age'].cumprod()

  return bound(*args, **kwds)


0            22.0
1           836.0
2         21736.0
3        760760.0
4      26626600.0
          ...    
886           inf
887           inf
888           NaN
889           inf
890           inf
Name: age, Length: 891, dtype: float64

## 표본분산 var()
- 데이터가 평균에서 얼마나 떨어져 있는지를 나타내는 값, 편차제곱의 합을 n-1로 나눈 값 (모분산은 n으로 나눔)

In [36]:
# Sample variance of 'fare' computed by hand.
fare_mean = df['fare'].mean()

# Sum of squared deviations from the mean.
total = df['fare'].sub(fare_mean).pow(2).sum()

# Population variance divides by n; sample variance divides by n - 1.
total_count = df['fare'].count() - 1

my_var = total / total_count
print(my_var)

2469.436845743116


In [37]:
# Built-in variance of 'fare'; the result matches the manual n - 1 computation.
df['fare'].var()

np.float64(2469.436845743116)

## 표준편차 std()

In [39]:
# Standard deviation obtained as the square root of the variance.
# numpy is imported once in the import cell at the top of the notebook.
np.sqrt(df['fare'].var())

np.float64(49.6934285971809)

In [40]:
# Built-in standard deviation; equals the square root of df['fare'].var().
df['fare'].std()

np.float64(49.6934285971809)

## min(), max()

In [41]:
# Smallest value in the 'age' column.
df['age'].min()

np.float64(0.42)

In [42]:
# Largest value in the 'age' column.
df['age'].max()

np.float64(80.0)

## agg() (aggregation)
- 컬럼을 선택하고 계산하고싶은 함수들을 넣어 사용

In [44]:
# agg() with a list of function names applies each of them to the column
# and returns one labelled result per function.
df['age'].agg(['max', 'min', 'count', 'mean'])

max       80.000000
min        0.420000
count    714.000000
mean      29.699118
Name: age, dtype: float64

In [45]:
# With several columns selected, agg() returns a table with one row per
# function and one column per selected column.
df[['age', 'fare']].agg(['min', 'max'])

Unnamed: 0,age,fare
min,0.42,0.0
max,80.0,512.3292


## quantile()

In [47]:
# 10th percentile of age (boundary of the youngest 10%).
df['age'].quantile(0.1)

np.float64(14.0)

In [48]:
# 80th percentile of age (boundary of the oldest 20%).
df['age'].quantile(0.8)

np.float64(41.0)

In [49]:
# Only the last expression of a cell is displayed, so the median() result on
# the next line is computed but not shown.
df['age'].median()
df['age'].quantile(0.5) # the 0.5 quantile gives the same value as median()

np.float64(28.0)

## unique()

In [50]:
# Distinct values that occur in the 'who' column, returned as an array.
# unique is a method: it must be called with parentheses, otherwise the cell
# only displays the bound method object.
df['who'].unique()

array(['man', 'woman', 'child'], dtype=object)

In [51]:
# Number of distinct values in the 'who' column.
df['who'].nunique()

3

## mode() 최빈값

In [52]:
# Most frequent value of 'who'; mode() returns a Series rather than a scalar.
df['who'].mode()

0    man
Name: who, dtype: object

In [53]:
# Most frequent deck; the result keeps the column's category dtype.
df['deck'].mode()

0    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

## corr() 상관계수
- 각 컬럼끼리의 상관관계를 -1 ~ 1까지 표현

In [55]:
# Pairwise correlation coefficients between the numeric/boolean columns.
df.corr(numeric_only=True)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,-0.55708,-0.203367
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,0.094035,0.135207
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,0.280328,0.19827
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.584471
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,-0.349943,-0.583398
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,-0.182024,-0.271832
adult_male,-0.55708,0.094035,0.280328,-0.253586,-0.349943,-0.182024,1.0,0.404744
alone,-0.203367,0.135207,0.19827,-0.584471,-0.583398,-0.271832,0.404744,1.0


In [56]:
# Correlation of every numeric/boolean column with 'survived'.
df.corr(numeric_only=True)['survived']

survived      1.000000
pclass       -0.338481
age          -0.077221
sibsp        -0.035322
parch         0.081629
fare          0.257307
adult_male   -0.557080
alone        -0.203367
Name: survived, dtype: float64