In [2]:
import numpy as np
import pandas as pd

In [1]:
# Nested mapping used to build the demo DataFrame: the outer keys become the
# column labels and the inner keys ('a'-'d') become the row labels.
my_dict = dict(
    kor={"a": 10, "b": 30, "c": 50, "d": 70},
    eng={"a": 20, "b": 40, "c": 60, "d": 80},
    math={"a": 10, "b": 20, "c": 10, "d": 20},
)
my_dict

{'eng': {'a': 20, 'b': 40, 'c': 60, 'd': 80},
 'kor': {'a': 10, 'b': 30, 'c': 50, 'd': 70},
 'math': {'a': 10, 'b': 20, 'c': 10, 'd': 20}}

In [3]:
my_df = pd.DataFrame(my_dict)
my_df

Unnamed: 0,kor,eng,math
a,10,20,10
b,30,40,20
c,50,60,10
d,70,80,20


## Summarization Data
- count
- value_counts
- unique
- nunique

- sum
- cumsum

- mean
- median
- std
- var
- skew
- kurt

- quantile
- describe


In [5]:
# Build a variant of my_df with missing values in rows 'a'..'b' of `kor`.
# `kor` is cast to float64 first: writing NaN into an int64 column relies on
# silent upcasting, which pandas deprecated in 2.1 (it emits a FutureWarning).
# astype() returns a new frame, so my_df itself is left untouched.
my_df2 = my_df.astype({"kor": "float64"})
my_df2.loc["a":"b", "kor"] = np.nan  # label slices include both endpoints
my_df2

Unnamed: 0,kor,eng,math
a,,20,10
b,,40,20
c,50.0,60,10
d,70.0,80,20


In [6]:
my_df.count()

kor     4
eng     4
math    4
dtype: int64

In [7]:
my_df2.count()

kor     2
eng     4
math    4
dtype: int64

In [8]:
my_df2.shape

(4, 3)

In [11]:
my_df2[my_df2.kor.isna()]

Unnamed: 0,kor,eng,math
a,,20,10
b,,40,20


In [13]:
my_nan = ["", "NA", "0", 0, -1, np.nan]
my_nan

['', 'NA', '0', 0, -1, nan]

In [14]:
my_df.value_counts()

kor  eng  math
70   80   20      1
50   60   10      1
30   40   20      1
10   20   10      1
dtype: int64

In [15]:
my_df.kor.value_counts()

70    1
30    1
10    1
50    1
Name: kor, dtype: int64

In [17]:
my_df.math.unique()

array([10, 20])

In [18]:
my_df.math.nunique()

2

In [19]:
my_df.sum()

kor     160
eng     200
math     60
dtype: int64

### index vs columns
- index (0): 
  - 값을 계산: 행 추가
  - 값을 할당: 행별
- columns (1):
  - 값을 계산: 열 추가
  - 값을 할당: 열별
- 2 이상  

In [23]:
my_df

Unnamed: 0,kor,eng,math
a,10,20,10
b,30,40,20
c,50,60,10
d,70,80,20


In [20]:
my_df.sum(axis='index')

kor     160
eng     200
math     60
dtype: int64

In [21]:
my_df.sum(axis=0)

kor     160
eng     200
math     60
dtype: int64

In [22]:
my_df.sum(axis='columns')

a     40
b     90
c    120
d    170
dtype: int64

In [24]:
my_df.cumsum()

Unnamed: 0,kor,eng,math
a,10,20,10
b,40,60,30
c,90,120,40
d,160,200,60


In [25]:
my_df.mean()

kor     40.0
eng     50.0
math    15.0
dtype: float64

In [26]:
my_df.median()

kor     40.0
eng     50.0
math    15.0
dtype: float64

In [27]:
my_df.std()

kor     25.819889
eng     25.819889
math     5.773503
dtype: float64

In [28]:
my_df.quantile([.25, .5, 0.75])

Unnamed: 0,kor,eng,math
0.25,25.0,35.0,10.0
0.5,40.0,50.0,15.0
0.75,55.0,65.0,20.0


In [29]:
my_df2['Result'] = ['통과', '탈락', '통과', '탈락']
my_df2

Unnamed: 0,kor,eng,math,Result
a,,20,10,통과
b,,40,20,탈락
c,50.0,60,10,통과
d,70.0,80,20,탈락


In [30]:
my_df2.describe()

Unnamed: 0,kor,eng,math
count,2.0,4.0,4.0
mean,60.0,50.0,15.0
std,14.142136,25.819889,5.773503
min,50.0,20.0,10.0
25%,55.0,35.0,10.0
50%,60.0,50.0,15.0
75%,65.0,65.0,20.0
max,70.0,80.0,20.0


In [33]:
my_df2.describe(include=['O']) # 'O': object dtype, i.e. generic Python objects (here the str-valued Result column)

Unnamed: 0,Result
count,4
unique,2
top,통과
freq,2


## Data Reshape
- assign
- drop_duplicates
- drop
- dropna

In [34]:
my_df.shape

(4, 3)

In [37]:
my_df2.math.shape

(4,)

In [36]:
my_df2.math.drop_duplicates()

a    10
b    20
Name: math, dtype: int64

In [38]:
my_df2.math.drop_duplicates().shape

(2,)

In [39]:
my_df2.drop('a')

Unnamed: 0,kor,eng,math,Result
b,,40,20,탈락
c,50.0,60,10,통과
d,70.0,80,20,탈락


In [40]:
my_df2.drop(['a', 'b'])

Unnamed: 0,kor,eng,math,Result
c,50.0,60,10,통과
d,70.0,80,20,탈락


In [41]:
my_df2.drop('a', axis='index')

Unnamed: 0,kor,eng,math,Result
b,,40,20,탈락
c,50.0,60,10,통과
d,70.0,80,20,탈락


In [42]:
my_df2.drop(['a', 'b'], axis='index')

Unnamed: 0,kor,eng,math,Result
c,50.0,60,10,통과
d,70.0,80,20,탈락


In [46]:
my_df2.shape

(4, 4)

In [45]:
my_df2.drop(['a', 'b'], axis='index').shape

(2, 4)

In [44]:
my_df2.drop('Result', axis='columns')

Unnamed: 0,kor,eng,math
a,,20,10
b,,40,20
c,50.0,60,10
d,70.0,80,20


In [47]:
my_df2.drop('Result', axis='columns').shape

(4, 3)

In [48]:
my_df2.drop(index=['a', 'b'])

Unnamed: 0,kor,eng,math,Result
c,50.0,60,10,통과
d,70.0,80,20,탈락


In [49]:
my_df2.drop(columns=['Result'])

Unnamed: 0,kor,eng,math
a,,20,10
b,,40,20
c,50.0,60,10
d,70.0,80,20


In [50]:
my_df2.dropna()

Unnamed: 0,kor,eng,math,Result
c,50.0,60,10,통과
d,70.0,80,20,탈락


In [51]:
my_df2.dropna(axis='index')

Unnamed: 0,kor,eng,math,Result
c,50.0,60,10,통과
d,70.0,80,20,탈락


In [53]:
my_df2.dropna(axis='columns')

Unnamed: 0,eng,math,Result
a,20,10,통과
b,40,20,탈락
c,60,10,통과
d,80,20,탈락


In [55]:
my_df2.assign(kor_final=lambda m:  2*m.kor)

Unnamed: 0,kor,eng,math,Result,kor_final
a,,20,10,통과,
b,,40,20,탈락,
c,50.0,60,10,통과,100.0
d,70.0,80,20,탈락,140.0


In [56]:
my_df2['kor_mid'] = my_df2.kor / 2
my_df2

Unnamed: 0,kor,eng,math,Result,kor_mid
a,,20,10,통과,
b,,40,20,탈락,
c,50.0,60,10,통과,25.0
d,70.0,80,20,탈락,35.0


## Index Manipulation
- values
- index
- rename
- reindex

In [70]:
# Derive a frame that differs from my_df in every respect: Korean column
# labels, row labels repeated twice ('a' -> 'aa'), and every value doubled.
# rename() returns a new object, so my_df is not modified.
my_df2 = my_df.rename(
    columns={"kor": "국어", "eng": "영어", "math": "수학"},
    index=lambda label: label * 2,
).mul(2)

In [63]:
my_df.values[:, :]

array([[10, 20, 10],
       [30, 40, 20],
       [50, 60, 10],
       [70, 80, 20]])

In [61]:
my_df # has no "국어" column and no "aa" row label; those exist only in my_df2

Unnamed: 0,kor,eng,math
a,10,20,10
b,30,40,20
c,50,60,10
d,70,80,20


In [71]:
my_df2

Unnamed: 0,국어,영어,수학
aa,20,40,20
bb,60,80,40
cc,100,120,20
dd,140,160,40


In [69]:
my_df.values[2:4, 0:2]

array([[50, 60],
       [70, 80]])

In [72]:
my_df2.iloc[2:4, 0:2] = my_df.values[2:4, 0:2]
my_df2

Unnamed: 0,국어,영어,수학
aa,20,40,20
bb,60,80,40
cc,50,60,20
dd,70,80,40


In [73]:
my_df.values

array([[10, 20, 10],
       [30, 40, 20],
       [50, 60, 10],
       [70, 80, 20]])

In [74]:
my_df2

Unnamed: 0,국어,영어,수학
aa,20,40,20
bb,60,80,40
cc,50,60,20
dd,70,80,40


In [75]:
my_df2.index = ['a', 'b', 'c', 'd']
my_df2

Unnamed: 0,국어,영어,수학
a,20,40,20
b,60,80,40
c,50,60,20
d,70,80,40


In [76]:
my_df2.columns = ['kor_final', 'eng_final', 'math_final']
my_df2

Unnamed: 0,kor_final,eng_final,math_final
a,20,40,20
b,60,80,40
c,50,60,20
d,70,80,40


In [77]:
my_df2.rename({'a': 'a2'})

Unnamed: 0,kor_final,eng_final,math_final
a2,20,40,20
b,60,80,40
c,50,60,20
d,70,80,40


In [78]:
my_df2.rename({'a': 'a2'}, axis='index')

Unnamed: 0,kor_final,eng_final,math_final
a2,20,40,20
b,60,80,40
c,50,60,20
d,70,80,40


In [79]:
my_df2.rename(index={'a': 'a2'})

Unnamed: 0,kor_final,eng_final,math_final
a2,20,40,20
b,60,80,40
c,50,60,20
d,70,80,40


In [80]:
my_df2.rename({"kor_final": "kor_mid"}, axis='columns')

Unnamed: 0,kor_mid,eng_final,math_final
a,20,40,20
b,60,80,40
c,50,60,20
d,70,80,40


In [81]:
my_df2.rename(columns={"kor_final": "kor_mid"})

Unnamed: 0,kor_mid,eng_final,math_final
a,20,40,20
b,60,80,40
c,50,60,20
d,70,80,40


In [82]:
# Deliberate failure demo: my_df2 has four rows, so a two-label index cannot
# be assigned. Catch the ValueError and print its message so that
# "Restart & Run All" continues past this cell; my_df2 is left unchanged.
try:
    my_df2.index = ["a", "b"]
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

In [85]:
my_df2.reindex(["b", "a", "aa"]) # index + reshape

Unnamed: 0,kor_final,eng_final,math_final
b,60.0,80.0,40.0
a,20.0,40.0,20.0
aa,,,


## Sorting
- sort_index
  - ascending
  - na_position
  - kind

- sort_values
  - ascending
  - na_position
  - kind


In [86]:
my_df2

Unnamed: 0,kor_final,eng_final,math_final
a,20,40,20
b,60,80,40
c,50,60,20
d,70,80,40


In [87]:
# Plant missing values in both the row labels and the kor_final column so the
# na_position examples that follow have something to reorder.
my_df2.index = [np.nan, "b", "c", "d"]
my_df2["kor_final"] = [np.nan, np.nan, 50, 70]
my_df2

Unnamed: 0,kor_final,eng_final,math_final
,,40,20
b,,80,40
c,50.0,60,20
d,70.0,80,40


In [88]:
my_df.sort_index()

Unnamed: 0,kor,eng,math
a,10,20,10
b,30,40,20
c,50,60,10
d,70,80,20


In [90]:
my_df.sort_index(ascending=True)

Unnamed: 0,kor,eng,math
a,10,20,10
b,30,40,20
c,50,60,10
d,70,80,20


In [91]:
my_df.sort_index(ascending=False)

Unnamed: 0,kor,eng,math
d,70,80,20
c,50,60,10
b,30,40,20
a,10,20,10


In [93]:
my_df2

Unnamed: 0,kor_final,eng_final,math_final
,,40,20
b,,80,40
c,50.0,60,20
d,70.0,80,40


In [92]:
my_df2.sort_index()

Unnamed: 0,kor_final,eng_final,math_final
b,,80,40
c,50.0,60,20
d,70.0,80,40
,,40,20


In [94]:
my_df2.sort_index(na_position='last')

Unnamed: 0,kor_final,eng_final,math_final
b,,80,40
c,50.0,60,20
d,70.0,80,40
,,40,20


In [95]:
my_df2.sort_index(na_position='first')

Unnamed: 0,kor_final,eng_final,math_final
,,40,20
b,,80,40
c,50.0,60,20
d,70.0,80,40


In [99]:
my_df2.sort_index(kind='quicksort')

Unnamed: 0,kor_final,eng_final,math_final
b,,80,40
c,50.0,60,20
d,70.0,80,40
,,40,20


In [100]:
my_df2.sort_index(kind='mergesort')

Unnamed: 0,kor_final,eng_final,math_final
b,,80,40
c,50.0,60,20
d,70.0,80,40
,,40,20


In [104]:
my_df.sort_values('kor')

Unnamed: 0,kor,eng,math
a,10,20,10
b,30,40,20
c,50,60,10
d,70,80,20


In [105]:
my_df2.sort_values('kor_final')

Unnamed: 0,kor_final,eng_final,math_final
c,50.0,60,20
d,70.0,80,40
,,40,20
b,,80,40


In [106]:
my_df2.sort_values('kor_final', na_position='first')

Unnamed: 0,kor_final,eng_final,math_final
,,40,20
b,,80,40
c,50.0,60,20
d,70.0,80,40


In [108]:
my_df2.sort_values('kor_final', kind='quicksort')

Unnamed: 0,kor_final,eng_final,math_final
c,50.0,60,20
d,70.0,80,40
,,40,20
b,,80,40


In [112]:
my_df2.sort_values('kor_final', na_position='first')

Unnamed: 0,kor_final,eng_final,math_final
,,40,20
b,,80,40
c,50.0,60,20
d,70.0,80,40


## Observation

In [113]:
my_df.rank()

Unnamed: 0,kor,eng,math
a,1.0,1.0,1.5
b,2.0,2.0,3.5
c,3.0,3.0,1.5
d,4.0,4.0,3.5


In [117]:
my_df.rank?

In [118]:
my_df.rank(method='average')

Unnamed: 0,kor,eng,math
a,1.0,1.0,1.5
b,2.0,2.0,3.5
c,3.0,3.0,1.5
d,4.0,4.0,3.5


In [121]:
my_df.rank(method='min')

Unnamed: 0,kor,eng,math
a,1.0,1.0,1.0
b,2.0,2.0,3.0
c,3.0,3.0,1.0
d,4.0,4.0,3.0
