# Pandas
- 객체 생성:
  - Series
  - DataFrame

- 값선택, 메서드
  - 숫자 데이터

- 인덱스
  - 단일 인덱스
  - 멀티(레벨) 인덱스

- 데이터 타입
  - 숫자
    - 정수, 실수
  - 문자열(String)
  - 카테고리
  - TimeSeries
- Missing Data
=> 데이터 가져오기 

- Reshape
  - DataFrame 합치기
    - concat
    - join
  - pivot, pivot_table
  - groupby

- visualization


In [1]:
import numpy as np
import pandas as pd

# Text Data Type
- pandas의 텍스트
  - object(Python str)
  - pandas.StringDtype

In [2]:
pd.StringDtype

pandas.core.arrays.string_.StringDtype

In [3]:
my_list = list("abc")
my_list

['a', 'b', 'c']

In [4]:
my_ss = pd.Series(my_list)
my_ss

0    a
1    b
2    c
dtype: object

In [5]:
my_ss = pd.Series(my_list, dtype="string")
my_ss

0    a
1    b
2    c
dtype: string

In [6]:
my_ss = pd.Series(my_list, dtype=pd.StringDtype())
my_ss

0    a
1    b
2    c
dtype: string

In [7]:
my_ss = pd.Series(my_list)
my_ss

0    a
1    b
2    c
dtype: object

In [8]:
my_ss.astype("string")

0    a
1    b
2    c
dtype: string

In [9]:
my_ss.astype(pd.StringDtype())

0    a
1    b
2    c
dtype: string

In [14]:
my_list1 = [1, "hello", np.nan]

In [15]:
my_ss1 = pd.Series(my_list1)
my_ss1

0        1
1    hello
2      NaN
dtype: object

In [17]:
my_ss1.iloc[0]

1

In [19]:
my_ss1.astype("string")

0        1
1    hello
2     <NA>
dtype: string

In [18]:
my_ss1.astype("string").iloc[0]

'1'

In [20]:
my_ss1.astype("string").iloc[2]

<NA>

In [21]:
my_list2 = ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", 'dog', 'cat']
my_ss2 = pd.Series(my_list2, dtype="string")
my_ss2

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [22]:
my_ss2.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [23]:
"Python".lower()

'python'

In [24]:
my_ss2.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5    <NA>
6    CABA
7     DOG
8     CAT
dtype: string

In [25]:
my_ss2.str.len()

0       1
1       1
2       1
3       4
4       4
5    <NA>
6       4
7       3
8       3
dtype: Int64

In [39]:
my_list5 = ['  jack', '  jill  ', '  jesse', ' frank']
my_index4 = pd.Index(my_list5)
my_index4

Index(['  jack', '  jill  ', '  jesse', ' frank'], dtype='object')

In [27]:
my_index4.str.upper()

Index(['JACK', 'JILL', 'JESSE', 'FRANK'], dtype='object')

In [33]:
my_ss3 = pd.Series(["Cat", "dog", "Lion", "Whale"], index=my_index4)
my_ss3

  jack        Cat
  jill        dog
  jesse      Lion
 frank      Whale
dtype: object

In [35]:
my_ss3.index

Index(['  jack', '  jill  ', '  jesse', ' frank'], dtype='object')

In [37]:
my_ss3.index = my_ss3.index.str.strip()
my_ss3.index

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')

In [38]:
my_ss3['jack']

'Cat'

In [31]:
my_ss3.index.str.len()

Int64Index([4, 4, 5, 5], dtype='int64')

In [40]:
new_index = []
for idx in my_ss3.index:
  new_index.append(idx.strip())
new_index

['jack', 'jill', 'jesse', 'frank']

In [45]:
my_df = pd.DataFrame(np.random.randn(100, 2), columns=[" Column A ", "  Column B "])
my_df

Unnamed: 0,Column A,Column B
0,-0.494070,0.083703
1,1.082358,-1.563586
2,0.174808,-0.125572
3,-0.043427,-1.050630
4,-0.598147,-1.010725
...,...,...
95,-0.672219,-1.453323
96,-0.439945,-0.142504
97,-0.105833,2.261384
98,0.045385,0.171494


In [42]:
my_df[" Column A "]

0    -0.855963
1     2.359680
2    -1.543895
3    -1.063467
4    -2.390832
        ...   
95    0.676945
96    0.785507
97   -0.074928
98    0.697894
99   -0.288202
Name:  Column A , Length: 100, dtype: float64

In [46]:
my_df.columns = my_df.columns.str.strip().str.replace(" ", "_").str.lower()
my_df

Unnamed: 0,column_a,column_b
0,-0.494070,0.083703
1,1.082358,-1.563586
2,0.174808,-0.125572
3,-0.043427,-1.050630
4,-0.598147,-1.010725
...,...,...
95,-0.672219,-1.453323
96,-0.439945,-0.142504
97,-0.105833,2.261384
98,0.045385,0.171494


In [47]:
my_df.column_a

0    -0.494070
1     1.082358
2     0.174808
3    -0.043427
4    -0.598147
        ...   
95   -0.672219
96   -0.439945
97   -0.105833
98    0.045385
99    0.442224
Name: column_a, Length: 100, dtype: float64

In [59]:
my_ss = pd.Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype="string")
my_ss

0    a_b_c
1    c_d_e
2     <NA>
3    f_g_h
dtype: string

In [60]:
my_ss.str.split("_")

0    [a, b, c]
1    [c, d, e]
2         <NA>
3    [f, g, h]
dtype: object

In [61]:
my_ss.str.split("_").str[1]

0       b
1       d
2    <NA>
3       g
dtype: object

In [62]:
my_ss.str.split("_").str.get(1)

0       b
1       d
2    <NA>
3       g
dtype: object

In [63]:
my_ss.str.split("_", expand=True)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [64]:
"a,b,c,d".split(",")

['a', 'b', 'c', 'd']

In [66]:
my_ss.str.split("_", expand=True, n=2)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [67]:
my_ss.str.rsplit("_", expand=True, n=1)

Unnamed: 0,0,1
0,a_b,c
1,c_d,e
2,,
3,f_g,h


In [69]:
my_ss2

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [74]:
my_ss2.str.replace("A", "aa")

0        aa
1         B
2         C
3     aaaba
4      Baca
5      <NA>
6    CaaBaa
7       dog
8       cat
dtype: string

In [75]:
my_ss2

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [76]:
my_ss2.str.replace("^A", "X")

0       X
1       B
2       C
3    Xaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [77]:
my_ss4 = pd.Series(['a', 'b', 'c', 'd'], dtype='string')
my_ss4

0    a
1    b
2    c
3    d
dtype: string

In [78]:
my_ss4.str.cat()

'abcd'

In [79]:
my_ss4.str.cat(sep=',')

'a,b,c,d'

In [81]:
my_ss4.str.cat(sep="_")

'a_b_c_d'

In [82]:
my_ss5 = pd.Series(["a", "b", np.nan, "d"], dtype="string")
my_ss5

0       a
1       b
2    <NA>
3       d
dtype: string

In [83]:
my_ss5.str.cat()

'abd'

In [84]:
my_ss5.str.cat(na_rep="c")

'abcd'

In [85]:
my_ss5.str.cat(sep=",", na_rep="-")

'a,b,-,d'

In [87]:
my_ss4.str.upper().values

<StringArray>
['A', 'B', 'C', 'D']
Length: 4, dtype: string

In [89]:
my_ss4.str.cat(my_ss4.str.upper().values)

0    aA
1    bB
2    cC
3    dD
dtype: string

In [92]:
my_ss4.str.cat(my_ss5.values, na_rep="*")

0    aa
1    bb
2    c*
3    dd
dtype: string

In [95]:
my_df = pd.DataFrame({"a": my_ss4.values, "b": my_ss5.values}, dtype="string")
my_df

Unnamed: 0,a,b
0,a,a
1,b,b
2,c,
3,d,d


In [98]:
my_ss4.str.cat(my_df)

0     aaa
1     bbb
2    <NA>
3     ddd
dtype: string

In [101]:
my_ss2

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [102]:
"Python"[3]

'h'

In [105]:
"Python"[10]

IndexError: ignored

In [103]:
my_ss2.str[0]

0       A
1       B
2       C
3       A
4       B
5    <NA>
6       C
7       d
8       c
dtype: string

In [104]:
my_ss2.str[1]

0    <NA>
1    <NA>
2    <NA>
3       a
4       a
5    <NA>
6       A
7       o
8       a
dtype: string

In [106]:
my_ss2.str.get(3) # na_rep

0    <NA>
1    <NA>
2    <NA>
3       a
4       a
5    <NA>
6       A
7    <NA>
8    <NA>
dtype: string

In [109]:
my_ss2[my_ss2.str.startswith("A")]

0       A
3    Aaba
dtype: string

### 문자열
- 컬럼/인덱스
- split: 여러 열 -> 하나의 열
- cat: 
  - 다른 컬럼의 데이터로 구별
  - PASS_, final_kor, mid_kor, 
- replace