# DataFrame 연습

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame 
# pandas는 Series와 DataFrame이라는 두 가지 핵심 자료구조(클래스)를 제공한다.

In [3]:
# Sorting: how to sort by index.
# A Series pairs an index with the values mapped to each index label.
# Constructor shape: Series(values, index=labels)
labels = ['d', 'a', 'b', 'c']
obj = Series(data=range(len(labels)), index=labels)
obj

d    0
a    1
b    2
c    3
dtype: int64

In [4]:
obj.sort_index() 

a    1
b    2
c    3
d    0
dtype: int64

In [5]:
obj ## 위에서 sort를 해줬으나 obj의 결과는 변하지 않는다.

d    0
a    1
b    2
c    3
dtype: int64

In [6]:
# 2x4 frame holding 0..7 row by row; both the row labels and the column
# labels are deliberately out of alphabetical order for the sorting demos.
frame = DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [7]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [8]:
frame.sort_index(axis=1)
# 열을 기준으로 정렬

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [9]:
frame.sort_index(axis=1, ascending=False)
# 내림차순으로 정렬

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [10]:
# Three integer columns (inserted in the order b, a, c) on the default
# 0..3 row index.
frame = DataFrame(
    data={
        'b': [4, 7, 3, 2],
        'a': [4, 9, 2, 5],
        'c': [5, 3, 7, 9],
    },
)
frame

Unnamed: 0,b,a,c
0,4,4,5
1,7,9,3
2,3,2,7
3,2,5,9


In [11]:
frame.sort_values(by='b')

Unnamed: 0,b,a,c
3,2,5,9
2,3,2,7
0,4,4,5
1,7,9,3


In [12]:
frame.sort_values(by='a')

Unnamed: 0,b,a,c
2,3,2,7
0,4,4,5
3,2,5,9
1,7,9,3


In [13]:
# Ranking with rank(): by default ranks ascend, so the smallest value
# gets rank 1.0.
obj = Series(data=[100, 23, 55, 44, 22, 99, 33])
obj.rank()

0    7.0
1    2.0
2    5.0
3    4.0
4    1.0
5    6.0
6    3.0
dtype: float64

In [14]:
# 내림차순으로 순위를 매긴다
obj.rank(ascending=False)

0    1.0
1    6.0
2    3.0
3    4.0
4    7.0
5    2.0
6    5.0
dtype: float64

In [15]:
# Tied values share the average of the ranks they would otherwise occupy.
obj = Series(data=[100, 22, 100, 44, 22, 99, 33])
obj.rank()

0    6.5
1    1.5
2    6.5
3    4.0
4    1.5
5    5.0
6    3.0
dtype: float64

In [16]:
# method='first' breaks ties by order of appearance: among equal values, the
# one that occurs earlier in the data gets the smaller rank number.
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    2.0
5    5.0
6    3.0
dtype: float64

In [17]:
# Rebuild the three-column integer frame (b, a, c) for the rank demos.
frame = DataFrame(
    data={
        'b': [4, 7, 3, 2],
        'a': [4, 9, 2, 5],
        'c': [5, 3, 7, 9],
    },
)
frame

Unnamed: 0,b,a,c
0,4,4,5
1,7,9,3
2,3,2,7
3,2,5,9


In [18]:
frame.rank(axis=1) #행 기준으로 rank를 수행

Unnamed: 0,b,a,c
0,1.5,1.5,3.0
1,2.0,3.0,1.0
2,2.0,1.0,3.0
3,1.0,2.0,3.0


In [19]:
# Column c carries one NaN; reductions such as sum() and mean() leave NaN
# entries out of the calculation by default.
frame = DataFrame(
    data={
        'b': [4, 7, 3, 2],
        'a': [4, 9, 2, 5],
        'c': [5, 3, 7, np.nan],
    },
)
frame

Unnamed: 0,b,a,c
0,4,4,5.0
1,7,9,3.0
2,3,2,7.0
3,2,5,


In [20]:
frame.sum()

b    16.0
a    20.0
c    15.0
dtype: float64

In [21]:
frame.mean()

b    4.0
a    5.0
c    5.0
dtype: float64

In [22]:
# skipna stands for "skip NaN". With skipna=False missing values are not
# skipped, so any column that contains a NaN sums to NaN.
frame.sum(skipna=False)

b    16.0
a    20.0
c     NaN
dtype: float64

In [23]:
frame

Unnamed: 0,b,a,c
0,4,4,5.0
1,7,9,3.0
2,3,2,7.0
3,2,5,


In [24]:
# For each column, return the index label of the row holding the maximum
# value (the NaN in column c does not win: its result is label 2).
frame.idxmax()

b    1
a    1
c    2
dtype: int64

In [25]:
# For each column, return the index label of the row holding the minimum
# value.
frame.idxmin()

b    3
a    2
c    1
dtype: int64

In [26]:
# Counting items.

# Finding the distinct values (similar to building a set).
obj = Series(data=list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [27]:
uniques = obj.unique() ## value 값의 종류를 확인 할 수 있다.
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [28]:
# 빈도수를 간단히 알 수 있다
# 빈도수가 높은 순으로 정렬된다.
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [29]:
# sort=False turns off ordering by frequency. The resulting order is not
# guaranteed to be the order of first appearance (see the output below) and
# may differ between pandas versions.
obj.value_counts(sort=False)

d    1
a    3
c    3
b    2
dtype: int64

In [30]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [31]:
# 특정한 내용이 들어있는지 알려면 isin()을 사용한다
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [32]:
obj[mask] ## obj에서 mask 값이 true인 값만을 출력

0    c
5    b
6    b
7    c
8    c
dtype: object

In [33]:
# 아래는 같은 결과를 얻는다
obj[obj.isin(['b', 'c'])] 

0    c
5    b
6    b
7    c
8    c
dtype: object

In [34]:
# Three columns of single-letter labels, nine rows each, used to count how
# often every letter occurs per column.
frame = DataFrame(
    data={
        'X': list('cadaabbcc'),
        'Y': list('fgdghedhf'),
        'Z': list('aedgdeqbc'),
    },
)

In [35]:
frame

Unnamed: 0,X,Y,Z
0,c,f,a
1,a,g,e
2,d,d,d
3,a,g,g
4,a,h,d
5,b,e,e
6,b,d,q
7,c,h,b
8,c,f,c


In [36]:
frame.apply(lambda s: s.value_counts())    # 아래와 같다

Unnamed: 0,X,Y,Z
a,3.0,,1.0
b,2.0,,1.0
c,3.0,,1.0
d,1.0,2.0,2.0
e,,1.0,2.0
f,,2.0,
g,,2.0,1.0
h,,2.0,
q,,,1.0


In [37]:
# Count how many times each value occurs in every column. Values that never
# occur in a column show up as NaN in that column.
# pd.Series.value_counts is used instead of the top-level pd.value_counts,
# which is deprecated in recent pandas releases.
result = frame.apply(pd.Series.value_counts)
result

Unnamed: 0,X,Y,Z
a,3.0,,1.0
b,2.0,,1.0
c,3.0,,1.0
d,1.0,2.0,2.0
e,,1.0,2.0
f,,2.0,
g,,2.0,1.0
h,,2.0,
q,,,1.0


In [38]:
# Replace the missing counts with 0 (fillna stands for "fill NaN").
# pd.Series.value_counts is used instead of the top-level pd.value_counts,
# which is deprecated in recent pandas releases.
result = frame.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,X,Y,Z
a,3.0,0.0,1.0
b,2.0,0.0,1.0
c,3.0,0.0,1.0
d,1.0,2.0,2.0
e,0.0,1.0,2.0
f,0.0,2.0,0.0
g,0.0,2.0,1.0
h,0.0,2.0,0.0
q,0.0,0.0,1.0


In [39]:
result.head()

Unnamed: 0,X,Y,Z
a,3.0,0.0,1.0
b,2.0,0.0,1.0
c,3.0,0.0,1.0
d,1.0,2.0,2.0
e,0.0,1.0,2.0


In [40]:
# Handling missing values.
from numpy import nan as NA

# dropna ("drop NaN") yields the Series without its NaN entries.
data = Series(data=[1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [41]:
# 같은 결과
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [42]:
# 4x3 frame: column 0 is entirely NA, rows 1 and 2 are entirely NA, and
# rows 0 and 3 hold 6.5 / 3.0 in columns 1 and 2.
data = DataFrame(
    data=[
        [NA, 6.5, 3.0],
        [NA] * 3,
        [NA] * 3,
        [NA, 6.5, 3.0],
    ],
)
data

Unnamed: 0,0,1,2
0,,6.5,3.0
1,,,
2,,,
3,,6.5,3.0


In [43]:
# 한 항목이라도 NA가 있으면 해당 행을 삭제한다
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2


In [44]:
# 행의 모든 항목이 NA일때 해당 행을 삭제한다
data.dropna(how='all')

Unnamed: 0,0,1,2
0,,6.5,3.0
3,,6.5,3.0


In [45]:
data

Unnamed: 0,0,1,2
0,,6.5,3.0
1,,,
2,,,
3,,6.5,3.0


In [46]:
# 컬럼에 대한 삭제시는 axis=1을 사용한다
clean2 = data.dropna(axis=1)
clean2

0
1
2
3


In [47]:
clean2 = data.dropna(axis=1, how='all')
clean2

Unnamed: 0,1,2
0,6.5,3.0
1,,
2,,
3,6.5,3.0


In [48]:
# 7x3 frame of random draws from np.random.randn with NA holes punched in:
# column 0 -> row 0, column 2 -> rows 0-1, column 1 -> rows 0-3.
df = DataFrame(data=np.random.randn(7, 3))
df.iloc[0, 0] = NA
df.iloc[:2, 2] = NA
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,,,
1,0.236243,,
2,-0.364852,,1.212017
3,-0.156226,,-0.391007
4,-0.271375,-0.051075,1.838255
5,-0.394614,0.075961,-0.000364
6,-1.729751,0.744601,-0.232975


In [49]:
# thresh=2 keeps only the rows that have at least 2 non-NA values. With the
# 3 columns here that is the same as dropping rows with 2 or more NA values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.364852,,1.212017
3,-0.156226,,-0.391007
4,-0.271375,-0.051075,1.838255
5,-0.394614,0.075961,-0.000364
6,-1.729751,0.744601,-0.232975


In [50]:
df

Unnamed: 0,0,1,2
0,,,
1,0.236243,,
2,-0.364852,,1.212017
3,-0.156226,,-0.391007
4,-0.271375,-0.051075,1.838255
5,-0.394614,0.075961,-0.000364
6,-1.729751,0.744601,-0.232975


In [51]:
# 컬럼별로 다른 값을 채울 수 있다. 사전을 사용한다
df.fillna({1: 0.5, 2: -1}) 

Unnamed: 0,0,1,2
0,,0.5,-1.0
1,0.236243,0.5,-1.0
2,-0.364852,0.5,1.212017
3,-0.156226,0.5,-0.391007
4,-0.271375,-0.051075,1.838255
5,-0.394614,0.075961,-0.000364
6,-1.729751,0.744601,-0.232975


In [52]:
df # fillna()로 내용은 바뀌지 않는다. 

Unnamed: 0,0,1,2
0,,,
1,0.236243,,
2,-0.364852,,1.212017
3,-0.156226,,-0.391007
4,-0.271375,-0.051075,1.838255
5,-0.394614,0.075961,-0.000364
6,-1.729751,0.744601,-0.232975


In [53]:
# Assigning the result to a new variable keeps the filled values.
# NOTE(review): df has columns 0, 1 and 2, so the key 3 matches no column and
# column 2 keeps its NaN values (see the output below); the key was probably
# meant to be 2, as in the earlier fillna call — confirm.
df2 = df.fillna({1: 0.5, 3: -1})
df2

Unnamed: 0,0,1,2
0,,0.5,
1,0.236243,0.5,
2,-0.364852,0.5,1.212017
3,-0.156226,0.5,-0.391007
4,-0.271375,-0.051075,1.838255
5,-0.394614,0.075961,-0.000364
6,-1.729751,0.744601,-0.232975


In [2]:
import numpy as np
import pandas as pd

# The same three values as a plain list and as a 1-D ndarray.
list1 = [1, 2, 3]
array1 = np.array(list1)
print('array1 shape: ', array1.shape)

# Wrap the list in a single-column DataFrame whose rows are labelled 1..3.
col_name1 = ['col1']
df_list1 = pd.DataFrame(data=list1, index=range(1, 4), columns=col_name1)
df_list1

array1 shape:  (3,)


Unnamed: 0,col1
1,1
2,2
3,3


In [3]:
df_array1 = pd.DataFrame(array1, columns=col_name1, index=range(1, 4, 1))
df_array1

Unnamed: 0,col1
1,1
2,2
3,3


In [4]:
a = [4, 5, 6]
df_array1['col_2'] = a

In [5]:
df_array1

Unnamed: 0,col1,col_2
1,1,4
2,2,5
3,3,6


In [6]:
df_array1['col_2']

1    4
2    5
3    6
Name: col_2, dtype: int64

In [9]:
series_1 = pd.Series([7, 8, 9], index=range(1, 4, 1))
df_array1['col_3'] = series_1

In [10]:
df_array1

Unnamed: 0,col1,col_2,col_3
1,1,4,7
2,2,5,8
3,3,6,9


In [13]:
# Eight (Id, Value) records; the ids B and F each appear twice.
df = pd.DataFrame(
    [
        ('A', 10), ('B', 20), ('B', 15), ('C', 5),
        ('D', 35), ('E', 20), ('F', 10), ('F', 25),
    ],
    columns=['Id', 'Value'],
)

In [16]:
df['Id'].value_counts()

C    1
D    1
B    2
A    1
F    2
E    1
Name: Id, dtype: int64

In [18]:
# print() writes the list to stdout; the trailing bare name makes the
# notebook echo it once more as the cell result.
arr = list(range(1, 4))
print(arr)
arr

[1, 2, 3]


[1, 2, 3]

In [22]:
df_array1.apply(lambda x: x.max() - x.min(), axis=1)

1    6
2    6
3    6
dtype: int64

In [24]:
# Print every cell of df_array1 in row-major order (left to right within a
# row, then on to the next row). The bare `df_array1.values.tolist()`
# expression that used to precede the loop was evaluated and discarded
# (only the last expression of a cell is displayed), so it has been removed.
for row in df_array1.values.tolist():
    for value in row:
        print(value)

1
4
7
2
5
8
3
6
9


1->2
3->

In [39]:
import pandas as pd
import numpy as np

# 4x4 float frame with missing values: column d is entirely NaN, row 3 is
# entirely NaN, and column b has a single real value (6, in row 2).
df = pd.DataFrame(
    {
        'a': [1, 3, 5, np.nan],
        'b': [np.nan, np.nan, 6, np.nan],
        'c': [2, 4, 7, np.nan],
        'd': [np.nan] * 4,
    },
)

In [40]:
df

Unnamed: 0,a,b,c,d
0,1.0,,2.0,
1,3.0,,4.0,
2,5.0,6.0,7.0,
3,,,,


In [42]:
# Draw and print five random integers. np.random.randint(1, 6) excludes the
# upper bound, so every draw lies between 1 and 5 inclusive.
for i in range(5):
    x = np.random.randint(1, 6)
    print(x)

2
1
3
4
5


In [43]:
df

Unnamed: 0,a,b,c,d
0,1.0,,2.0,
1,3.0,,4.0,
2,5.0,6.0,7.0,
3,,,,


In [44]:
df.dropna()

Unnamed: 0,a,b,c,d


In [47]:
df.fillna(0)

Unnamed: 0,a,b,c,d
0,1.0,0.0,2.0,0.0
1,3.0,0.0,4.0,0.0
2,5.0,6.0,7.0,0.0
3,0.0,0.0,0.0,0.0


In [50]:
# Per-column replacement values for NaN: a -> 0, b -> 1, c -> -9999, d -> 0.
new_data = dict(a=0, b=1, c=-9999, d=0)
print(df.fillna(value=new_data))

     a    b       c    d
0  1.0  1.0     2.0  0.0
1  3.0  1.0     4.0  0.0
2  5.0  6.0     7.0  0.0
3  0.0  1.0 -9999.0  0.0
