# Pandas (panel data system)

# Syntax

In [1]:
import pandas as pd

In [2]:
# Build a 3x3 frame from a column-name -> values mapping; rows are labelled 1..3.
df = pd.DataFrame(
    data={"a": [4, 5, 6], "b": [7, 8, 9], "c": [10, 11, 12]},
    index=[1, 2, 3],
)
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [3]:
# Selecting the "a" column (a single label returns a Series)
df["a"]

1    4
2    5
3    6
Name: a, dtype: int64

In [4]:
# Selecting the "a" and "b" columns (pass the labels wrapped in a list)
df[["a","b"]]

Unnamed: 0,a,b
1,4,7
2,5,8
3,6,9


In [5]:
# Selecting the row labelled 1
df.loc[1]

a     4
b     7
c    10
Name: 1, dtype: int64

In [6]:
# Selecting a single value: the row labelled 3, column "a"
df.loc[3,"a"]

6

In [7]:
# Selecting a specific sub-table: rows 1 and 2, columns "a" and "b"
df.loc[[1,2],["a","b"]] # several specific rows and columns at once

Unnamed: 0,a,b
1,4,7
2,5,8


In [8]:
# The first way to create a dataframe: a dict mapping each column name to its values.
# NOTE(review): column "a" ends in 9 here but in 6 in the neighbouring examples — confirm this is intended.
df = pd.DataFrame(
    data={"a": [4, 5, 9], "b": [7, 8, 9], "c": [10, 11, 12]},
    index=[1, 2, 3],
)
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,9,9,12


In [9]:
# The second way to create a dataframe: give the rows as a nested list and
# name the columns explicitly.
df = pd.DataFrame(
    data=[[4, 7, 10], [5, 8, 11], [6, 9, 12]],
    index=[1, 2, 3],
    columns=["a", "b", "c"],
)
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [10]:
# Creating a multiindex dataframe: every row label is an (n, v) tuple.
df = pd.DataFrame(
    data={"a": [4, 5, 6], "b": [7, 8, 9], "c": [10, 11, 12]},
    index=pd.MultiIndex.from_tuples(
        [("d", 1), ("d", 2), ("e", 2)],
        names=["n", "v"],
    ),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12


# Subset Observations (Rows)

In [11]:
# Filtering on a column that does not exist fails: df has no column named
# "Length", so the attribute lookup raises AttributeError. The error is the
# point of this cell, so it is caught and printed rather than left to abort a
# top-to-bottom run of the notebook.
try:
    df[df.Length > 7]
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'DataFrame' object has no attribute 'Length'

In [12]:
df[df.b > 7] # keep only the rows whose "b" value satisfies the condition

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5,8,11
e,2,6,9,12


In [13]:
# Two spellings of the same boolean mask (attribute access vs. bracket access).
# Only the last expression of a cell is displayed.
df.b > 7
df['b'] > 7

n  v
d  1    False
   2     True
e  2     True
Name: b, dtype: bool

In [14]:
df.drop_duplicates() # df has no duplicated rows, so the result is unchanged

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12


In [15]:
# Same data as the multiindex frame plus a fourth row whose column values
# repeat the third row, so there is something to de-duplicate.
df2 = pd.DataFrame(
    data={
        "a": [4, 5, 6, 6],
        "b": [7, 8, 9, 9],
        "c": [10, 11, 12, 12],
    },
    index=pd.MultiIndex.from_tuples(
        [("d", 1), ("d", 2), ("e", 2), ("e", 3)],
        names=["n", "v"],
    ),
)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [16]:
# Remove duplicated rows. Row (e, 3) repeats the column values of (e, 2) and is
# dropped even though its index label differs.
# Run `df2.drop_duplicates?` to read the help text.
df2.drop_duplicates()

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12


In [17]:
# frac: fraction of all rows to draw at random.
# No random_state is passed, so the rows drawn can differ between runs.
df.sample(frac=0.5)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11


In [18]:
# n: number of rows to draw at random.
# No random_state is passed, so the rows drawn can differ between runs.
df.sample(n=2)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,2,6,9,12
d,1,4,7,10


In [19]:
# Small frame with string row labels, used for the .loc / .iloc examples.
df = pd.DataFrame(
    data=[[1, 2], [4, 5], [7, 8]],
    index=["cobra", "viper", "sidewinder"],
    columns=["max_speed", "shield"],
)
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5
sidewinder,7,8


In [20]:
df.loc['viper'] # select the row labelled 'viper' (returned as a Series)

max_speed    4
shield       5
Name: viper, dtype: int64

In [25]:
df.iloc[0:1] # positional slice; the end position is excluded, so only row 0 is returned

Unnamed: 0,max_speed,shield
cobra,1,2


In [27]:
# IPython help syntax: a trailing "?" shows the docstring of DataFrame.nlargest.
df.nlargest?

In [28]:
# Country-level sample data indexed by country name. Several rows share the
# same population so that tie handling can be demonstrated.
df = pd.DataFrame(
    data={
        "population": [
            59000000, 65000000, 434000,
            434000, 434000, 337000,
            11300, 11300, 11300,
        ],
        "GDP": [
            1937894, 2583560, 12011,
            4520, 12128, 17036,
            182, 38, 311,
        ],
        "alpha-2": [
            "IT", "FR", "MT",
            "MV", "BN", "IS",
            "NR", "TV", "AI",
        ],
    },
    index=[
        "Italy", "France", "Malta",
        "Maldives", "Brunei", "Iceland",
        "Nauru", "Tuvalu", "Anguilla",
    ],
)
df

Unnamed: 0,population,GDP,alpha-2
Italy,59000000,1937894,IT
France,65000000,2583560,FR
Malta,434000,12011,MT
Maldives,434000,4520,MV
Brunei,434000,12128,BN
Iceland,337000,17036,IS
Nauru,11300,182,NR
Tuvalu,11300,38,TV
Anguilla,11300,311,AI


In [29]:
df.nlargest(3, 'population') # the 3 rows with the largest 'population', largest first

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Malta,434000,12011,MT


In [30]:
df.nlargest(3, 'population', keep='last') # keep='last': among tied values, the last occurrence is preferred

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Brunei,434000,12128,BN


In [31]:
df.nlargest(3, 'population', keep='all') # keep='all': every tied row is kept, so more than 3 rows may be returned

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Malta,434000,12011,MT
Maldives,434000,4520,MV
Brunei,434000,12128,BN


In [32]:
df.nlargest(3, ['population', 'GDP']) # rank by 'population' first, then break ties by 'GDP'

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Brunei,434000,12128,BN


In [35]:
df.nsmallest(3, 'population') # the 3 rows with the smallest 'population'

Unnamed: 0,population,GDP,alpha-2
Nauru,11300,182,NR
Tuvalu,11300,38,TV
Anguilla,11300,311,AI


In [33]:
df.nsmallest(3, ['population', 'GDP']) # smallest 'population' first, ties broken by smallest 'GDP'

Unnamed: 0,population,GDP,alpha-2
Tuvalu,11300,38,TV
Nauru,11300,182,NR
Anguilla,11300,311,AI


## Logic in Python (and pandas)

In [38]:
# Rebuild the three-row multiindex frame for the membership-test example.
df = pd.DataFrame(
    data={"a": [4, 5, 6], "b": [7, 8, 9], "c": [10, 11, 12]},
    index=pd.MultiIndex.from_tuples(
        [("d", 1), ("d", 2), ("e", 2)],
        names=["n", "v"],
    ),
)
# isin() must be given a list-like of candidate values, even for one value.
# Attribute access and bracket access address the same column; only the last
# expression of the cell is displayed.
df.a.isin([5])
df["a"].isin([5])

n  v
d  1    False
   2     True
e  2    False
Name: a, dtype: bool

In [39]:
import numpy as np

# Five-row frame with exactly one missing value (NaN) in each column, used by
# the null-handling examples.
df3 = pd.DataFrame(
    data={
        "a": [4, 5, 6, 6, np.nan],
        "b": [7, 8, np.nan, 9, 9],
        "c": [10, 11, 12, np.nan, 12],
    },
    index=pd.MultiIndex.from_tuples(
        [("d", 1), ("d", 2), ("e", 2), ("e", 3), ("e", 4)],
        names=["n", "v"],
    ),
)
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
e,2,6.0,,12.0
e,3,6.0,9.0,
e,4,,9.0,12.0


In [40]:
pd.isnull(df3) # element-wise mask: True where the value is missing (NaN)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,False,False,False
d,2,False,False,False
e,2,False,True,False
e,3,False,False,True
e,4,True,False,False


In [41]:
# Boolean mask of missing values in column 'a' (not displayed: it is not the
# last expression of the cell).
df3['a'].isnull()
# Summing the mask counts the missing values in column 'a'.
df3['a'].isnull().sum()

1

In [42]:
pd.notnull(df3) # same as df3.notnull(); not displayed because it is not the last expression
# Summing the mask counts the non-missing values in each column.
df3.notnull().sum()

a    4
b    4
c    4
dtype: int64

* pandas operators: `&`, `|`, `~`, `^`, `df.any()`, `df.all()`
* logical meaning: and, or, not, xor, any, all

In [43]:
# "~" inverts a boolean mask element-wise, so this is True exactly where
# column 'a' is missing.
~df3.a.notnull() 

n  v
d  1    False
   2    False
e  2    False
   3    False
   4     True
Name: a, dtype: bool

In [44]:
# IPython help syntax: a trailing "?" shows the docstring of DataFrame.any.
df.any?

In [45]:
# Tiny integer frame for the any()/all() examples: column A has no zeros,
# column B has one zero, column C is all zeros.
df = pd.DataFrame(
    data={
        "A": [1, 2],
        "B": [0, 2],
        "C": [0, 0],
    }
)
df

Unnamed: 0,A,B,C
0,1,0,0
1,2,2,0


In [46]:
df.any() # per column: is at least one value truthy (non-zero)?

A     True
B     True
C    False
dtype: bool

In [47]:
df.all() # per column: are all values truthy (non-zero)?

A     True
B    False
C    False
dtype: bool