# Pandas

: pandas는 Series data type과 DataFrame data type으로 구성된다.

### Series (1차원) : numpy array 유사
: numpy array와 달리 Series는 각 원소에 index label을 부여할 수 있다.

### DataFrame (2차원, table)
: Python Program 안의 excel

![image.png](attachment:image.png)

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Seed NumPy's global random state so the matrix below is reproducible.
np.random.seed(101)

# 5 x 4 matrix of samples drawn from the standard normal distribution.
data = np.random.standard_normal(size=(5, 4))
data

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [6]:
# Wrap the random matrix in a DataFrame whose four columns are labelled
# W, X, Y and Z; no index is passed, so rows get the default 0..4 labels.
df = pd.DataFrame(data=data, columns=list('WXYZ'))
df

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [7]:
# Column labels of the frame, returned as an Index object.
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='object')

In [8]:
# Print a structural summary: index range, per-column non-null counts
# and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       5 non-null      float64
 1   X       5 non-null      float64
 2   Y       5 non-null      float64
 3   Z       5 non-null      float64
dtypes: float64(4)
memory usage: 288.0 bytes


In [9]:
df.describe() # descriptive statistics per column: count, mean, std, min, quartiles, max

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057


### DataFrame Indexing

In [10]:

# Bracket indexing with a single label returns that column as a Series.
df['W']

0    2.706850
1    0.651118
2   -2.018168
3    0.188695
4    0.190794
Name: W, dtype: float64

In [11]:
# Attribute-style access; yields the same Series as df['W'].
df.W

0    2.706850
1    0.651118
2   -2.018168
3    0.188695
4    0.190794
Name: W, dtype: float64

In [13]:
# Passing a list of labels selects several columns at once and returns
# a table containing only those columns.
df[['W', 'X', 'Y']]

Unnamed: 0,W,X,Y
0,2.70685,0.628133,0.907969
1,0.651118,-0.319318,-0.848077
2,-2.018168,0.740122,0.528813
3,0.188695,-0.758872,-0.933237
4,0.190794,1.978757,2.605967


### NEW column 추가/삭제

In [16]:
# Assigning to a label that does not exist yet creates the column;
# NEW is the element-wise sum of X and Y.
df['NEW'] = df['X'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,NEW
0,2.70685,0.628133,0.907969,0.503826,1.536102
1,0.651118,-0.319318,-0.848077,0.605965,-1.167395
2,-2.018168,0.740122,0.528813,-0.589001,1.268936
3,0.188695,-0.758872,-0.933237,0.955057,-1.692109
4,0.190794,1.978757,2.605967,0.683509,4.584725


In [17]:
# drop() returns a new frame (axis=1 targets columns). The result is
# discarded here on purpose, so df itself still contains NEW.
df.drop('NEW',axis=1)
df

Unnamed: 0,W,X,Y,Z,NEW
0,2.70685,0.628133,0.907969,0.503826,1.536102
1,0.651118,-0.319318,-0.848077,0.605965,-1.167395
2,-2.018168,0.740122,0.528813,-0.589001,1.268936
3,0.188695,-0.758872,-0.933237,0.955057,-1.692109
4,0.190794,1.978757,2.605967,0.683509,4.584725


In [18]:
# Rebind df to the frame returned by drop() so the removal persists.
df = df.drop('NEW', axis=1) # in-place alternative: df.drop('NEW', axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


## Missing Data 처리
- missing data가 있는 row 혹은 column을 완전히 삭제: dropna()
- 임의의 data로 대체 : fillna()

In [20]:
# Sample frame: columns A and B contain NaN, column C is complete.
df = pd.DataFrame(
    {
        'A': [1, np.nan, 3],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [None]:
# Drop every row that contains at least one missing value.
df.dropna(axis=0)