<a href="https://colab.research.google.com/github/dayeong918/pdm011/blob/main/py-pandas/pandas_2_handling_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Python module 3. **pandas**

# Using pandas

* [10 Minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html)
* [Pandas tutorial with interactive exercises](https://www.kaggle.com/pistak/pandas-tutorial-with-interactive-exercises)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  # work for Jupyter notebook or lab



---



## [2] Handling DataFrame
- head()
- tail()
- describe()
- info()

In [2]:
# Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:
dates = pd.date_range('20210927', periods=6)
dates

DatetimeIndex(['2021-09-27', '2021-09-28', '2021-09-29', '2021-09-30',
               '2021-10-01', '2021-10-02'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# head()
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df.head()

Unnamed: 0,A,B,C,D
2021-09-27,1.15759,0.691274,0.036269,0.448139
2021-09-28,-0.330264,0.45589,1.404193,0.544384
2021-09-29,0.204433,-0.686301,-0.070389,0.874862
2021-09-30,-0.76041,0.700124,-0.658189,-0.771481
2021-10-01,1.35947,1.763408,-0.724488,1.455381


In [4]:
df.head(2)

Unnamed: 0,A,B,C,D
2021-09-27,1.15759,0.691274,0.036269,0.448139
2021-09-28,-0.330264,0.45589,1.404193,0.544384


In [5]:
df.tail(3)

Unnamed: 0,A,B,C,D
2021-09-30,-0.76041,0.700124,-0.658189,-0.771481
2021-10-01,1.35947,1.763408,-0.724488,1.455381
2021-10-02,-0.63318,-0.548448,0.103891,0.56083


In [6]:
# Display the index, columns, and the underlying NumPy data:
df.index

DatetimeIndex(['2021-09-27', '2021-09-28', '2021-09-29', '2021-09-30',
               '2021-10-01', '2021-10-02'],
              dtype='datetime64[ns]', freq='D')

In [7]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2021-09-27 to 2021-10-02
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [9]:
# describe() shows a quick statistic summary of your data:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.166273,0.395991,0.015215,0.518686
std,0.911296,0.907351,0.767532,0.731601
min,-0.76041,-0.686301,-0.724488,-0.771481
25%,-0.557451,-0.297363,-0.511239,0.4722
50%,-0.062915,0.573582,-0.01706,0.552607
75%,0.919301,0.697911,0.086985,0.796354
max,1.35947,1.763408,1.404193,1.455381


In [10]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A,6.0,0.166273,0.911296,-0.76041,-0.557451,-0.062915,0.919301,1.35947
B,6.0,0.395991,0.907351,-0.686301,-0.297363,0.573582,0.697911,1.763408
C,6.0,0.015215,0.767532,-0.724488,-0.511239,-0.01706,0.086985,1.404193
D,6.0,0.518686,0.731601,-0.771481,0.4722,0.552607,0.796354,1.455381


In [11]:
# Transposing your dataframe:
df.T

Unnamed: 0,2021-09-27,2021-09-28,2021-09-29,2021-09-30,2021-10-01,2021-10-02
A,1.15759,-0.330264,0.204433,-0.76041,1.35947,-0.63318
B,0.691274,0.45589,-0.686301,0.700124,1.763408,-0.548448
C,0.036269,1.404193,-0.070389,-0.658189,-0.724488,0.103891
D,0.448139,0.544384,0.874862,-0.771481,1.455381,0.56083


In [12]:
df.T.index

Index(['A', 'B', 'C', 'D'], dtype='object')

### Sorting

#### Sort by index
- sort_index(axis=0, ascending=False)
- sort_index(axis=1, ascending=False)

> Axis=0 Column-Wise Operation (수직으로)

> Axis=1 Row-Wise Operation (수평으로)

In [13]:
# Sorting by an axis:
df, df.sort_index(axis=0) #, ascending=False)

(                   A         B         C         D
 2021-09-27  1.157590  0.691274  0.036269  0.448139
 2021-09-28 -0.330264  0.455890  1.404193  0.544384
 2021-09-29  0.204433 -0.686301 -0.070389  0.874862
 2021-09-30 -0.760410  0.700124 -0.658189 -0.771481
 2021-10-01  1.359470  1.763408 -0.724488  1.455381
 2021-10-02 -0.633180 -0.548448  0.103891  0.560830,
                    A         B         C         D
 2021-09-27  1.157590  0.691274  0.036269  0.448139
 2021-09-28 -0.330264  0.455890  1.404193  0.544384
 2021-09-29  0.204433 -0.686301 -0.070389  0.874862
 2021-09-30 -0.760410  0.700124 -0.658189 -0.771481
 2021-10-01  1.359470  1.763408 -0.724488  1.455381
 2021-10-02 -0.633180 -0.548448  0.103891  0.560830)

In [14]:
df,df.sort_index(axis=1, ascending=False) 
# 내림차순 정렬
# 데이터는 기본적으로 오름차순으로 정렬됨
# 내림차순으로 정렬을 할 때는 ascending=False로 지정.

(                   A         B         C         D
 2021-09-27  1.157590  0.691274  0.036269  0.448139
 2021-09-28 -0.330264  0.455890  1.404193  0.544384
 2021-09-29  0.204433 -0.686301 -0.070389  0.874862
 2021-09-30 -0.760410  0.700124 -0.658189 -0.771481
 2021-10-01  1.359470  1.763408 -0.724488  1.455381
 2021-10-02 -0.633180 -0.548448  0.103891  0.560830,
                    D         C         B         A
 2021-09-27  0.448139  0.036269  0.691274  1.157590
 2021-09-28  0.544384  1.404193  0.455890 -0.330264
 2021-09-29  0.874862 -0.070389 -0.686301  0.204433
 2021-09-30 -0.771481 -0.658189  0.700124 -0.760410
 2021-10-01  1.455381 -0.724488  1.763408  1.359470
 2021-10-02  0.560830  0.103891 -0.548448 -0.633180)

#### Sort by value
- sort_values(by='column')

In [20]:
# Sorting by values:
df,df.sort_values(by='B') #, ascending=False)

(                   A         B         C         D
 2021-09-27  1.157590  0.691274  0.036269  0.448139
 2021-09-28 -0.330264  0.455890  1.404193  0.544384
 2021-09-29  0.204433 -0.686301 -0.070389  0.874862
 2021-09-30 -0.760410  0.700124 -0.658189 -0.771481
 2021-10-01  1.359470  1.763408 -0.724488  1.455381
 2021-10-02 -0.633180 -0.548448  0.103891  0.560830,
                    A         B         C         D
 2021-09-29  0.204433 -0.686301 -0.070389  0.874862
 2021-10-02 -0.633180 -0.548448  0.103891  0.560830
 2021-09-28 -0.330264  0.455890  1.404193  0.544384
 2021-09-27  1.157590  0.691274  0.036269  0.448139
 2021-09-30 -0.760410  0.700124 -0.658189 -0.771481
 2021-10-01  1.359470  1.763408 -0.724488  1.455381)

In [21]:
# Sorting by values:
df,df.sort_values(by='B', ascending=False)

(                   A         B         C         D
 2021-09-27  1.157590  0.691274  0.036269  0.448139
 2021-09-28 -0.330264  0.455890  1.404193  0.544384
 2021-09-29  0.204433 -0.686301 -0.070389  0.874862
 2021-09-30 -0.760410  0.700124 -0.658189 -0.771481
 2021-10-01  1.359470  1.763408 -0.724488  1.455381
 2021-10-02 -0.633180 -0.548448  0.103891  0.560830,
                    A         B         C         D
 2021-10-01  1.359470  1.763408 -0.724488  1.455381
 2021-09-30 -0.760410  0.700124 -0.658189 -0.771481
 2021-09-27  1.157590  0.691274  0.036269  0.448139
 2021-09-28 -0.330264  0.455890  1.404193  0.544384
 2021-10-02 -0.633180 -0.548448  0.103891  0.560830
 2021-09-29  0.204433 -0.686301 -0.070389  0.874862)

## indexing and slicing of DataFrame

#### Selecting data by indexing and slicing
- indexing
- slicing


In [22]:
# Selecting a single column, which yields a Series
df['A']

2021-09-27    1.157590
2021-09-28   -0.330264
2021-09-29    0.204433
2021-09-30   -0.760410
2021-10-01    1.359470
2021-10-02   -0.633180
Freq: D, Name: A, dtype: float64

In [23]:
# Selecting via [], which slices the rows.
df[0:3]

Unnamed: 0,A,B,C,D
2021-09-27,1.15759,0.691274,0.036269,0.448139
2021-09-28,-0.330264,0.45589,1.404193,0.544384
2021-09-29,0.204433,-0.686301,-0.070389,0.874862


In [24]:
df['20200927':'20201001'] # 인덱스가 아닌 값인 경우는 지정된 범위가 다 선택된다.

Unnamed: 0,A,B,C,D


#### Selecting data by label [중요!!]

> **loc, iloc**


In [50]:
df

Unnamed: 0,A,B,C,D
2021-09-27,1.15759,0.691274,0.036269,0.448139
2021-09-28,-0.330264,0.45589,1.404193,0.544384
2021-09-29,0.204433,-0.686301,-0.070389,0.874862
2021-09-30,-0.76041,0.700124,-0.658189,-0.771481
2021-10-01,1.35947,1.763408,-0.724488,1.455381
2021-10-02,-0.63318,-0.548448,0.103891,0.56083


In [26]:
df.loc[dates[0]]   # loc()

A    1.157590
B    0.691274
C    0.036269
D    0.448139
Name: 2021-09-27 00:00:00, dtype: float64

In [27]:
# Selecting on a multi-axis by label:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2021-09-27,1.15759,0.691274
2021-09-28,-0.330264,0.45589
2021-09-29,0.204433,-0.686301
2021-09-30,-0.76041,0.700124
2021-10-01,1.35947,1.763408
2021-10-02,-0.63318,-0.548448


#### [DIY: 도전코딩]

> Select data for first two days AND comumn 3,4 from df using loc[].

In [63]:
# df.loc[0:2,['C','D']]
# loc 함수를 쓸 때는 정수 인덱스 X, *value 인덱스 O:중간고사필기.
# df.loc['20210928':'20210930',['C','D']]
df.loc[dates[:2],['C','D']]

Unnamed: 0,C,D
2021-09-27,0.036269,0.448139
2021-09-28,1.404193,0.544384


#### Selecting data by position (iloc())
- index 사용

In [31]:
df

Unnamed: 0,A,B,C,D
2021-09-27,1.15759,0.691274,0.036269,0.448139
2021-09-28,-0.330264,0.45589,1.404193,0.544384
2021-09-29,0.204433,-0.686301,-0.070389,0.874862
2021-09-30,-0.76041,0.700124,-0.658189,-0.771481
2021-10-01,1.35947,1.763408,-0.724488,1.455381
2021-10-02,-0.63318,-0.548448,0.103891,0.56083


In [32]:
df.iloc[3]  # 결과는 차원축소형으로 표현됨.

A   -0.760410
B    0.700124
C   -0.658189
D   -0.771481
Name: 2021-09-30 00:00:00, dtype: float64

In [33]:
# [다시 도전] 
# Select data for first two days AND comumn 3,4 from df.
# Use iloc
df.iloc[:2,2:4]

Unnamed: 0,C,D
2021-09-27,0.036269,0.448139
2021-09-28,1.404193,0.544384


In [34]:
# Select one item
df.iloc[1,1]

0.45589020640642564

In [64]:
# [DIY: 다시 도전] iloc사용.
# Select data for first three days from df
# Your code
df.iloc[:3:]
# df[:3,:]

Unnamed: 0,A,B,C,D
2021-09-27,1.15759,0.691274,0.036269,0.448139
2021-09-28,-0.330264,0.45589,1.404193,0.544384
2021-09-29,0.204433,-0.686301,-0.070389,0.874862


#### Selecting data by Boolean indexing

In [None]:
df

Unnamed: 0,A,B,C,D
2021-09-27,0.157145,-0.65682,-0.392772,-0.428795
2021-09-28,1.489694,0.320003,-0.89781,-1.051432
2021-09-29,-0.882165,0.342783,0.372968,-2.059851
2021-09-30,1.258796,0.388243,-0.748868,-1.501119
2021-10-01,0.115433,0.417245,-0.289182,0.559837
2021-10-02,1.584692,-0.208394,-0.681593,-0.63617


In [None]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2021-09-27,0.157145,-0.65682,-0.392772,-0.428795
2021-09-28,1.489694,0.320003,-0.89781,-1.051432
2021-09-30,1.258796,0.388243,-0.748868,-1.501119
2021-10-01,0.115433,0.417245,-0.289182,0.559837
2021-10-02,1.584692,-0.208394,-0.681593,-0.63617


In [36]:
df[df > 0]

Unnamed: 0,A,B,C,D
2021-09-27,1.15759,0.691274,0.036269,0.448139
2021-09-28,,0.45589,1.404193,0.544384
2021-09-29,0.204433,,,0.874862
2021-09-30,,0.700124,,
2021-10-01,1.35947,1.763408,,1.455381
2021-10-02,,,0.103891,0.56083
