In [1]:
import pandas as pd
import numpy as np

In [3]:
# Small demo frame. One column is deliberately named "shape", the same name as
# the DataFrame.shape attribute.
df = pd.DataFrame({
    "col1": [100, 200, 300],
    "col2": [4, 5, 6],
    "shape": ["A", "B", "C"],
})
df

Unnamed: 0,col1,col2,shape
0,100,4,A
1,200,5,B
2,300,6,C


In [4]:
# Position-based slice: the end position 2 is excluded, so rows 0 and 1 are returned.
df.iloc[:2, ]

Unnamed: 0,col1,col2,shape
0,100,4,A
1,200,5,B


In [5]:
# Label-based slice: the end label 2 is included, so rows 0, 1 and 2 are returned.
df.loc[:2, ]

Unnamed: 0,col1,col2,shape
0,100,4,A
1,200,5,B
2,300,6,C


In [6]:
# Attribute access yields the (rows, columns) tuple, not the column named "shape".
df.shape

(3, 3)

In [7]:
# Bracket indexing is how the column named "shape" is reached.
df["shape"]

0    A
1    B
2    C
Name: shape, dtype: object

In [8]:
# Bracket assignment adds a new column; the scalar 1 is placed in every row.
df["new_col"] = 1
df

Unnamed: 0,col1,col2,shape,new_col
0,100,4,A,1
1,200,5,B,1
2,300,6,C,1


In [11]:
# Attribute-style assignment does NOT add a column: the frame still shows no
# "new_col1" column afterwards, only the attribute holds the value.
df.new_col1 = 1

In [12]:
# Reads back the attribute: the plain scalar 1, not a column.
df.new_col1

1

In [13]:
# Re-display the frame to confirm that no "new_col1" column exists.
df

Unnamed: 0,col1,col2,shape,new_col
0,100,4,A,1
1,200,5,B,1
2,300,6,C,1


In [14]:
# Series.replace() swaps whole values: the element equal to 5 becomes the string.
# The result is returned (object dtype); nothing is assigned back to df.
df["col2"].replace(5, "ㅋㅋㅋ")

0      4
1    ㅋㅋㅋ
2      6
Name: col2, dtype: object

In [16]:
# A dict maps several old values to new ones at once; unmapped values ("C") are kept.
df["shape"].replace({"A": "ㅎㅎ", "B": "ㅠㅠ"})

0    ㅎㅎ
1    ㅠㅠ
2     C
Name: shape, dtype: object

In [19]:
# Upper-case every column label via the .str accessor on the column Index.
df.columns = df.columns.str.upper()
df

Unnamed: 0,COL1,COL2,SHAPE,NEW_COL
0,100,4,A,1
1,200,5,B,1
2,300,6,C,1


In [20]:
# Lower-case every column label again.
df.columns = df.columns.str.lower()
df

Unnamed: 0,col1,col2,shape,new_col
0,100,4,A,1
1,200,5,B,1
2,300,6,C,1


In [22]:
# Demo frame for text handling. "value" mixes an integer (123) with strings on purpose.
df_t = pd.DataFrame({
    "obs": [100, 200, 300],
    "shape": ["aaa", "abc", "ccc"],
    "value": [123, "123-456", "010-1234-5678"],
})
df_t

Unnamed: 0,obs,shape,value
0,100,aaa,123
1,200,abc,123-456
2,300,ccc,010-1234-5678


In [25]:
# Series.replace() with a string: only an element that equals "aaa" entirely is replaced.
df_t["shape"].replace("aaa", "ㅋㅋ")

0     ㅋㅋ
1    abc
2    ccc
Name: shape, dtype: object

In [24]:
# No element equals "a" exactly, so nothing changes: Series.replace() does not match substrings.
df_t["shape"].replace("a", "ㅋㅋ")

0    aaa
1    abc
2    ccc
Name: shape, dtype: object

In [23]:
# .str.replace() works inside each string: every "a" occurrence is substituted.
df_t["shape"].str.replace("a", "ㅋㅋ")

0    ㅋㅋㅋㅋㅋㅋ
1      ㅋㅋbc
2       ccc
Name: shape, dtype: object

In [26]:
# Replacing with the empty string deletes every "c" ("ccc" becomes an empty string).
df_t["shape"].str.replace("c", "")

0    aaa
1     ab
2       
Name: shape, dtype: object

In [27]:
# Boolean Series: True where the string contains "c".
df_t["shape"].str.contains("c")

0    False
1     True
2     True
Name: shape, dtype: bool

In [28]:
~df_t["shape"].str.contains("c") # invert True/False, i.e. which elements do NOT contain "c"?

0     True
1    False
2    False
Name: shape, dtype: bool

In [None]:
# Use the boolean mask as a row selector: rows whose "shape" contains "c".
df_t.loc[df_t["shape"].str.contains("c"), ]

In [None]:
# Negated mask as a row selector: rows whose "shape" does not contain "c".
df_t.loc[~df_t["shape"].str.contains("c"), ]

In [30]:
df_t.loc[df_t["shape"].str.contains("a|b"), ] # a or b

Unnamed: 0,obs,shape,value
0,100,aaa,123
1,200,abc,123-456


In [31]:
df_t["value"].str.split("-") # 123 is a number (not a string), so its result is missing (NaN).

0                  NaN
1           [123, 456]
2    [010, 1234, 5678]
Name: value, dtype: object

In [32]:
# Cast to str first so the integer 123 is split like the other values (giving ["123"]).
df_t["value"].astype("str").str.split("-")

0                [123]
1           [123, 456]
2    [010, 1234, 5678]
Name: value, dtype: object

In [33]:
# explode() puts each list element on its own row; the original index label is repeated.
df_t["value"].astype("str").str.split("-").explode()

0     123
1     123
1     456
2     010
2    1234
2    5678
Name: value, dtype: object

In [34]:
# expand = True spreads the pieces over columns 0, 1, 2; shorter rows are padded with missing values.
df_t["value"].astype("str").str.split("-", expand = True)

Unnamed: 0,0,1,2
0,123,,
1,123,456.0,
2,10,1234.0,5678.0


In [35]:
# Prefix every resulting column label with "value_".
# NOTE(review): the original note stated that .add_prefix() is available only from
# pandas 2.0.0. Per the pandas docs the method itself is older; it is the `axis`
# parameter that was added in 2.0.0 — confirm against the target version.
df_t["value"].astype("str").str.split("-", expand = True).add_prefix("value_")

Unnamed: 0,value_0,value_1,value_2
0,123,,
1,123,456.0,
2,10,1234.0,5678.0


In [36]:
# None is treated like a missing value, so the missing-value methods (here fillna) apply.
df_t["value"].astype("str").str.split("-", expand = True).fillna("😁")

Unnamed: 0,0,1,2
0,123,😁,😁
1,123,456,😁
2,10,1234,5678


In [37]:
# String length per element; the non-string 123 yields NaN, so the result is float64.
df_t["value"].str.len()

0     NaN
1     7.0
2    13.0
Name: value, dtype: float64

In [42]:
# Sample amounts written in different formats (currency suffix, thousands comma, "$", decimal point).
ser_regex = pd.Series(["1234원", "1,234", "1,234$", "345.67"])
ser_regex

0     1234원
1     1,234
2    1,234$
3    345.67
dtype: object

In [43]:
# Remove the thousands separator.
ser_regex.str.replace(",", "")

0     1234원
1      1234
2     1234$
3    345.67
dtype: object

In [48]:
# regex: regular expression
# The pattern ",|원" matches either a comma or the character "원".
ser_regex.str.replace(",|원", "", regex = True) 

In [45]:
# regex = False makes "$" a literal character to remove.
ser_regex.str.replace("$", "", regex = False) # in the exam version, regex = True is the default

0     1234원
1     1,234
2     1,234
3    345.67
dtype: object

In [52]:
ser_regex.str.replace("[0-9]", "", regex = True) # remove the digits

0     원
1     ,
2    ,$
3     .
dtype: object

In [51]:
# Note that the decimal point is removed too ("345.67" becomes "34567").
ser_regex.str.replace("[^0-9]", "", regex = True) # remove every character except digits

0     1234
1     1234
2     1234
3    34567
dtype: object

In [53]:
# Remove every character except digits and the period.
ser_regex.str.replace("[^0-9.]", "", regex = True) 

0      1234
1      1234
2      1234
3    345.67
dtype: object

In [54]:
# Demo frame for one-hot encoding; "col2" values contain a space and punctuation on purpose.
df_c = pd.DataFrame({
    "obs": [100, 200, 300],
    "col2": ["code script", "python", "data!!"],
})
df_c

Unnamed: 0,obs,col2
0,100,code script
1,200,python
2,300,data!!


In [56]:
# In the exam environment's pandas version, pd.get_dummies() must be told the target
# variables explicitly, and even when only one variable is to be one-hot encoded
# its name has to be wrapped in a list.
# df_c_dum = pd.get_dummies(df_c, columns = ["col2"]) # exam version

# From pandas 1.0.0 on, object-dtype variables are one-hot encoded automatically without
# being specified. If needed, pass the wanted variable names as a list to `columns`.
# NOTE(review): the version statements above are carried over from the original note
# and have not been verified against the pandas changelog — confirm.
# dtype = "int" produces 0/1 indicator columns, as the displayed result shows.
df_c_dum = pd.get_dummies(df_c, dtype = "int") # Pandas 2.0.0
df_c_dum

Unnamed: 0,obs,col2_code script,col2_data!!,col2_python
0,100,1,0,0
1,200,0,0,1
2,300,0,1,0


**statsmodels** 라이브러리의 ols() 같은 formula 기반 함수를 사용할 때, 변수명에 띄어쓰기나 특수문자가 있으면 formula 해석 과정에서 에러가 발생할 수 있다. 이를 방지하려면 아래와 같이 변수명에서 해당 문자를 미리 제거해 두어야 한다.

In [57]:
# Regex that removes everything except digits, lowercase/uppercase ASCII letters and the underscore.
# NOTE(review): stripping characters could in principle make two labels identical — verify on real data.
df_c_dum.columns = df_c_dum.columns.str.replace("[^0-9a-zA-Z_]", "", regex = True)
df_c_dum

Unnamed: 0,obs,col2_codescript,col2_data,col2_python
0,100,1,0,0
1,200,0,0,1
2,300,0,1,0


In [58]:
# Record the pandas version these outputs were produced with.
pd.__version__

'2.0.2'

In [59]:
# Load the KRX index data; note that this rebinds the name `df` used earlier for the demo frame.
df = pd.read_csv("krx_202105_utf8.csv")
df.head(2)

Unnamed: 0,지수명,종가,대비,등락률,시가,고가,저가,거래량,거래대금,상장시가총액,날짜
0,KRX 300,1901.05,-11.63,-0.61,1913.15,1929.94,1896.65,279548840,13488481375036,2028800954161500,20210503
1,KTOP 30,11520.28,-86.3,-0.74,11609.44,11702.14,11490.84,50048566,5645660149606,1208343758559140,20210503


In [63]:
# Keep only the rows whose index name equals "KRX 300" exactly.
df_sub = df.loc[df["지수명"] == "KRX 300", ]
df_sub.head(2)

Unnamed: 0,지수명,종가,대비,등락률,시가,고가,저가,거래량,거래대금,상장시가총액,날짜
0,KRX 300,1901.05,-11.63,-0.61,1913.15,1929.94,1896.65,279548840,13488481375036,2028800954161500,20210503
29,KRX 300,1916.4,15.35,0.81,1903.57,1916.4,1892.36,244040965,11708937073166,2043945386926060,20210504


In [64]:
# Distinct index names, in order of first appearance.
df["지수명"].unique()

array(['KRX 300', 'KTOP 30', 'KRX 100', 'KRX Mid 200', 'KRX 자동차',
       'KRX 반도체', 'KRX 헬스케어', 'KRX 은행', 'KRX 에너지화학', 'KRX 철강', 'KRX 방송통신',
       'KRX 건설', 'KRX 증권', 'KRX 기계장비', 'KRX 보험', 'KRX 운송', 'KRX 경기소비재',
       'KRX 필수소비재', 'KRX 미디어&엔터테인먼트', 'KRX 정보기술', 'KRX 유틸리티',
       'KRX 300 정보기술', 'KRX 300 금융', 'KRX 300 자유소비재', 'KRX 300 산업재',
       'KRX 300 헬스케어', 'KRX 300 커뮤니케이션서비스', 'KRX 300 소재', 'KRX 300 필수소비재'],
      dtype=object)

In [71]:
# Row(s) whose closing price equals the overall maximum, restricted to three columns.
df.loc[df["종가"] == df["종가"].max(), ["지수명", "종가", "날짜"]]

Unnamed: 0,지수명,종가,날짜
117,KTOP 30,11869.29,20210510


In [73]:
# Index label of the row holding the maximum closing price.
df["종가"].idxmax()

117

In [72]:
# A scalar row label returns the row as a Series.
df.loc[df["종가"].idxmax(), ["지수명", "종가", "날짜"]]

지수명     KTOP 30
종가     11869.29
날짜     20210510
Name: 117, dtype: object

In [66]:
# Wrapping the label in a list keeps the result a one-row DataFrame.
df.loc[[df["종가"].idxmax()], ["지수명", "종가", "날짜"]]

Unnamed: 0,지수명,종가,날짜
117,KTOP 30,11869.29,20210510


In [76]:
# Take the first row (a Series keyed by column name) and turn it into a two-column frame.
df.iloc[0, ].reset_index()

Unnamed: 0,index,0
0,지수명,KRX 300
1,종가,1901.05
2,대비,-11.63
3,등락률,-0.61
4,시가,1913.15
5,고가,1929.94
6,저가,1896.65
7,거래량,279548840
8,거래대금,13488481375036
9,상장시가총액,2028800954161500


In [77]:
# List the index names again to pick a keyword for the substring filter.
df["지수명"].unique()

array(['KRX 300', 'KTOP 30', 'KRX 100', 'KRX Mid 200', 'KRX 자동차',
       'KRX 반도체', 'KRX 헬스케어', 'KRX 은행', 'KRX 에너지화학', 'KRX 철강', 'KRX 방송통신',
       'KRX 건설', 'KRX 증권', 'KRX 기계장비', 'KRX 보험', 'KRX 운송', 'KRX 경기소비재',
       'KRX 필수소비재', 'KRX 미디어&엔터테인먼트', 'KRX 정보기술', 'KRX 유틸리티',
       'KRX 300 정보기술', 'KRX 300 금융', 'KRX 300 자유소비재', 'KRX 300 산업재',
       'KRX 300 헬스케어', 'KRX 300 커뮤니케이션서비스', 'KRX 300 소재', 'KRX 300 필수소비재'],
      dtype=object)

In [78]:
# Substring filter: rows whose index name contains "반도체".
df_sub = df.loc[df["지수명"].str.contains("반도체"), ]
df_sub.head(2)

Unnamed: 0,지수명,종가,대비,등락률,시가,고가,저가,거래량,거래대금,상장시가총액,날짜
5,KRX 반도체,3852.24,-20.18,-0.52,3872.85,3940.15,3836.17,19861668,952405532133,131186340258830,20210503
34,KRX 반도체,3870.59,18.35,0.48,3849.45,3875.19,3774.04,20327698,792228260746,131691071180590,20210504


In [79]:
# Load the iris data with missing values (rebinds `df`) and keep the first five rows.
# df_sub is derived from df via head() without an explicit .copy().
df = pd.read_csv("iris_missing.csv")
df_sub = df.head()
df_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  5 non-null      float64
 1   Sepal.Width   4 non-null      float64
 2   Petal.Length  4 non-null      float64
 3   Petal.Width   5 non-null      float64
 4   Species       5 non-null      object 
dtypes: float64(4), object(1)
memory usage: 328.0+ bytes


In [83]:
# Number of non-missing values per column.
df_sub.count() # really, really rarely needed....

Sepal.Length    5
Sepal.Width     4
Petal.Length    4
Petal.Width     5
Species         5
dtype: int64

In [88]:
# Data type of each column.
df_sub.dtypes

Sepal.Length    float64
Sepal.Width     float64
Petal.Length    float64
Petal.Width     float64
Species          object
dtype: object

In [84]:
# Number of missing values per column.
df_sub.isna().sum() # very important!!!!

Sepal.Length    0
Sepal.Width     1
Petal.Length    1
Petal.Width     0
Species         0
dtype: int64

In [85]:
# Show the rows where "Sepal.Width" is missing.
df_sub.loc[df_sub["Sepal.Width"].isna(), ]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,,,0.2,setosa


#### Q. "df_sub" 객체의 "Sepal.Width" 변수의 결측치를 제거하고 "df_base" 객체에 그 결과를 저장하시오.

In [90]:
# Three ways to express the same filter (the first two are kept for reference):
# df_base = df_sub.loc[~df_sub["Sepal.Width"].isna(), ]
# df_base = df_sub.loc[df_sub["Sepal.Width"].isna() == False, ]
df_base = df_sub.loc[df_sub["Sepal.Width"].notna(), ]
df_base

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [91]:
# df_sub itself is unchanged by the filter: row 0 with the missing values is still there.
df_sub

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,,,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [92]:
# When the frame was not deep-copied (.copy()), a warning like the one shown below appears.
# It can be handled with .copy() or .reset_index(drop = True).
# (df_sub was created as df.head(), which is why the assignments below trigger it.)
df_sub["Sepal.Width"] = df_sub["Sepal.Width"].fillna(-1)
df_sub["Petal.Length"] = df_sub["Petal.Length"].fillna(df_sub["Petal.Length"].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub["Sepal.Width"] = df_sub["Sepal.Width"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub["Petal.Length"] = df_sub["Petal.Length"].fillna(df_sub["Petal.Length"].mean())


In [93]:
# Despite the warning the values were filled: -1 for Sepal.Width, the column mean for Petal.Length.
df_sub

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,-1.0,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [94]:
# Drop every row that has at least one missing value, then verify no missing values remain.
df_sub2 = df.dropna()
df_sub2.isna().sum()

Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
Species         0
dtype: int64

머신러닝 모델링 코드를 작성할 때는 종속변수가 데이터세트의 첫 번째 또는 마지막 열에 오는 것이 편리하다. 이때 .set_index() 와 .reset_index() 를 차례로 연결(chaining)하면 특정 변수를 가장 앞 열로 옮길 수 있다.

In [111]:
# Regardless of its previous position, the target variable always ends up as the
# very first column of the DataFrame.
# Note, however, that the existing index is discarded.
df = df.set_index("Species").reset_index()

In [112]:
# Confirm that "Species" is now the first column.
df.head(2)

Unnamed: 0,Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,setosa,5.1,,,0.2
1,setosa,4.9,3.0,1.4,0.2


In [114]:
# The column positions 0..n-1 collected into a set (a set carries no ordering guarantee).
set(range(df.shape[1]))

{0, 1, 2, 3, 4}

In [108]:
# Move a specific variable to the last column.
# The remaining positions are collected with an ordered list comprehension instead of
# a set difference: Python does not guarantee set iteration order, so list(set(...))
# is not a safe way to preserve the original column order.
idx_col = df.columns.to_list().index("Species")
df2 = df.iloc[:, [i for i in range(df.shape[1]) if i != idx_col] + [idx_col]]
df2.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,,,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [116]:
# np.r_ concatenates the slice 1:5 and the scalar 0 into one index array.
np.r_[1:5, 0]

array([1, 2, 3, 4, 0])

In [115]:
# Same reordering with hard-coded positions: columns 1-4 first, then column 0 ("Species").
# This relies on the frame having exactly five columns with "Species" at position 0.
df2 = df.iloc[:, np.r_[1:5, 0]]
df2.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,,,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [120]:
# Name-based variant: every column except "Species" in original order, then "Species".
df2 = df[[i for i in df.columns if i not in ["Species"]] + ["Species"]]
df2.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,,,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [121]:
# concat variant: the frame without "Species" and the one-column "Species" frame,
# joined side by side (axis = 1).
df2 = pd.concat([df.drop(columns = "Species"), 
                 df[["Species"]]],
                axis = 1) # 🙋‍♂️
df2.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,,,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [122]:
# df itself is untouched by the df2 variants: "Species" is still its first column.
df.head(2)

Unnamed: 0,Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,setosa,5.1,,,0.2
1,setosa,4.9,3.0,1.4,0.2


In [123]:
# Rank of each value; tied values share a rank (e.g. 20.0, 134.5) and missing values stay NaN.
df["Petal.Width"].rank()

0       20.0
1       20.0
2       20.0
3       20.0
4       20.0
       ...  
145    134.5
146      NaN
147    118.5
148    134.5
149      NaN
Name: Petal.Width, Length: 150, dtype: float64

In [124]:
# Load the bike-sharing data (rebinds `df`) and peek at the first row.
df = pd.read_csv("bike.csv")
df.head(1)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16


In [125]:
# Distinct season codes.
df["season"].unique()

array([1, 2, 3, 4], dtype=int64)

In [126]:
df["season"].nunique() # number of unique elements

4

In [127]:
# Row count per season, most frequent first.
df["season"].value_counts()

season
4    2734
2    2733
3    2733
1    2686
Name: count, dtype: int64

In [128]:
# normalize = True returns proportions instead of counts.
df["season"].value_counts(normalize = True)

season
4    0.251148
2    0.251056
3    0.251056
1    0.246739
Name: proportion, dtype: float64

In [129]:
# Frequency table: seasons as rows, holiday flag as columns.
pd.crosstab(df["season"], df["holiday"])

holiday,0,1
season,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2615,71
2,2685,48
3,2637,96
4,2638,96


In [130]:
# normalize = "index" divides by the row totals, so each season's row sums to 1.
pd.crosstab(df["season"], df["holiday"],
            normalize = "index")
# proportion of holiday (holiday) within each season (season)

holiday,0,1
season,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.973567,0.026433
2,0.982437,0.017563
3,0.964874,0.035126
4,0.964887,0.035113


In [133]:
# The mean of the 0/1 holiday flag per season reproduces the holiday = 1 column
# of the row-normalized crosstab.
df.groupby("season")["holiday"].mean()

season
1    0.026433
2    0.017563
3    0.035126
4    0.035113
Name: holiday, dtype: float64

In [132]:
# normalize = "columns" divides by the column totals, so each holiday column sums to 1.
pd.crosstab(df["season"], df["holiday"],
            normalize = "columns")
# proportion of each season by holiday status (?)

holiday,0,1
season,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.247281,0.228296
2,0.253901,0.154341
3,0.249362,0.308682
4,0.249456,0.308682


In [134]:
# Same group-wise mean again, as the reference for the manual single-group check.
df.groupby("season")["holiday"].mean()

season
1    0.026433
2    0.017563
3    0.035126
4    0.035113
Name: holiday, dtype: float64

In [135]:
# What groupby does for one group, by hand: take the holiday values of the first
# season code and average them; the result matches the groupby value for season 1.
x = df.loc[df["season"] == df["season"].unique()[0], "holiday"]
x.mean()

0.02643335815338794

In [141]:
# Two grouping keys produce a Series with a two-level (season, holiday) index.
df.groupby(["season", "holiday"])["count"].mean()

season  holiday
1       0          117.494837
        1           73.929577
2       0          215.562756
        1          197.833333
3       0          234.293136
        1          237.822917
4       0          198.560273
        1          210.750000
Name: count, dtype: float64

In [142]:
# Selecting a list of columns averages several variables at once (result is a DataFrame).
df.groupby("season")[["casual", "count"]].mean()

Unnamed: 0_level_0,casual,count
season,Unnamed: 1_level_1,Unnamed: 2_level_1
1,15.489576,116.343261
2,47.446762,215.251372
3,52.220271,234.417124
4,28.580834,198.988296


In [143]:
# Two grouping keys and two value columns combined.
df.groupby(["season", "holiday"])[["casual", "count"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,casual,count
season,holiday,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,15.689101,117.494837
1,1,8.140845,73.929577
2,0,47.609683,215.562756
2,1,38.333333,197.833333
3,0,50.773606,234.293136
3,1,91.958333,237.822917
4,0,28.133055,198.560273
4,1,40.885417,210.75


In [144]:
# A dict passed to agg() applies a different function per column:
# minimum of "casual", maximum of "count".
df.groupby("season")[["casual", "count"]].agg({"casual": "min", 
                                               "count": "max"})

Unnamed: 0_level_0,casual,count
season,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,801
2,0,873
3,0,977
4,0,948


In [145]:
# reset_index() turns the group keys (season, holiday) back into ordinary columns.
df_g = df.groupby(["season", "holiday"])[["casual", "count"]].mean()
df_g.reset_index()

Unnamed: 0,season,holiday,casual,count
0,1,0,15.689101,117.494837
1,1,1,8.140845,73.929577
2,2,0,47.609683,215.562756
3,2,1,38.333333,197.833333
4,3,0,50.773606,234.293136
5,3,1,91.958333,237.822917
6,4,0,28.133055,198.560273
7,4,1,40.885417,210.75


#### Text 처리가 가미된 train/test set 분리 예제

In [153]:
# Demo frame for the split example; the last id ("user") has no numeric suffix on purpose.
df_split = pd.DataFrame({
    "id": ["user_1001", "user_1002", "user_1003", "user_1004", "user"],
    "value": [123, 234, 345, 456, 567],
})
df_split

In [154]:
# Four ways to pull the numeric part out of "id":
# id1: fixed-position slice (characters 5 up to, not including, 9)
df_split["id1"] = df_split["id"].str.slice(5, 9)
# id2: delete every non-digit character
df_split["id2"] = df_split["id"].str.replace("[^0-9]", "", regex = True)
# id3: regex capture of one or more digits at the end of the string
df_split["id3"] = df_split["id"].str.extract("([0-9]{1,})$")
# id4: split on "_" and take the second piece
df_split["id4"] = df_split["id"].str.split("_", expand = True)[1]
df_split

Unnamed: 0,id,value,id1,id2,id3,id4
0,user_1001,123,1001.0,1001.0,1001.0,1001.0
1,user_1002,234,1002.0,1002.0,1002.0,1002.0
2,user_1003,345,1003.0,1003.0,1003.0,1003.0
3,user_1004,456,1004.0,1004.0,1004.0,1004.0
4,user,567,,,,


In [155]:
# Drop the rows whose "id1" is an empty string (the "user" row without a number).
df_split_sub = df_split.loc[df_split["id1"] != "", ]
df_split_sub

Unnamed: 0,id,value,id1,id2,id3,id4
0,user_1001,123,1001,1001,1001,1001
1,user_1002,234,1002,1002,1002,1002
2,user_1003,345,1003,1003,1003,1003
3,user_1004,456,1004,1004,1004,1004


In [157]:
# Deterministic split on the numeric id: "id1" is text, so cast to int first;
# ids that are multiples of 4 form the test set, all others the train set.
df_train = df_split_sub.loc[df_split_sub["id1"].astype("int") % 4 != 0, ]
df_test  = df_split_sub.loc[df_split_sub["id1"].astype("int") % 4 == 0, ]

In [158]:
df_train # rows whose "id1" value is not a multiple of 4

Unnamed: 0,id,value,id1,id2,id3,id4
0,user_1001,123,1001,1001,1001,1001
1,user_1002,234,1002,1002,1002,1002
2,user_1003,345,1003,1003,1003,1003


In [159]:
df_test # rows whose "id1" value is a multiple of 4

Unnamed: 0,id,value,id1,id2,id3,id4
3,user_1004,456,1004,1004,1004,1004


In [161]:
# Three kinds of input for pd.to_datetime():
ser_t1 = pd.Series(["2023-07-05", "2023-07-06"])          # ISO-style date strings
ser_t2 = pd.Series(["2023년 7월 5일", "2023년 7월 6일"])  # Korean-formatted date strings
ser_t3 = pd.Series([123456, 234567])                      # plain integers

In [164]:
# ISO-style strings are parsed without any extra arguments.
pd.to_datetime(ser_t1)

0   2023-07-05
1   2023-07-06
dtype: datetime64[ns]

In [166]:
# Without an explicit format the Korean-formatted strings cannot be parsed;
# the call is left commented out because it raises.
# pd.to_datetime(ser_t2) # <-- Error!

https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

In [167]:
# Spell out the layout with strftime-style codes; the Korean characters are matched literally.
pd.to_datetime(ser_t2, format = "%Y년 %m월 %d일")

0   2023-07-05
1   2023-07-06
dtype: datetime64[ns]

In [170]:
# With no arguments the integers are read as nanoseconds counted from 1970-01-01.
pd.to_datetime(ser_t3)

0   1970-01-01 00:00:00.000123456
1   1970-01-01 00:00:00.000234567
dtype: datetime64[ns]

In [169]:
# origin moves the reference date; the integers are still nanoseconds.
pd.to_datetime(ser_t3, origin = "2023-07-05")

0   2023-07-05 00:00:00.000123456
1   2023-07-05 00:00:00.000234567
dtype: datetime64[ns]

In [168]:
# unit = "ms" makes the integers milliseconds counted from the origin.
pd.to_datetime(ser_t3, origin = "2023-07-05", unit = "ms")

0   2023-07-05 00:02:03.456
1   2023-07-05 00:03:54.567
dtype: datetime64[ns]

In [175]:
# Derive calendar features from the "datetime" column of the bike data.
df = pd.read_csv("bike.csv")
df_dt = df[["datetime"]].copy()  # explicit copy so the assignments below act on an independent frame
df_dt["datetime"] = pd.to_datetime(df_dt["datetime"])  # text -> datetime64 so the .dt accessor works
df_dt["year"    ] = df_dt["datetime"].dt.year
df_dt["month"   ] = df_dt["datetime"].dt.month
df_dt["day"     ] = df_dt["datetime"].dt.day
df_dt["date"    ] = df_dt["datetime"].dt.date
df_dt["weekday" ] = df_dt["datetime"].dt.weekday  # pandas numbers Monday as 0, so 5 and 6 are the weekend
# Four equivalent ways to build a 0/1 weekend flag from the boolean condition:
df_dt["is_wend1"] = np.where(df_dt["weekday"] >= 5, 1, 0)
df_dt["is_wend2"] = (df_dt["weekday"] >= 5).astype("int")
df_dt["is_wend3"] = (df_dt["weekday"] >= 5) + 0
df_dt["is_wend4"] = (df_dt["weekday"] >= 5) * 1
df_dt["yyyymm"  ] = df_dt["datetime"].dt.strftime("%Y-%m")  # year-month label as text
df_dt.head(2)

Unnamed: 0,datetime,year,month,day,date,weekday,is_wend1,is_wend2,is_wend3,is_wend4,yyyymm
0,2011-01-01 00:00:00,2011,1,1,2011-01-01,5,1,1,1,1,2011-01
1,2011-01-01 01:00:00,2011,1,1,2011-01-01,5,1,1,1,1,2011-01


In [179]:
# Fill a gap with the average of its neighbours, by hand and with interpolate().
df_na = pd.DataFrame({"v1": [1, 3, 5, np.nan, 11]})
df_na["v1_s1"] = df_na["v1"].shift(1)      # previous row's value
df_na["v1_sn1"] = df_na["v1"].shift(-1)    # next row's value
df_na["v1_s_mean"] = (df_na["v1_s1"] + df_na["v1_sn1"]) / 2
# Keep v1 where it is present, otherwise take the neighbour average.
df_na["v1_fill"] = df_na["v1"].where(df_na["v1"].notna(), df_na["v1_s_mean"])
df_na["v1_inter"] = df_na["v1"].interpolate()  # interpolation (linear is the default setting)
df_na

Unnamed: 0,v1,v1_s1,v1_sn1,v1_s_mean,v1_fill,v1_inter
0,1.0,,3.0,,1.0,1.0
1,3.0,1.0,5.0,3.0,3.0,3.0
2,5.0,3.0,,,5.0,5.0
3,,5.0,11.0,8.0,8.0,8.0
4,11.0,,,,11.0,11.0


In [180]:
# A run of consecutive gaps is filled in equal steps between the two known ends (1 and 6).
pd.Series([1, np.nan, np.nan, np.nan, 6]).interpolate()

0    1.00
1    2.25
2    3.50
3    4.75
4    6.00
dtype: float64

In [181]:
# Reload the bike data for the aggregation examples that follow.
df = pd.read_csv("bike.csv")

In [183]:
# agg() accepts a lambda: here each season's mean of "count", rounded to an integer.
df.groupby("season")["count"].agg(lambda x: round(x.mean()))

season
1    116
2    215
3    234
4    199
Name: count, dtype: int64

In [184]:
def udf_mean_r0(x):
    """Return the mean of ``x`` rounded to zero decimal places.

    ``x`` is any object exposing ``.mean()`` (e.g. the Series that
    ``groupby(...).agg`` passes in for each group).
    """
    avg = x.mean()
    return round(avg)

In [185]:
# The named function is passed to agg() exactly like the lambda and gives the same result.
df.groupby("season")["count"].agg(udf_mean_r0)

season
1    116
2    215
3    234
4    199
Name: count, dtype: int64

lambda 함수나 사용자 정의 함수는 .groupby() 뒤의 .agg(), .apply(), 그리고 pd.crosstab() / .pivot_table() 의 aggfunc 인자 등에 전달하여 사용할 수 있다.