In [1]:
import numpy as np

# 1차원 배열 생성
array1 = np.array([1, 2, 3])

# 2차원 배열 생성
array2 = np.array([[1, 2, 3],
                   [4, 5, 6]])


In [2]:
sequence_array = np.arange(10)
print(sequence_array)
print(sequence_array.dtype, sequence_array.shape)


[0 1 2 3 4 5 6 7 8 9]
int64 (10,)


In [3]:
list1 = [1, 2, 3]
print(type(list1))

array1 = np.array(list1)
print(type(array1))
print(array1, array1.dtype)


<class 'list'>
<class 'numpy.ndarray'>
[1 2 3] int64


In [4]:
array2 = np.array([[1, 2, 3],
                   [4, 5, 6]])

print("array2:")
print(array2)
print("Shape:", array2.shape)
print("차원 수 (ndim):", array2.ndim)


array2:
[[1 2 3]
 [4 5 6]]
Shape: (2, 3)
차원 수 (ndim): 2


In [None]:
array1 = np.arange(10)
print('array1:\n', array1)

# (2, 5)로 reshape
array2 = array1.reshape(2, 5)
print('array2:\n', array2)

# (5, 2)로 reshape
array3 = array1.reshape(5, 2)
print('array3:\n', array3)

# (4, 3)으로 reshape 시도 - 오류 발생
# array1.reshape(4, 3)  # ValueError 발생

In [None]:
array1 = np.arange(10)

# 자동 계산 가능한 경우
array2 = array1.reshape(-1, 5)
print('array2 shape:', array2.shape)

array3 = array1.reshape(5, -1)
print('array3 shape:', array3.shape)

# 자동 계산 불가능한 경우 - 오류 발생
# array4 = array1.reshape(-1, 4)  # ValueError 발생

## Pandas와 Numpy

In [6]:
import numpy as np
import pandas as pd

s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)


0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [7]:
dates = pd.date_range("20130101", periods=6)
print(dates)

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
print(df)


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2013-01-01 -0.130082  1.020925  0.467552 -0.138406
2013-01-02 -0.163484  1.245179  1.354596 -1.097992
2013-01-03 -1.362162  0.493030  0.768636  1.001769
2013-01-04  1.190499 -0.746026 -0.900196 -1.811050
2013-01-05 -1.053645 -1.365652  1.779070 -1.468868
2013-01-06 -0.885696 -0.060115  0.006117 -0.834293


In [15]:
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo"
})
print(df2)


     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


In [9]:
print(df2.dtypes)


A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object


In [10]:
print(df.head())     # 상위 5개 행
print(df.tail(3))    # 하위 3개 행


                   A         B         C         D
2013-01-01 -0.130082  1.020925  0.467552 -0.138406
2013-01-02 -0.163484  1.245179  1.354596 -1.097992
2013-01-03 -1.362162  0.493030  0.768636  1.001769
2013-01-04  1.190499 -0.746026 -0.900196 -1.811050
2013-01-05 -1.053645 -1.365652  1.779070 -1.468868
                   A         B         C         D
2013-01-04  1.190499 -0.746026 -0.900196 -1.811050
2013-01-05 -1.053645 -1.365652  1.779070 -1.468868
2013-01-06 -0.885696 -0.060115  0.006117 -0.834293


In [11]:
print(df.index)
print(df.columns)


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [16]:
print(df.to_numpy())


[[-0.13008182  1.020925    0.46755198 -0.13840597]
 [-0.16348363  1.24517925  1.35459557 -1.0979919 ]
 [-1.36216178  0.49303013  0.76863563  1.00176926]
 [ 1.19049937 -0.7460263  -0.90019588 -1.81105043]
 [-1.05364532 -1.36565162  1.77907034 -1.46886794]
 [-0.88569592 -0.06011514  0.00611676 -0.83429334]]


In [17]:
print(df.describe())


              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.400762  0.097890  0.579296 -0.724807
std    0.921671  1.020149  0.959824  1.020758
min   -1.362162 -1.365652 -0.900196 -1.811050
25%   -1.011658 -0.574549  0.121476 -1.376149
50%   -0.524590  0.216457  0.618094 -0.966143
75%   -0.138432  0.888951  1.208106 -0.312378
max    1.190499  1.245179  1.779070  1.001769


In [18]:
print(df.T)


   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A   -0.130082   -0.163484   -1.362162    1.190499   -1.053645   -0.885696
B    1.020925    1.245179    0.493030   -0.746026   -1.365652   -0.060115
C    0.467552    1.354596    0.768636   -0.900196    1.779070    0.006117
D   -0.138406   -1.097992    1.001769   -1.811050   -1.468868   -0.834293


In [23]:
print(df[["A","B"]])

                   A         B
2013-01-01 -0.130082  1.020925
2013-01-02 -0.163484  1.245179
2013-01-03 -1.362162  0.493030
2013-01-04  1.190499 -0.746026
2013-01-05 -1.053645 -1.365652
2013-01-06 -0.885696 -0.060115


In [24]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.130082,1.020925,0.467552,-0.138406
2013-01-02,-0.163484,1.245179,1.354596,-1.097992
2013-01-03,-1.362162,0.49303,0.768636,1.001769


In [25]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.163484,1.245179,1.354596,-1.097992
2013-01-03,-1.362162,0.49303,0.768636,1.001769
2013-01-04,1.190499,-0.746026,-0.900196,-1.81105


In [26]:
df.loc["20130102":"20130104", ["A", "B"]]


Unnamed: 0,A,B
2013-01-02,-0.163484,1.245179
2013-01-03,-1.362162,0.49303
2013-01-04,1.190499,-0.746026


In [27]:
df.loc[dates[0], "A"]       # 위치 기반 접근
df.at[dates[0], "A"]        # 가장 빠른 방식


np.float64(-0.13008181647198822)

In [28]:
df.iloc[3]                  # 4번째 행 전체
df.iloc[3:5, 0:2]           # 4~5번째 행, 1~2번째 열
df.iloc[[1, 2, 4], [0, 2]]  # 특정 행/열 조합
df.iloc[1:3, :]             # 2~3번째 행, 전체 열
df.iloc[:, 1:3]             # 전체 행, 2~3번째 열
df.iloc[1, 1]               # 2행 2열 값
df.iat[1, 1]                # 빠른 2행 2열 접근


np.float64(1.2451792506298494)

In [29]:
df[df["A"] > 0]             # A 열이 0보다 큰 행만

df[df > 0]                 # 전체 DataFrame에서 양수만 표시, 나머지는 NaN


Unnamed: 0,A,B,C,D
2013-01-01,,1.020925,0.467552,
2013-01-02,,1.245179,1.354596,
2013-01-03,,0.49303,0.768636,1.001769
2013-01-04,1.190499,,,
2013-01-05,,,1.77907,
2013-01-06,,,0.006117,


In [None]:
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]

df2[df2["E"].isin(["two", "four"])]


In [30]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1.loc[dates[0] : dates[1], "E"] = 1
print(df1)


                   A         B         C         D    E
2013-01-01 -0.130082  1.020925  0.467552 -0.138406  1.0
2013-01-02 -0.163484  1.245179  1.354596 -1.097992  1.0
2013-01-03 -1.362162  0.493030  0.768636  1.001769  NaN
2013-01-04  1.190499 -0.746026 -0.900196 -1.811050  NaN


In [None]:
# 결측치가 있는 모든 행 제거
df1.dropna(how="any")

# 결측치를 5로 채우기
df1.fillna(value=5)


In [None]:
pd.isna(df1)

## 데이터 처리와 집계

In [2]:
import pandas as pd

# Titanic 데이터 CSV 파일 로드
titanic_df = pd.read_csv("../1_pandas_basic/data/titanic.csv")

# DataFrame 출력
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# 이름 기준 오름차순 정렬
titanic_sorted = titanic_df.sort_values(by=['Name'])
titanic_sorted.head(3)

# Pclass와 Name 기준 내림차순 정렬
titanic_sorted = titanic_df.sort_values(by=['Pclass', 'Name'], ascending=False)
titanic_sorted.head(3)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S
282,283,0,3,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,345778,9.5,,S


In [4]:
print(titanic_df['Pclass'].nunique())
print(titanic_df['Survived'].nunique())
print(titanic_df['Name'].nunique())

3
2
891


In [5]:
titanic_df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [6]:
# 평균
titanic_df[['Age', 'Fare']].mean()

# 합계
titanic_df[['Age', 'Fare']].sum()

# 최솟값
titanic_df[['Age', 'Fare']].min()


Age     0.42
Fare    0.00
dtype: float64

In [None]:
# groupby 객체 생성
titanic_groupby = titanic_df.groupby('Pclass')

# Age와 Fare에 대해 count
titanic_groupby[['Age', 'Fare']].count()


In [None]:
# 최대값과 최소값을 나란히 출력
titanic_df.groupby('Pclass')['Age'].max(), titanic_df.groupby('Pclass')['Age'].min()


In [7]:
# max, min 함께 보기
titanic_df.groupby('Pclass')['Age'].agg([max, min])


  titanic_df.groupby('Pclass')['Age'].agg([max, min])
  titanic_df.groupby('Pclass')['Age'].agg([max, min])


Unnamed: 0_level_0,max,min
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80.0,0.92
2,70.0,0.67
3,74.0,0.42


In [8]:
titanic_df.groupby(['Pclass']).agg(
    age_max=('Age', 'max'),
    age_mean=('Age', 'mean'),
    fare_mean=('Fare', 'mean')
)


Unnamed: 0_level_0,age_max,age_mean,fare_mean
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80.0,38.233441,84.154687
2,70.0,29.87763,20.662183
3,74.0,25.14062,13.67555


In [None]:
agg_format = {
    'Age': 'max',
    'SibSp': 'sum',
    'Fare': 'mean'
}
titanic_df.groupby('Pclass').agg(agg_format)


In [None]:
# 이름의 길이 계산
titanic_df['Name_len'] = titanic_df['Name'].apply(lambda x: len(x))
titanic_df[['Name', 'Name_len']].head(3)


In [None]:
# 나이 기준으로 아동/성인 구분
titanic_df['Child_Adult'] = titanic_df['Age'].apply(lambda x: 'Child' if x <= 15 else 'Adult')
titanic_df[['Age', 'Child_Adult']].head(8)


In [None]:
def categorize_age(age):
    """
    나이에 따라 연령대를 분류하는 함수
    """
    if age <= 5:
        return 'Baby'
    elif age <= 12:
        return 'Child'
    elif age <= 18:
        return 'Teenager'
    elif age <= 25:
        return 'Student'
    elif age <= 35:
        return 'Young Adult'
    elif age <= 60:
        return 'Adult'
    else:
        return 'Elderly'

# 적용 및 확인
titanic_df['Age_cate'] = titanic_df['Age'].apply(categorize_age)
titanic_df[['Age', 'Age_cate']].head()


## 데이터 병합 및 변환 이론

In [9]:
import pandas as pd

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                    'B': ['B0', 'B1', 'B2']})

df2 = pd.DataFrame({'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df3 = pd.concat([df1, df2], axis=0)  # 행 병합
df4 = pd.concat([df1, df2], axis=1)  # 열 병합

print(df1)
print('\n')
print(df2)
print('\n')
print(df3)
print('\n')
print(df4)


    A   B
0  A0  B0
1  A1  B1
2  A2  B2


    C   D
0  C0  D0
1  C1  D1
2  C2  D2


     A    B    C    D
0   A0   B0  NaN  NaN
1   A1   B1  NaN  NaN
2   A2   B2  NaN  NaN
0  NaN  NaN   C0   D0
1  NaN  NaN   C1   D1
2  NaN  NaN   C2   D2


    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2


In [10]:
import pandas as pd

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})

df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df_merged = pd.merge(df1, df2, on='key', how='inner')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3


  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2


  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2


In [None]:
import pandas as pd

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})

df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df_merged = pd.merge(df1, df2, on='key', how='outer')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


In [None]:
import pandas as pd

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})

df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df_merged = pd.merge(df1, df2, on='key', how='left')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


In [None]:
import pandas as pd

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})

df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df_merged = pd.merge(df1, df2, on='key', how='right')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


In [11]:
import pandas as pd

df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2']
}, index=['K0', 'K1', 'K2'])

df2 = pd.DataFrame({
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
}, index=['K0', 'K2', 'K3'])

df_merged = df1.join(df2, how='left')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2


     C   D
K0  C0  D0
K2  C1  D1
K3  C2  D2


     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C1   D1


In [None]:
sales = pd.DataFrame({
    'customer_id': [1, 2, 3, 4],
    'product_id': [101, 102, 103, 104],
    'quantity': [5, 2, 3, 1]
})

customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'city': ['Seoul', 'Busan', 'Daegu', 'Incheon']
})

sales = sales.set_index('customer_id')
customers = customers.set_index('customer_id')

merged_data = sales.join(customers, how='left')

print(sales)
print('\n')
print(customers)
print('\n')
print(merged_data)


In [12]:
df1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'C': ['C0', 'C1', 'C2', 'C3']
})

df2 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2'],
    'C': ['C4', 'C5', 'C6'],
    'D': ['D0', 'D1', 'D2']
})

df_merged = pd.merge(df1, df2, on='key', how='inner', suffixes=('_left', '_right'))

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key   A   C
0  K0  A0  C0
1  K1  A1  C1
2  K2  A2  C2
3  K3  A3  C3


  key   C   D
0  K0  C4  D0
1  K1  C5  D1
2  K2  C6  D2


  key   A C_left C_right   D
0  K0  A0     C0      C4  D0
1  K1  A1     C1      C5  D1
2  K2  A2     C2      C6  D2


In [None]:
df1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})

df2 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2'],
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
})

df_merged = pd.merge(df1, df2, on='key', how='inner')
df_merged = df_merged.drop('B', axis=1)

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


In [13]:
df1 = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'key2': ['K4', 'K5', 'K6', 'K7'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})

df2 = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2'],
    'key2': ['K4', 'K5', 'K6'],
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
})

df_merged = pd.merge(df1, df2, on=['key1', 'key2'], how='inner')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key1 key2   A   B
0   K0   K4  A0  B0
1   K1   K5  A1  B1
2   K2   K6  A2  B2
3   K3   K7  A3  B3


  key1 key2   C   D
0   K0   K4  C0  D0
1   K1   K5  C1  D1
2   K2   K6  C2  D2


  key1 key2   A   B   C   D
0   K0   K4  A0  B0  C0  D0
1   K1   K5  A1  B1  C1  D1
2   K2   K6  A2  B2  C2  D2


In [14]:
import pandas as pd

# Titanic 데이터 CSV 파일 로드
replace_test_df = pd.read_csv("../data/titanic.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../data/titanic.csv'

In [None]:
import numpy as np

# 예: Titanic 데이터
replace_test_df['Sex'] = replace_test_df['Sex'].replace({'male': 'Man', 'female': 'Woman'})
print(replace_test_df.head(10))


In [None]:
# NaN을 특정 값으로 대체
replace_test_df['Cabin'] = replace_test_df['Cabin'].replace(np.nan, 'CXXX')

# Cabin별 값 개수 확인
print(replace_test_df['Cabin'].value_counts(dropna=False))