# 라이브러리 로드

In [141]:
# 구글 드라이브 연결(데이터 로드를 위해서)
from google.colab import drive

drive.mount('/content/data')

Mounted at /content/data


## 데이터 분석용 라이브러리

In [142]:
# 데이터 분석에 사용할 라이브러리
import pandas as pd
import numpy as np

In [143]:
import logging

logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)

## 데이터 시각화용 라이브러리

In [144]:
# 코랩(Colab) 한글 깨짐 방지
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 12 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/

In [145]:
# 데이터 시각화에 사용할 라이브러리
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# 브라우저에서 바로 그려지도록
%matplotlib inline

# 그래프에 retina display 적용
%config InlineBackend.figure_format = 'retina'

# Colab 의 한글 폰트 설정
plt.rc('font', family='NanumBarunGothic')

# 유니코드에서  음수 부호설정
mpl.rc('axes', unicode_minus=False)

# 데이터 로드
- [타이타닉 데이터 로드](https://www.kaggle.com/c/titanic/)

In [146]:
DATA_PATH = "/content/data/MyDrive/google_lecture/data/"

df = pd.read_csv(DATA_PATH+"Titanic.csv")

In [147]:
df.shape, df.columns

((891, 12),
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

In [148]:
# Normalise every column label to lower case via the vectorised string accessor.
df.columns = df.columns.str.lower()
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

## 데이터 분리

In [149]:
from sklearn.model_selection import train_test_split

In [150]:
SEED = 42

X_tr, X_te = train_test_split(df, random_state=SEED, test_size = 0.2)
X_tr = X_tr.reset_index(drop=True)
X_te = X_te.reset_index(drop=True)

X_tr.shape, X_te.shape

((712, 12), (179, 12))

In [151]:
X_tr.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
1,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
2,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
3,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [152]:
X_tr.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [153]:
new_survived = pd.Categorical(X_tr['survived'])
new_survived = new_survived.rename_categories(["Died","Survived"])
print(new_survived[:5])
new_survived.describe()

['Died', 'Died', 'Died', 'Died', 'Died']
Categories (2, object): ['Died', 'Survived']


Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,444,0.623596
Survived,268,0.376404


## 데이터 확인

In [154]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  712 non-null    int64  
 1   survived     712 non-null    int64  
 2   pclass       712 non-null    int64  
 3   name         712 non-null    object 
 4   sex          712 non-null    object 
 5   age          572 non-null    float64
 6   sibsp        712 non-null    int64  
 7   parch        712 non-null    int64  
 8   ticket       712 non-null    object 
 9   fare         712 non-null    float64
 10  cabin        159 non-null    object 
 11  embarked     710 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.9+ KB


In [155]:
X_tr.describe(include="all")

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
count,712.0,712.0,712.0,712,712,572.0,712.0,712.0,712,712.0,159,710
unique,,,,712,2,,,,558,,117,3
top,,,,"Partner, Mr. Austen",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,467,,,,7,,4,525
mean,448.234551,0.376404,2.330056,,,29.498846,0.553371,0.379213,,32.586276,,
std,256.731423,0.484824,0.824584,,,14.500059,1.176404,0.791669,,51.969529,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,224.75,0.0,2.0,,,21.0,0.0,0.0,,7.925,,
50%,453.5,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,673.5,1.0,3.0,,,38.0,1.0,0.0,,30.5,,


In [156]:
X_tr.describe(include=np.number)

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare
count,712.0,712.0,712.0,572.0,712.0,712.0,712.0
mean,448.234551,0.376404,2.330056,29.498846,0.553371,0.379213,32.586276
std,256.731423,0.484824,0.824584,14.500059,1.176404,0.791669,51.969529
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,224.75,0.0,2.0,21.0,0.0,0.0,7.925
50%,453.5,0.0,3.0,28.0,0.0,0.0,14.4542
75%,673.5,1.0,3.0,38.0,1.0,0.0,30.5
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [157]:
X_tr.describe(exclude=np.number)

Unnamed: 0,name,sex,ticket,cabin,embarked
count,712,712,712,159,710
unique,712,2,558,117,3
top,"Partner, Mr. Austen",male,CA. 2343,C23 C25 C27,S
freq,1,467,7,4,525


In [158]:
X_tr.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
1,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
2,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
3,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [159]:
X_tr.tail()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
707,107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.65,,S
708,271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0,,S
709,861,0,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
710,436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S
711,103,0,1,"White, Mr. Richard Frasar",male,21.0,0,1,35281,77.2875,D26,S


## 타겟 데이터 확인

In [160]:
new_survived = pd.Categorical(X_tr["survived"])
new_survived = new_survived.rename_categories(["Died","Survived"])

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,444,0.623596
Survived,268,0.376404


# Data Cleaning

## 필요없는 데이터

In [161]:
X_tr['passengerid'].nunique(), X_tr.shape[0]

(712, 712)

In [162]:
# passengerid has one distinct value per training row (a pure identifier),
# so it is removed from both splits.
X_tr = X_tr.drop(columns='passengerid')
X_te = X_te.drop(columns='passengerid')

X_tr.columns

Index(['survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked'],
      dtype='object')

### 결측치 처리

In [163]:
(X_tr.isnull().sum() / X_tr.shape[0]).round(4).sort_values(ascending=False) # fraction of missing values per column, largest first

cabin       0.7767
age         0.1966
embarked    0.0028
survived    0.0000
pclass      0.0000
name        0.0000
sex         0.0000
sibsp       0.0000
parch       0.0000
ticket      0.0000
fare        0.0000
dtype: float64

In [164]:
# Report shape and total missing-value count before and after removing 'cabin'.
print(f'before: {X_tr.shape} / isnull().sum(): {X_tr.isnull().sum().sum()}')
# Alternative not used here: drop the rows that contain missing values with X_tr.dropna(axis=0).
# Instead, the whole 'cabin' column is dropped from both splits.
X_tr = X_tr.drop('cabin', axis=1)
X_te = X_te.drop('cabin', axis=1)
print(f'after: {X_tr.shape} / isnull().sum(): {X_tr.isnull().sum().sum()}')

before: (712, 11) / isnull().sum(): 695
after: (712, 10) / isnull().sum(): 142


In [165]:
# Compute the imputation value once, from the training split only and before the
# training column is modified, then apply that same value to both splits.
# (Previously the test fill re-read the median from the already-imputed column.)
age_median = X_tr['age'].median()

X_tr['age'] = X_tr['age'].fillna(age_median)
X_te['age'] = X_te['age'].fillna(age_median)

In [166]:
embarked_mode = X_tr['embarked'].mode().values[0]

X_tr['embarked'] = X_tr['embarked'].fillna(embarked_mode)
X_te['embarked'] = X_te['embarked'].fillna(embarked_mode)

In [167]:
X_tr.isnull().sum().sum(), X_te.isnull().sum().sum()

(0, 0)

# Feature Extraction
> 기존 Feature에 기반하여 새로운 Feature들을 생성

## 데이터 타입

In [168]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  712 non-null    int64  
 1   pclass    712 non-null    int64  
 2   name      712 non-null    object 
 3   sex       712 non-null    object 
 4   age       712 non-null    float64
 5   sibsp     712 non-null    int64  
 6   parch     712 non-null    int64  
 7   ticket    712 non-null    object 
 8   fare      712 non-null    float64
 9   embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 55.8+ KB


#### 수치형 데이터 타입 변환

In [169]:
df_number = X_tr.select_dtypes(include=np.number)
df_number.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [170]:
df_number.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  712 non-null    int64  
 1   pclass    712 non-null    int64  
 2   age       712 non-null    float64
 3   sibsp     712 non-null    int64  
 4   parch     712 non-null    int64  
 5   fare      712 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 33.5 KB


In [171]:
df_number.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,1,45.5,0,0,28.5
1,0,2,23.0,0,0,13.0
2,0,3,32.0,0,0,7.925
3,0,3,26.0,1,0,7.8542
4,0,3,6.0,4,2,31.275


In [172]:
# survived
X_tr["survived"] = X_tr["survived"].astype("int32")
X_te["survived"] = X_te["survived"].astype("int32")

In [173]:
# pclass
X_tr['pclass'].unique()

array([1, 2, 3])

In [174]:
X_tr["pclass"] = X_tr["pclass"].astype("category")
X_te["pclass"] = X_te["pclass"].astype("category")

In [175]:
# age
X_tr["age"] = X_tr["age"].astype("int32")
X_te["age"] = X_te["age"].astype("int32")

In [176]:
# sibsp
X_tr['sibsp'].unique()

array([0, 1, 4, 3, 2, 8, 5])

In [177]:
X_tr["sibsp"] = X_tr["sibsp"].astype("category")
X_te["sibsp"] = X_te["sibsp"].astype("category")

In [178]:
# parch
X_tr['parch'].unique()

array([0, 2, 1, 6, 4, 3, 5])

In [179]:
X_tr["parch"] = X_tr["parch"].astype("category")
X_te["parch"] = X_te["parch"].astype("category")

In [180]:
# fare
X_tr["fare"] = X_tr["fare"].astype("float32")
X_te["fare"] = X_te["fare"].astype("float32")

In [181]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  712 non-null    int32   
 1   pclass    712 non-null    category
 2   name      712 non-null    object  
 3   sex       712 non-null    object  
 4   age       712 non-null    int32   
 5   sibsp     712 non-null    category
 6   parch     712 non-null    category
 7   ticket    712 non-null    object  
 8   fare      712 non-null    float32 
 9   embarked  712 non-null    object  
dtypes: category(3), float32(1), int32(2), object(4)
memory usage: 33.6+ KB


#### 범주형 데이터 타입 변환

In [182]:
df_object = X_tr.select_dtypes(include='object')
df_object.columns

Index(['name', 'sex', 'ticket', 'embarked'], dtype='object')

In [183]:
df_object.head()

Unnamed: 0,name,sex,ticket,embarked
0,"Partner, Mr. Austen",male,113043,S
1,"Berriman, Mr. William John",male,28425,S
2,"Tikkanen, Mr. Juho",male,STON/O 2. 3101293,S
3,"Hansen, Mr. Henrik Juul",male,350025,S
4,"Andersson, Miss. Ebba Iris Alfrida",female,347082,S


In [184]:
X_tr["sex"] = X_tr["sex"].astype("category")
X_te["sex"] = X_te["sex"].astype("category")

In [185]:
X_tr["embarked"] = X_tr["embarked"].astype("category")
X_te["embarked"] = X_te["embarked"].astype("category")

In [186]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  712 non-null    int32   
 1   pclass    712 non-null    category
 2   name      712 non-null    object  
 3   sex       712 non-null    category
 4   age       712 non-null    int32   
 5   sibsp     712 non-null    category
 6   parch     712 non-null    category
 7   ticket    712 non-null    object  
 8   fare      712 non-null    float32 
 9   embarked  712 non-null    category
dtypes: category(5), float32(1), int32(2), object(2)
memory usage: 24.1+ KB


In [187]:
X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S


## 문자열

In [188]:
df_object = X_tr.select_dtypes(include='object')
df_object.columns

Index(['name', 'ticket'], dtype='object')

In [189]:
df_object.head()

Unnamed: 0,name,ticket
0,"Partner, Mr. Austen",113043
1,"Berriman, Mr. William John",28425
2,"Tikkanen, Mr. Juho",STON/O 2. 3101293
3,"Hansen, Mr. Henrik Juul",350025
4,"Andersson, Miss. Ebba Iris Alfrida",347082


In [190]:
df_object.describe()

Unnamed: 0,name,ticket
count,712,712
unique,712,558
top,"Partner, Mr. Austen",CA. 2343
freq,1,7


### 공백제거
> 앞뒤 공백 제거: `strip()` (왼쪽만 제거하려면 `lstrip()`, 오른쪽만 제거하려면 `rstrip()`)

In [191]:
# Trim leading/trailing whitespace from the free-text columns of both splits.
# The vectorised .str accessor replaces the per-element lambda and leaves
# missing values as NaN instead of raising on them.
X_tr["name"] = X_tr["name"].str.strip()
X_tr["ticket"] = X_tr["ticket"].str.strip()

X_te["name"] = X_te["name"].str.strip()
X_te["ticket"] = X_te["ticket"].str.strip()

In [192]:
# df_object was copied out of X_tr before the whitespace strip, so it would show
# stale values; inspect the stripped columns on X_tr itself.
X_tr[["name", "ticket"]].head()

Unnamed: 0,name,ticket
0,"Partner, Mr. Austen",113043
1,"Berriman, Mr. William John",28425
2,"Tikkanen, Mr. Juho",STON/O 2. 3101293
3,"Hansen, Mr. Henrik Juul",350025
4,"Andersson, Miss. Ebba Iris Alfrida",347082


### 문자열 포함 여부

In [193]:
# Lookup table: title token as it appears in a passenger name (including the
# trailing period) -> Korean description of that title.
dict_designation = {
    'Mr.': '남성',
    'Master.': '남성',
    'Sir.': '남성',
    'Miss.': '미혼 여성',
    'Mrs.': '기혼 여성',
    'Ms.': '미혼/기혼 여성',
    'Lady.': '숙녀',
    'Mlle.': '아가씨',
    # occupation
    'Dr.': '의사',
    'Rev.': '목사',
    'Major.': '계급',
    'Don.': '교수',
    'Col.': '군인',
    'Capt.': '군인',
    # nobility
    'Mme.': '영부인',
    'Countess.': '백작부인',
    'Jonkheer.': '귀족'
}

In [194]:
dict_designation.keys()

dict_keys(['Mr.', 'Master.', 'Sir.', 'Miss.', 'Mrs.', 'Ms.', 'Lady.', 'Mlle.', 'Dr.', 'Rev.', 'Major.', 'Don.', 'Col.', 'Capt.', 'Mme.', 'Countess.', 'Jonkheer.'])

In [195]:
# X_tr['name'].map(lambda x: x) ->
x = 'Andersson, Miss. Ebba Iris Alfrida	'
x

'Andersson, Miss. Ebba Iris Alfrida\t'

In [196]:
'Mr.' in x

False

In [197]:
'Miss.' in x

True

In [198]:
# Find the first title key contained in the sample name `x`.
# The default is assigned once, before the loop, so `result` is defined even
# when dict_designation is empty and is not reset on every iteration.
result = 'unknown'
for key in dict_designation:
  if key in x:
    result = key
    break

print(result)

Miss.


In [199]:
# Lookup table: title token as it appears in a passenger name (including the
# trailing period) -> Korean description of that title.
dict_designation = {
    'Mr.': '남성',
    'Master.': '남성',
    'Sir.': '남성',
    'Miss.': '미혼 여성',
    'Mrs.': '기혼 여성',
    'Ms.': '미혼/기혼 여성',
    'Lady.': '숙녀',
    'Mlle.': '아가씨',
    # occupation
    'Dr.': '의사',
    'Rev.': '목사',
    'Major.': '계급',
    'Don.': '교수',
    'Col.': '군인',
    'Capt.': '군인',
    # nobility
    'Mme.': '영부인',
    'Countess.': '백작부인',
    'Jonkheer.': '귀족'
}

def add_designation(name):
  """Return the title key (e.g. 'Mr.') that a passenger name carries.

  The title is first looked up by position: the last word between the first
  comma and the following period, as in "Surname, Title. Given names". This
  stops a title-like token elsewhere in the string (for example inside a
  parenthesised alias) from shadowing the real title. If the positional token
  is not a key of dict_designation, the function falls back to returning the
  first key, in dictionary order, that occurs anywhere in the name.

  Args:
    name: Passenger name string.

  Returns:
    A key of dict_designation, or "unknown" when no key matches or when
    `name` is not a string (e.g. a missing value).
  """
  if not isinstance(name, str):
    return "unknown"

  # Positional parse: "<surname>, <...> <Title>. <rest>"
  _, comma, after_comma = name.partition(',')
  if comma:
    before_period, period, _ = after_comma.partition('.')
    words = before_period.split()
    if period and words:
      candidate = words[-1] + '.'
      if candidate in dict_designation:
        return candidate

  # Fallback: first known title that appears anywhere in the name.
  for key in dict_designation:
    if key in name:
      return key
  return "unknown"

X_tr['designation'] = X_tr['name'].map(lambda x: add_designation(x))
X_te['designation'] = X_te['name'].map(lambda x: add_designation(x))

X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.


In [200]:
cond = X_tr['designation'] == "unknown"
X_tr.loc[cond].head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation


In [201]:
X_tr[X_tr['designation'] == "unknown"].shape

(0, 11)

In [202]:
cond = X_te['designation'] == "unknown"
X_te.loc[cond].head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation


### 문자열 분리

In [203]:
# 1. Mr. 이런거 삭제... -> replace()
# 2. , 이걸로 나누기.... -> split()
# 3. 라스트 네임 추출
# 4. 새로운 컬럼에 적용

def get_last_name(name, titles=None):
  """Return the text after the first comma of a name, with title tokens removed.

  Every title from `titles` that occurs in `name` is deleted, then the segment
  between the first and second comma is returned with surrounding whitespace
  stripped. NOTE(review): for the "Surname, Title. Given names" layout seen in
  the sample rows this segment is the given-name part, despite the function
  name - confirm the intended column naming.

  Args:
    name: Passenger name string.
    titles: Optional iterable of title tokens to remove. Defaults to the keys
      of the module-level dict_designation.

  Returns:
    The cleaned segment, or None when `name` is not a string, contains none of
    the titles, or has no comma.
  """
  if titles is None:
    titles = dict_designation.keys()
  if not isinstance(name, str):
    return None

  found_title = False
  for key in titles: # try every known title token
    if key in name:
      name = name.replace(key, '') # remove all occurrences of that token
      found_title = True
  if not found_title:
    return None

  # Split once, after all titles are removed, instead of on every iteration.
  parts = name.split(',')
  if len(parts) < 2:
    return None
  return parts[1].strip()

X_tr['last_name'] = X_tr['name'].map(lambda x: get_last_name(x))
X_te['last_name'] = X_te['name'].map(lambda x: get_last_name(x))

X_tr[['name', 'last_name']].head()

Unnamed: 0,name,last_name
0,"Partner, Mr. Austen",Austen
1,"Berriman, Mr. William John",William John
2,"Tikkanen, Mr. Juho",Juho
3,"Hansen, Mr. Henrik Juul",Henrik Juul
4,"Andersson, Miss. Ebba Iris Alfrida",Ebba Iris Alfrida


In [204]:
# X_tr['last_name'] = X_tr['name'].map(lambda x: x.split(',')[1].split('.')[1])
# X_te['last_name'] = X_te['name'].map(lambda x: x.split(',')[1].split('.')[1])

In [205]:
X_tr['first_name'] = X_tr['name'].map(lambda x: x.split(',')[0])
X_te['first_name'] = X_te['name'].map(lambda x: x.split(',')[0])

X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Austen,Partner
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,William John,Berriman
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Juho,Tikkanen
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Henrik Juul,Hansen
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Ebba Iris Alfrida,Andersson


In [206]:
X_tr.tail()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name
707,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,S,Miss.,Anna Kristine,Salkjelsvik
708,0,1,"Cairns, Mr. Alexander",male,28,0,0,113798,31.0,S,Mr.,Alexander,Cairns
709,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,S,Mr.,Claus Peter,Hansen
710,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120.0,S,Miss.,Lucile Polk,Carter
711,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.287498,S,Mr.,Richard Frasar,White


In [207]:
X_tr['ticket']

0                 113043
1                  28425
2      STON/O 2. 3101293
3                 350025
4                 347082
             ...        
707               343120
708               113798
709               350026
710               113760
711                35281
Name: ticket, Length: 712, dtype: object

In [208]:
def add_ticket_number(ticket):
  """Return the trailing numeric part of a ticket string as an int.

  The ticket is split on single spaces and the last piece is converted with
  int(), so "STON/O 2. 3101293" gives 3101293.

  Args:
    ticket: Ticket identifier string.

  Returns:
    The parsed number, or 0 when `ticket` is not a string or its last piece is
    not an integer (e.g. the ticket "LINE").
  """
  if not isinstance(ticket, str):
    return 0
  try:
    return int(ticket.split(' ')[-1])
  except ValueError:
    return 0 # last piece is not numeric, e.g. ticket "LINE"

X_tr['ticket_number'] = X_tr['ticket'].map(lambda x: add_ticket_number(x)).astype("int32")
X_te['ticket_number'] = X_te['ticket'].map(lambda x: add_ticket_number(x)).astype("int32")

X_tr[['ticket_number', 'ticket']].head()

Unnamed: 0,ticket_number,ticket
0,113043,113043
1,28425,28425
2,3101293,STON/O 2. 3101293
3,350025,350025
4,347082,347082


In [209]:
X_tr[['ticket_number', 'ticket']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ticket_number  712 non-null    int32 
 1   ticket         712 non-null    object
dtypes: int32(1), object(1)
memory usage: 8.5+ KB


## 집계

### 피봇 테이블

In [210]:
X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name,ticket_number
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Austen,Partner,113043
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,William John,Berriman,28425
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Juho,Tikkanen,3101293
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Henrik Juul,Hansen,350025
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Ebba Iris Alfrida,Andersson,347082


In [211]:
# Mean fare per passenger class on the training split, as a flat two-column
# frame ready to be merged back on 'pclass'.
df_pivot = (
    pd.pivot_table(X_tr, index='pclass', values='fare', aggfunc='mean')
    .reset_index()
    .rename(columns={'fare': 'fare_mean_by_pclass'})
)
df_pivot

Unnamed: 0,pclass,fare_mean_by_pclass
0,1,89.253914
1,2,20.575939
2,3,13.934861


In [212]:
print(f'before: {X_tr.shape}')
X_tr = pd.merge(X_tr,df_pivot,how="left",on="pclass")
X_te = pd.merge(X_te,df_pivot,how="left",on="pclass")
print(f'after: {X_tr.shape}')
X_tr.head()

before: (712, 14)
after: (712, 15)


Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name,ticket_number,fare_mean_by_pclass
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Austen,Partner,113043,89.253914
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,William John,Berriman,28425,20.575939
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Juho,Tikkanen,3101293,13.934861
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Henrik Juul,Hansen,350025,13.934861
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Ebba Iris Alfrida,Andersson,347082,13.934861


### 그룹

In [213]:
agg_dict = {"survived" : "mean" , "sibsp" : "nunique", "parch" : "nunique" }
df_groupby = X_tr.groupby("pclass").agg(agg_dict).reset_index()
df_groupby

Unnamed: 0,pclass,survived,sibsp,parch
0,1,0.607362,4,4
1,2,0.483444,4,4
2,3,0.241206,7,7


In [214]:
# Per-class aggregates on the training split: mean of 'survived' and the number
# of distinct 'sibsp' / 'parch' values, renamed so they can be merged on 'pclass'.
agg_dict = {"survived" : "mean" , "sibsp" : "nunique", "parch" : "nunique" }
renamed_columns = {
    'survived' : 'survived_by_pclass',
    'sibsp' : 'len_sibsp_by_pclass',
    'parch' : 'len_parch_by_pclass',
}
df_groupby = (
    X_tr.groupby("pclass")
    .agg(agg_dict)
    .reset_index()
    .rename(columns=renamed_columns)
)
df_groupby

Unnamed: 0,pclass,survived_by_pclass,len_sibsp_by_pclass,len_parch_by_pclass
0,1,0.607362,4,4
1,2,0.483444,4,4
2,3,0.241206,7,7


In [215]:
print(f'before: {X_tr.shape}')
X_tr = pd.merge(X_tr,df_groupby,how="left",on="pclass")
X_te = pd.merge(X_te,df_groupby,how="left",on="pclass")
print(f'after: {X_tr.shape}')
X_tr.head()

before: (712, 15)
after: (712, 18)


Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name,ticket_number,fare_mean_by_pclass,survived_by_pclass,len_sibsp_by_pclass,len_parch_by_pclass
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Austen,Partner,113043,89.253914,0.607362,4,4
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,William John,Berriman,28425,20.575939,0.483444,4,4
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Juho,Tikkanen,3101293,13.934861,0.241206,7,7
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Henrik Juul,Hansen,350025,13.934861,0.241206,7,7
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Ebba Iris Alfrida,Andersson,347082,13.934861,0.241206,7,7


## 데이터 변환/조합
> apply(), map() 등 사용

In [216]:
def sub_age(age):
  """Return the decade bucket of an age, i.e. the floor of age divided by 10."""
  decade, _ = divmod(age, 10)
  return decade

X_tr['sub_age'] = X_tr['age'].map(lambda x: sub_age(x))
X_te['sub_age'] = X_te['age'].map(lambda x: sub_age(x))
X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name,ticket_number,fare_mean_by_pclass,survived_by_pclass,len_sibsp_by_pclass,len_parch_by_pclass,sub_age
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Austen,Partner,113043,89.253914,0.607362,4,4,4
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,William John,Berriman,28425,20.575939,0.483444,4,4,2
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Juho,Tikkanen,3101293,13.934861,0.241206,7,7,3
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Henrik Juul,Hansen,350025,13.934861,0.241206,7,7,2
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Ebba Iris Alfrida,Andersson,347082,13.934861,0.241206,7,7,0


In [217]:
def add_sub_embarked(row):
  """Concatenate the row's embarked, pclass, sibsp and parch values into one string.

  Each value is converted with str() and joined in that order without a
  separator, e.g. 'S' / 3 / 4 / 2 becomes 'S342'.
  """
  key_columns = ('embarked', 'pclass', 'sibsp', 'parch')
  return ''.join(str(row[column]) for column in key_columns)

X_tr['sub_embarked'] = X_tr.apply(lambda row: add_sub_embarked(row), axis=1)
X_te['sub_embarked'] = X_te.apply(lambda row: add_sub_embarked(row), axis=1)
X_tr.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,designation,last_name,first_name,ticket_number,fare_mean_by_pclass,survived_by_pclass,len_sibsp_by_pclass,len_parch_by_pclass,sub_age,sub_embarked
0,0,1,"Partner, Mr. Austen",male,45,0,0,113043,28.5,S,Mr.,Austen,Partner,113043,89.253914,0.607362,4,4,4,S100
1,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13.0,S,Mr.,William John,Berriman,28425,20.575939,0.483444,4,4,2,S200
2,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,S,Mr.,Juho,Tikkanen,3101293,13.934861,0.241206,7,7,3,S300
3,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,S,Mr.,Henrik Juul,Hansen,350025,13.934861,0.241206,7,7,2,S310
4,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,S,Miss.,Ebba Iris Alfrida,Andersson,347082,13.934861,0.241206,7,7,0,S342


## 날짜

In [218]:
DATA_PATH = "/content/data/MyDrive/google_lecture/data/"

df_cinemaTicket = pd.read_csv(DATA_PATH+"cinemaTicket_Ref.csv")
df_cinemaTicket.shape

(142524, 14)

In [219]:
df_cinemaTicket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142524 entries, 0 to 142523
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   film_code     142524 non-null  int64  
 1   cinema_code   142524 non-null  int64  
 2   total_sales   142524 non-null  int64  
 3   tickets_sold  142524 non-null  int64  
 4   tickets_out   142524 non-null  int64  
 5   show_time     142524 non-null  int64  
 6   occu_perc     142399 non-null  float64
 7   ticket_price  142524 non-null  float64
 8   ticket_use    142524 non-null  int64  
 9   capacity      142399 non-null  float64
 10  date          142524 non-null  object 
 11  month         142524 non-null  int64  
 12  quarter       142524 non-null  int64  
 13  day           142524 non-null  int64  
dtypes: float64(3), int64(10), object(1)
memory usage: 15.2+ MB


### datetime 적용

In [220]:
df_cinemaTicket["date"] = pd.to_datetime(df_cinemaTicket["date"])
df_cinemaTicket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142524 entries, 0 to 142523
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   film_code     142524 non-null  int64         
 1   cinema_code   142524 non-null  int64         
 2   total_sales   142524 non-null  int64         
 3   tickets_sold  142524 non-null  int64         
 4   tickets_out   142524 non-null  int64         
 5   show_time     142524 non-null  int64         
 6   occu_perc     142399 non-null  float64       
 7   ticket_price  142524 non-null  float64       
 8   ticket_use    142524 non-null  int64         
 9   capacity      142399 non-null  float64       
 10  date          142524 non-null  datetime64[ns]
 11  month         142524 non-null  int64         
 12  quarter       142524 non-null  int64         
 13  day           142524 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(10)
memory usage: 15.2 MB


In [221]:
df_cinemaTicket["date"][:5]

0   2018-05-05
1   2018-05-05
2   2018-05-05
3   2018-05-05
4   2018-05-05
Name: date, dtype: datetime64[ns]

In [222]:
df_cinemaTicket["date"].dt.year[:5] # 연도

0    2018
1    2018
2    2018
3    2018
4    2018
Name: date, dtype: int64

In [223]:
df_cinemaTicket["date"].dt.month[:5] # 월

0    5
1    5
2    5
3    5
4    5
Name: date, dtype: int64

In [224]:
df_cinemaTicket["date"].dt.day[:5] # 일

0    5
1    5
2    5
3    5
4    5
Name: date, dtype: int64

In [225]:
df_cinemaTicket["date"].dt.quarter[:5] # 분기

0    2
1    2
2    2
3    2
4    2
Name: date, dtype: int64

In [226]:
df_cinemaTicket["date"].dt.weekday[:5] # 요일: 0 ~ 6(월요일 ~ 일요일)

0    5
1    5
2    5
3    5
4    5
Name: date, dtype: int64

In [227]:
df_cinemaTicket["date"].dt.dayofyear[:5] # 연기준 몇일째인지..

0    125
1    125
2    125
3    125
4    125
Name: date, dtype: int64

## 진행바

In [228]:
!pip install tqdm



In [229]:
from tqdm.auto import tqdm

In [230]:
i=0
for i in tqdm(np.random.rand(10000000)):
    i = i**2

  0%|          | 0/10000000 [00:00<?, ?it/s]

### with pandas

In [231]:
tqdm.pandas() # 판다스에서 progress_apply 메소드를 사용할수 있게 된다.

In [232]:
import time

def do_apply(x):
    """Return ``x`` unchanged after an artificial 10 ms pause."""
    pause_seconds = 0.01
    time.sleep(pause_seconds)
    return x
tmp = df.progress_apply(do_apply,axis = 1)

  0%|          | 0/891 [00:00<?, ?it/s]