In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# 경고 메시지 출력 표기 생략
warnings.filterwarnings('ignore')

In [2]:
# 데이터 불러오기
mydata = pd.read_csv("https://raw.githubusercontent.com/agtechresearch/LectureAlgorithm/main/csv/married_full.csv")
mydata.head()

Unnamed: 0,gender,age,age_partner,importance_same_religion,pref_of_partner_attractive,pref_of_partner_sincere,pref_of_partner_intelligence,pref_of_partner_funny,pref_of_partner_ambitious,pref_of_partner_shared_interests,...,my_eval_sincere,my_eval_intelligence,my_eval_funny,my_eval_ambition,my_eval_shared_interests,interests_correlate,expected_happy_with_couple_match,how_much_i_liked,guess_prob_liked,married
0,female,21.0,27.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,...,9.0,7.0,7.0,6.0,5.0,0.14,3.0,7.0,6.0,0
1,female,21.0,22.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,...,8.0,7.0,8.0,5.0,6.0,0.54,3.0,7.0,5.0,0
2,female,21.0,22.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,...,8.0,9.0,8.0,5.0,7.0,0.16,3.0,7.0,5.0,1
3,female,21.0,23.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,...,6.0,8.0,7.0,6.0,8.0,0.61,3.0,7.0,6.0,1
4,female,21.0,24.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,...,6.0,7.0,7.0,6.0,6.0,0.21,3.0,6.0,6.0,1


In [3]:
# 변수가 33개가 있으므로, 모든 변수를 출력하려면 다음과 같이 설정해야 함
pd.options.display.max_columns = 35
mydata.head()

Unnamed: 0,gender,age,age_partner,importance_same_religion,pref_of_partner_attractive,pref_of_partner_sincere,pref_of_partner_intelligence,pref_of_partner_funny,pref_of_partner_ambitious,pref_of_partner_shared_interests,eval_by_partner_attractive,eval_by_partner_sincere,eval_by_partner_intelligence,eval_by_partner_funny,eval_by_partner_ambitous,eval_by_partner_shared_interests,my_pref_attractive,my_pref_sincere,my_pref_intellicence,my_pref_funny,my_pref_ambtition,my_pref_shared_interests,my_eval_attractive,my_eval_sincere,my_eval_intelligence,my_eval_funny,my_eval_ambition,my_eval_shared_interests,interests_correlate,expected_happy_with_couple_match,how_much_i_liked,guess_prob_liked,married
0,female,21.0,27.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0,15.0,20.0,20.0,15.0,15.0,15.0,6.0,9.0,7.0,7.0,6.0,5.0,0.14,3.0,7.0,6.0,0
1,female,21.0,22.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,7.0,8.0,10.0,7.0,7.0,5.0,15.0,20.0,20.0,15.0,15.0,15.0,7.0,8.0,7.0,8.0,5.0,6.0,0.54,3.0,7.0,5.0,0
2,female,21.0,22.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,10.0,10.0,10.0,10.0,10.0,10.0,15.0,20.0,20.0,15.0,15.0,15.0,5.0,8.0,9.0,8.0,5.0,7.0,0.16,3.0,7.0,5.0,1
3,female,21.0,23.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,7.0,8.0,9.0,8.0,9.0,8.0,15.0,20.0,20.0,15.0,15.0,15.0,7.0,6.0,8.0,7.0,6.0,8.0,0.61,3.0,7.0,6.0,1
4,female,21.0,24.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,8.0,7.0,9.0,6.0,9.0,7.0,15.0,20.0,20.0,15.0,15.0,15.0,5.0,6.0,7.0,7.0,6.0,6.0,0.21,3.0,6.0,6.0,1


In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(mydata, mydata['married']):
    sss_train_set = mydata.loc[train_index]
    sss_test_set = mydata.loc[test_index]

In [5]:
X_train = sss_train_set.drop("married", axis=1)
y_train = sss_train_set["married"].copy()

X_test = sss_test_set.drop("married", axis=1)
y_test = sss_test_set["married"].copy()

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_attribs = X_train.columns[1:]     # 수치형 column 지정
cat_attribs = ['gender']    # 범주형 column 지정

# 수치형 변수 파이프라인
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),      # null값 중앙값 대체
    ('std_scaler', StandardScaler()),                   # 표준화
])

# 전체 전처리 파이프라인
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),                 # 수치형 변수 파이프라인 적용
    ('cat', OneHotEncoder(drop='first'), cat_attribs),  # 범주형 변수 OHE 적용
])

X_train_scaled = full_pipeline.fit_transform(X_train)

In [16]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# random_state값은 원하시는 숫자 아무거나 넣으시면 됩니다.
imputer_mice = IterativeImputer(random_state=83)
x_train = imputer_mice.fit_transform(X_train_scaled)

In [18]:
x_train = pd.DataFrame(x_train, columns=X_train.columns)

In [15]:
imputer_mice = IterativeImputer(random_state=83)
X_test = imputer_mice.fit_transform(X_test)

Unnamed: 0,gender,age,age_partner,importance_same_religion,pref_of_partner_attractive,pref_of_partner_sincere,pref_of_partner_intelligence,pref_of_partner_funny,pref_of_partner_ambitious,pref_of_partner_shared_interests,eval_by_partner_attractive,eval_by_partner_sincere,eval_by_partner_intelligence,eval_by_partner_funny,eval_by_partner_ambitous,eval_by_partner_shared_interests,my_pref_attractive,my_pref_sincere,my_pref_intellicence,my_pref_funny,my_pref_ambtition,my_pref_shared_interests,my_eval_attractive,my_eval_sincere,my_eval_intelligence,my_eval_funny,my_eval_ambition,my_eval_shared_interests,interests_correlate,expected_happy_with_couple_match,how_much_i_liked,guess_prob_liked
0,0.463693,-0.65716,1.920094,-0.594558,0.37649,-0.790534,0.410735,0.693468,0.505295,0.432928,-0.09006,-0.226349,1.350353,1.296494,1.716349,-0.2026,0.3728,-0.044232,-1.224339,-0.102134,1.296082,-1.154213,-0.690963,-0.243517,-2.334349,-2.811928,-2.262102,-0.418213,-0.311352,-1.181458,-1.540303,1.0
1,-0.384368,1.880797,1.562187,-0.194905,1.089894,0.698979,-1.220752,0.693468,-1.079912,0.432928,-1.248539,-0.226349,-1.778602,-0.452119,-1.730336,0.115151,-0.336191,-0.044232,0.410769,-0.431007,0.188351,-0.104115,0.488983,-0.243517,-0.757157,-1.055419,-0.275729,-0.085922,0.263009,-0.076847,-0.581876,1.0
2,0.463693,-0.939155,0.846372,-0.594558,-0.336914,-0.045777,-0.405008,1.506342,0.505295,-1.130822,1.647659,1.741653,1.871845,1.879365,1.223966,-0.2026,0.3728,-0.782276,-0.406785,-0.924317,2.087318,-0.104115,0.488983,0.417422,0.294304,-0.469917,0.220865,1.077099,0.263009,-0.076847,-0.102663,1.0
3,-0.667055,-0.375165,-0.943166,-1.066947,0.116811,1.037098,0.113804,1.210456,-0.431562,-0.088322,0.48918,0.429651,-0.735617,-0.452119,0.731582,0.591776,-1.045182,-0.044232,-1.224339,-0.102134,1.296082,1.471032,1.078956,0.417422,1.345765,-1.055419,0.717458,-1.415088,-0.311352,1.580069,0.855764,1.0
4,-0.101681,0.188826,-0.943166,1.403705,0.37649,-0.045777,-1.220752,-0.119406,-1.872515,0.432928,-0.669299,-0.88235,0.307368,-0.452119,0.239199,0.591776,-1.045182,-1.520319,2.045877,-0.102134,-0.286391,-2.729361,-1.280936,-1.565395,-2.86008,-1.055419,-2.262102,-1.780609,-0.311352,-2.838373,-2.019517,0.0


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# random_state값은 원하시는 숫자 아무거나 넣으시면 됩니다.
imputer_mice = IterativeImputer(random_state=83)
numeric_data = imputer_mice.fit_transform(newdata)