In [2]:
# Titanic Data Set을 이용해서 이진분류(Logistic Regression)을 구현

# 필요한 module import

import numpy as np
import pandas as pd
from sklearn import linear_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import Adam

#Raw Data Lodaing
titanic = pd.read_csv('/content/drive/MyDrive/빅데이터 과정 폴더/data/titanic/train.csv')
display(titanic.head(),titanic.shape) #(891, 12)

# 사용하지 않은 컬럼부터 제거
titanic.drop(['PassengerId','Name','Ticket','Fare','Cabin'],
             axis=1, inplace=True)
display(titanic.head(3),titanic.shape)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


(891, 12)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S


(891, 7)

In [None]:
# 데이터 전처리
gender_dict ={'male':0,
              'female':1}

titanic['Sex'] = titanic['Sex'].map(gender_dict)
display(titanic.head(3),titanic.shape)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,22.0,1,0,S
1,1,1,1,38.0,1,0,C
2,1,3,1,26.0,0,0,S


(891, 7)

In [None]:
# 가족처리
titanic['Family']=titanic['SibSp'] +titanic['Parch']
titanic.drop(['SibSp','Parch'], axis=1, inplace=True)
display(titanic.head(3),titanic.shape)

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,22.0,S,1
1,1,1,1,38.0,C,1
2,1,3,1,26.0,S,0


(891, 6)

In [None]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       714 non-null    float64
 4   Embarked  889 non-null    object 
 5   Family    891 non-null    int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 41.9+ KB


In [None]:
# Embarked 컬럼에 결측치가 2개가 있어요
# 결측치를 찾아서 'Q'로 대체
titanic['Embarked'] = titanic['Embarked'].fillna('Q')

# Embarked 컬럼의 영문자를 숫자로 바꿔줄거임
embarked_mapping = {'S':0,
                   'C':1,
                   'Q':2}

titanic['Embarked'] = titanic['Embarked'].map(embarked_mapping)
display(titanic.head(3),titanic.shape)

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,22.0,0,1
1,1,1,1,38.0,1,1
2,1,3,1,26.0,0,0


(891, 6)

In [None]:
# 나이 중에 NaN이 있음
# 처리해야 해요... 그런데 NaN위 비중이 20%
# 이런경우에는 삭제하면 당연히 좋지 않아요
# 머신러닝의 결과가 많이 달라질 수 있어요. 그래서 조심
# NaN값을 적절하게 대체해서 사용
# 다른 머신러닝 기법을 이용해서 이 NaN이 어떤값이 되면 좋을지를 추측하는 방법
# 다른 방법은 전통적인 통계치를 이용하는 방법이 있고 일반적으로 많이 사용
# 평균값을 사용

# 가장 간단한 방법은 나이의 평균값을 구해서 NaN을 채우는 방법
# 여자는 여자의 나이 평균을 구해서 채우고 남자는 남자의 나이 평균을 구해서 채워줌
titanic['Age']= titanic['Age'].fillna(titanic['Age'].mean())

In [None]:
# 나이는 어떻게 처리하면 좋을까요?
# 그냥 실제값을 사용하는게 좋을까요?-> 좋지 않을거 같아요
# binning 처리를해요(구간을 이용해서 구간에 어떤 구간에 포함되는지를 명시)
# 8살미만이면 0, 8살부터 20살까지면 1

# 8살 미만 0으로 대체
titanic.loc[titanic['Age'] < 8,:'Age']=0
titanic.loc[(titanic['Age'] >= 8) & (titanic['Age'] < 20),'Age'] =1
titanic.loc[(titanic['Age'] >= 20) & (titanic['Age'] < 65),'Age'] =2
titanic.loc[titanic['Age'] >= 65,'Age'] =3
display(titanic.head(3),titanic.shape)

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,2.0,0,1
1,1,1,1,2.0,1,1
2,1,3,1,2.0,0,0


(891, 6)

In [None]:
# 데이터 전처리 끝
# Training data set 준비
x_data =titanic.drop('Survived',axis=1, inplace=False).values
t_data = titanic['Survived'].values.reshape(-1,1)
display(titanic)
# training data 와 test data로 분리
# 그런 다음 training data를 다시 training과 validation용으로 다시 나누어야 해요
# 그런데 이런 validation 데이터를 나눠서 중간평가를 하는건 Keras가 대신 해줘요
# 따라서 특별한 경우가 아니면 validation data는 keras를 이용해서 처리

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,0,0.0,0.0,0,1
1,0,0,0.0,0.0,1,1
2,0,0,0.0,0.0,0,0
3,0,0,0.0,0.0,0,1
4,0,0,0.0,0.0,0,0
...,...,...,...,...,...,...
886,0,0,0.0,0.0,0,0
887,0,0,0.0,0.0,0,0
888,0,0,0.0,0.0,0,3
889,0,0,0.0,0.0,1,0


In [None]:
# Model을 만들면 되요

keras_model = Sequential()

keras_model.add(Flatten(input_shape=(5,)))
keras_model.add(Dense(units=1,
                      activation='sigmoid'))

keras_model.compile(optimizer=Adam(learning_rate=1e-2),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

keras_model.fit(x_data,
                t_data,
                epochs=300,
                verbose=1,
                validation_split=0.2)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.src.callbacks.History at 0x7bb787b4dcf0>

In [None]:
# scikit-learn으로 학습하고 평가까지 진행

# scikit-learn으로 학습하고 평가까지 진행하려면 당연히
# test data가 있어야 해요

# 전체 데이터를 training data 와 test data로 분리
# 여기서 말하는 전체데이터는 x_data, t_data를 의미
# 7:3 비율로 데이터를 나눌꺼

from sklearn.model_selection import train_test_split
from sklearn import linear_model

x_data_train, x_data_test, t_data_train, t_data_test = \
train_test_split(x_data,
                 t_data,
                 test_size=0.3)

sklearn_model = linear_model.LogisticRegression()
sklearn_model.fit(x_data_train, t_data_train)

sklearn_model.score(x_data_test, t_data_test) #0.8544776119402985

  y = column_or_1d(y, warn=True)


0.8544776119402985

In [3]:
# 만들어진 모델을 이용해서 제출파일을 만들어야 해요
# 제공된 test.csv를 이용해서 파일을 만들면 되요
titanic_test = pd.read_csv('/content/drive/MyDrive/빅데이터 과정 폴더/data/titanic/test.csv')
submission= pd.read_csv('/content/drive/MyDrive/빅데이터 과정 폴더/data/titanic/gender_submission.csv')

titanic_test.drop(['Name','Ticket','Fare','Cabin'],
                  axis=1, inplace=True)

# 데이터 전처리
gender_dict ={'male':0,
              'female':1}

titanic_test['Sex'] = titanic_test['Sex'].map(gender_dict)


# 가족처리
titanic_test['Family']=titanic_test['SibSp'] +titanic_test['Parch']
titanic_test.drop(['SibSp','Parch'], axis=1, inplace=True)


# 여자는 여자의 나이 평균을 구해서 채우고 남자는 남자의 나이 평균을 구해서 채워줌
titanic_test['Age']= titanic_test['Age'].fillna(titanic_test['Age'].mean())

# 8살 미만 0으로 대체
titanic_test.loc[titanic_test['Age'] < 8,:'Age']=0
titanic_test.loc[(titanic_test['Age'] >= 8) & (titanic_test['Age'] < 20),'Age'] =1
titanic_test.loc[(titanic_test['Age'] >= 20) & (titanic_test['Age'] < 65),'Age'] =2
titanic_test.loc[titanic_test['Age'] >= 65,'Age'] =3

# Embarked 컬럼에 결측치가 2개가 있어요
# 결측치를 찾아서 'Q'로 대체
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('Q')

# Embarked 컬럼의 영문자를 숫자로 바꿔줄거임
embarked_mapping = {'S':0,
                   'C':1,
                   'Q':2}

titanic_test['Embarked'] = titanic_test['Embarked'].map(embarked_mapping)

predict = keras_model.predict(titanic_test)

submission['Survived'] = predict
submission['Survived'] = (submission['Survived'] > 0.5).astype('int')
submission.to_csv('sub.csv', index=False)
# test.csv를 이용해서 우리모델에 대한 예측값을 얻고 그 값으로 gender_submission.csv파일 형식으로 결과파일을 만들어서 Kaggle에 upload해서 모델의 정확도를 측정 받으면 됨

NameError: ignored

In [None]:
submission = pd.read_csv('/content/drive/MyDrive/빅데이터 과정 폴더/data/titanic/gender_submission.csv')
display(submission)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Multinomial Classification을 구현
# 사용하는 데이터셋은 BMI 데이터셋

# tensorflow keras 와  scikit-learn을 이용해서 구현

# 일단 필요한 모듈부터 import

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

# Raw Data Loading
df = pd.read_csv('/content/drive/MyDrive/빅데이터 과정 폴더/data/bmi/bmi.csv',
                 skiprows=3)
display(df.head(),df.shape) #(20000,3)

# 데이터 전처리
# 1. 결측치를 찾아서 만약 존재하면 처리
#df.info()
# 결측치 없음
#(df.isnull()).sum() false의 합을 확인 하여 true값이 있는지 확인

# 2. 이상치 처리
#plt.boxplot(df['label'].values) #label에는 이상치 없음
# plt.hist(df['label'].values, bins=3)
#plt.boxplot(df['weight'].values) # height에는 이상치가 없어요
#plt.show()

# 정규화를 해요
x_data = df[['height', 'weight']].values # 2차원
t_data = df['label'] #1차원
scaler =MinMaxScaler()
scaler.fit(x_data)

x_data_norm = scaler.transform(x_data)

#4.train data와 test data를 분할
x_data_train, x_data_test, t_data_train,t_data_test = \
train_test_split(x_data,
                 t_data,
                 test_size = 0.3)


Unnamed: 0,label,height,weight
0,1,188,71
1,2,161,68
2,0,178,52
3,2,136,63
4,1,145,52


(20000, 3)

In [None]:
# Scikit Learn 구현
sklearn_model = linear_model.LogisticRegression()
sklearn_model.fit(x_data_train,
                  t_data_train)

sklearn_model_result = sklearn_model.score(x_data_test,
                                           t_data_test)

print('sklearn model의 accuracy:{}'.format(sklearn_model_result))
#0.982
print(sklearn_model.predict_proba([[167,54]]))
#[[6.05183975e-03 9.93948160e-01 1.49211741e-20]]

sklearn model의 accuracy:0.982
[[3.46504133e-02 9.65349587e-01 1.65443565e-21]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#Tensorflow keras 구현
keras_model = Sequential()

keras_model.add(Flatten(input_shape=(2,)))
keras_model.add(Dense(units=3,
                      activation='softmax'))
keras_model.compile(optimizer=Adam(learning_rate=1e-2),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

# 정규화된 데이터를 사용

x_data_norm_train, x_data_norm_test, t_data_train, t_data_test = \
train_test_split(x_data_norm,
                 t_data,
                 test_size=0.3)
keras_model.fit(x_data_norm_train,
                t_data_train,
                epochs=200,
                verbose=1,
                validation_split=0.3)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x7bb7848ba0b0>

In [None]:
# 학습이 다 되면!!
# 평가를 해야 해요
print(keras_model.evaluate(x_data_norm_test, t_data_test))

# Prediction
my_state = np.array([[187,82]])
#[0.0632053092122078, 0.984000027179718]
my_state_norm = scaler.transform(my_state)
print(keras_model.predict(my_state_norm))

[0.0632053092122078, 0.984000027179718]
[[9.7651984e-08 9.1730273e-01 8.2697131e-02]]
