# 머신 러닝 실기 평가

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Iris sepal_width에 대한 회귀식 

In [3]:
from sklearn.datasets import load_iris
iris = load_iris()

In [4]:
# Build the iris frame: four named measurement columns plus the integer
# species code from iris.target stored under 'variety'.
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
irisDF = pd.DataFrame(data=iris.data, columns=columns).assign(variety=iris.target)
irisDF.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,variety
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [5]:
y = irisDF['sepal_width'].values
y[:3]

array([3.5, 3. , 3.2])

In [6]:
del irisDF['sepal_width']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the species code.
# NOTE: 'variety' is still a column of irisDF here, so it is passed on as a
# feature in addition to being the stratification key.
x_train, x_test, y_train, y_test = train_test_split(
    irisDF,
    y,
    test_size=0.2,
    stratify=irisDF['variety'],
    random_state=2020,
)

In [8]:
from sklearn.linear_model import LinearRegression
sim_lr = LinearRegression()

In [9]:
sim_lr.fit(x_train.values, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
y_pred = sim_lr.predict(x_test.values)

In [11]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [12]:
rmse

0.3344055511870948

In [13]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.3636547458379473

In [14]:
sim_lr.coef_

array([ 0.63711424, -0.53485016,  0.55807355, -0.12647156])

In [15]:
sim_lr.intercept_

0.782649011498552

#### sepal_width에 대한 회귀식
- sw = 0.637 * sl - 0.535 * pl + 0.558 * pw - 0.126 * v + 0.783  (sw: sepal_width, sl: sepal_length, pl: petal_length, pw: petal_width, v: variety)

#### RMSE 값: 0.3344

## 폐암 환자 생존율 예측

In [16]:
# Read the comma-delimited file into one 2-D numeric array; np.loadtxt
# requires every field to parse as a number.
data_set = np.loadtxt("data/ThoraricSurgery.csv", delimiter=",")
data_set

array([[293.  ,   1.  ,   3.8 , ...,   0.  ,  62.  ,   0.  ],
       [  1.  ,   2.  ,   2.88, ...,   0.  ,  60.  ,   0.  ],
       [  8.  ,   2.  ,   3.19, ...,   0.  ,  66.  ,   1.  ],
       ...,
       [406.  ,   6.  ,   5.36, ...,   0.  ,  62.  ,   0.  ],
       [ 25.  ,   8.  ,   4.32, ...,   0.  ,  58.  ,   1.  ],
       [447.  ,   8.  ,   5.2 , ...,   0.  ,  49.  ,   0.  ]])

In [17]:
# All columns except the last are features; the last column is the label.
X, Y = data_set[:, :-1], data_set[:, -1]
X.shape, Y.shape

((470, 17), (470,))

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = \
    train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2020)

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Support-vector classifier with library-default hyperparameters:
# fit on the training split, then score accuracy on the held-out split.
svc = SVC().fit(x_train, y_train)
y_pred_svc = svc.predict(x_test)
accuracy_score(y_test, y_pred_svc)

0.851063829787234

In [20]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters. The seed is pinned (same value
# as the other random_state arguments in this notebook) because tree
# construction involves random tie-breaking/feature permutation; without it
# the reported accuracy can change from run to run.
dtc = DecisionTreeClassifier(random_state=2020)
dtc.fit(x_train, y_train)
y_pred_dtc = dtc.predict(x_test)
accuracy_score(y_test, y_pred_dtc)

0.6808510638297872

In [21]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default hyperparameters:
# fit on the training split, then score accuracy on the held-out split.
lr = LogisticRegression().fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)
accuracy_score(y_test, y_pred_lr)

0.8404255319148937

## 차원 축소 후 군집화

In [22]:
from sklearn.preprocessing import StandardScaler

# Standardize the four raw iris measurements before PCA.
scaler = StandardScaler()
iris_std = scaler.fit_transform(iris.data)
iris_std[:3, :]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ]])

In [23]:
from sklearn.decomposition import PCA

# Keep the first two principal components.
pca = PCA(n_components=2)

# Fit on the standardized data, then project that same data onto the
# fitted components (fit() returns the estimator, so the calls chain).
iris_pca = pca.fit(iris_std).transform(iris_std)

In [24]:
# Labels for the two principal-component columns, in component order.
# This must be an ordered sequence: a set literal has no defined order, so
# 'PCA1'/'PCA2' could be attached to the wrong component, and newer pandas
# releases refuse set-valued `columns` altogether.
columns = ['PCA1', 'PCA2']
iris_pca_df = pd.DataFrame(data=iris_pca, columns=columns)
iris_pca_df.head(3)

Unnamed: 0,PCA1,PCA2
0,-2.264703,0.480027
1,-2.080961,-0.674134
2,-2.364229,-0.341908


In [25]:
from sklearn.cluster import KMeans

# Cluster the PCA-projected rows into two groups and keep the per-row labels.
kmeans = KMeans(
    n_clusters=2,
    init='k-means++',
    max_iter=300,
    random_state=2020,
).fit(iris_pca_df)
k2result = kmeans.labels_

In [26]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette coefficient per row, and its mean, for the two-cluster labels.
pca_points = iris_pca_df.values
score_samples = silhouette_samples(pca_points, k2result)
average_score2 = silhouette_score(pca_points, k2result)
average_score2

0.6145202036230449

In [27]:
# Same clustering setup as before, now with three clusters.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    random_state=2020,
).fit(iris_pca_df)
k3result = kmeans.labels_

In [28]:
average_score3 = silhouette_score(iris_pca_df.values, k3result)
average_score3

0.5091683341538228

In [29]:
# Same clustering setup as before, now with four clusters.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',
    max_iter=300,
    random_state=2020,
).fit(iris_pca_df)
k4result = kmeans.labels_

In [30]:
average_score4 = silhouette_score(iris_pca_df.values, k4result)
average_score4

0.44450802349850316

In [31]:
# Append the true species code and each clustering's labels as new columns,
# in this order, so they can be compared row by row.
for label_col, labels in (
    ('class', iris.target),
    ('k2', k2result),
    ('k3', k3result),
    ('k4', k4result),
):
    iris_pca_df[label_col] = labels
iris_pca_df.head(5)

Unnamed: 0,PCA1,PCA2,class,k2,k3,k4
0,-2.264703,0.480027,0,1,1,3
1,-2.080961,-0.674134,0,1,1,0
2,-2.364229,-0.341908,0,1,1,0
3,-2.299384,-0.597395,0,1,1,0
4,-2.389842,0.646835,0,1,1,3


In [32]:
iris_pca_df.tail(5)

Unnamed: 0,PCA1,PCA2,class,k2,k3,k4
145,1.870503,0.386966,2,0,2,1
146,1.56458,-0.896687,2,0,0,2
147,1.52117,0.269069,2,0,2,1
148,1.372788,1.011254,2,0,2,1
149,0.960656,-0.024332,2,0,0,2


In [33]:
# 실루엣 계수
average_score2, average_score3, average_score4

(0.6145202036230449, 0.5091683341538228, 0.44450802349850316)

## 미드웨이 영화 리뷰 감성 분석

In [34]:
midway = pd.read_csv('./data/midway.tsv', sep='\t')

In [35]:
midway.head(3)

Unnamed: 0.1,Unnamed: 0,평점,일시,감상평
0,0,9,2019.12.31 09:48,미드웨이가 재밌으면 추천 백두산이 재밌으면 비추
1,1,10,2019.12.31 10:41,저 해전이 있었기에 우리나라 광복도 가능 했음
2,2,10,2019.12.31 09:38,백두산 상영관 대폭줄이고 미드웨이 상영관 대폭늘려라


In [36]:
# Remove, in place, the columns at positions 0 and 2 of the loaded frame.
midway.drop(columns=midway.columns[[0, 2]], inplace=True)

In [37]:
midway['평점'].value_counts()

10    3071
8      513
9      468
6      216
7      169
1      107
4       97
2       90
5       68
3       26
Name: 평점, dtype: int64

In [38]:
# Label reviews for binary sentiment classification:
#   rating == 10 -> positive (class 1), rating < 8 -> negative (class 0).
# Ratings of 8 and 9 match neither mask and are therefore left out of mid_df.
positive = midway['평점'] == 10
negative = midway['평점'] < 8

# Take explicit copies before adding the 'class' column. Assigning a column
# on the result of boolean indexing is chained assignment, which pandas
# flags with SettingWithCopyWarning; .copy() makes the ownership explicit.
mid_pos = midway[positive].copy()
mid_pos['class'] = np.ones(len(mid_pos), int)
mid_neg = midway[negative].copy()
mid_neg['class'] = np.zeros(len(mid_neg), int)

# Positive rows first, then negative rows; original row indices are kept.
mid_df = pd.concat([mid_pos, mid_neg])

In [39]:
# The rating column has served its purpose (deriving 'class'); remove it in place.
mid_df.drop(columns=['평점'], inplace=True)
mid_df.head()

Unnamed: 0,감상평,class
1,저 해전이 있었기에 우리나라 광복도 가능 했음,1
2,백두산 상영관 대폭줄이고 미드웨이 상영관 대폭늘려라,1
3,방금 개봉했는데 1점 준애는 뭐냐 ㅋㅋ 일본놈이냐? 이제 광고돌고 있을 시간이다.,1
4,교과서에는 미드웨이 해전에서 미국이 승리했다고 한 줄로 서술되어 있지만 단순히 한 ...,1
5,스케일 겁나커서 보는내내 입벌리고 봣네욬ㅋㅋ재밋습니다,1


In [40]:
mid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3844 entries, 1 to 4824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   감상평     3397 non-null   object
 1   class   3844 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 75.1+ KB


In [41]:
mid_df.dropna(inplace=True)
mid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3397 entries, 1 to 4824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   감상평     3397 non-null   object
 1   class   3397 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 66.3+ KB


In [44]:
# 80/20 split of review text and sentiment label, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    mid_df['감상평'],
    mid_df['class'],
    test_size=0.2,
    stratify=mid_df['class'],
    random_state=2020,
)
X_train.shape, X_test.shape

((2717,), (680,))

In [45]:
from konlpy.tag import Okt

okt = Okt()
def okt_tokenizer(text):
    """Tokenize *text* into morphemes using the module-level ``okt`` tagger.

    Returns the result of ``okt.morphs(text)`` unchanged.
    """
    return okt.morphs(text)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over Okt morphemes: unigrams and bigrams, keeping terms that appear
# in at least 3 documents and in at most 90% of them.
tvect = TfidfVectorizer(tokenizer=okt_tokenizer, ngram_range=(1, 2),
                        min_df=3, max_df=0.9)

# Learn the vocabulary/IDF weights and vectorize the training text in a
# single pass, so each review is run through the tokenizer once rather than
# once for fit() and again for transform(). `tvect` is left fitted for later
# transform() calls on other data.
tfidf_matrix_train = tvect.fit_transform(X_train)

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Sentiment classifier: logistic regression on the TF-IDF features.
lg_clf = LogisticRegression(random_state=2020)

# Search over the C parameter using 3-fold cross-validated accuracy.
params = {'C': [1, 3.5, 4.5, 5.5, 10]}
grid_cv = GridSearchCV(
    estimator=lg_clf,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_cv.fit(tfidf_matrix_train, y_train)

# Report the winning parameter setting and its mean CV accuracy (4 decimals).
best_params = grid_cv.best_params_
best_cv_accuracy = round(grid_cv.best_score_, 4)
print(best_params, best_cv_accuracy)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'C': 10} 0.8579


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.4s finished


In [48]:
from sklearn.metrics import accuracy_score

# Vectorize the held-out reviews with the vectorizer fitted on the training text.
tfidf_matrix_test = tvect.transform(X_test)

# Use the estimator that the grid search selected as best.
best_estimator = grid_cv.best_estimator_
preds = best_estimator.predict(tfidf_matrix_test)

test_accuracy = accuracy_score(y_test, preds)
print('Logistic Regression 정확도: ', test_accuracy)

Logistic Regression 정확도:  0.8647058823529412
