### 지도학습 - 분류

#### 의사결정나무

In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv')

In [5]:
import sklearn

# 의사결정나무 분류모델을 위한 패키지 임포트
from sklearn.tree import DecisionTreeClassifier
# 학습 및 테스트 데이터 셋 분리를 위한 패키지 임포트
from sklearn.model_selection import train_test_split

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
# Age 결측치는 평균으로 대체
age_mean = df['Age'].mean()
df['Age'].fillna(age_mean, inplace = True)
df['Age'].isnull().sum()

0

In [14]:
# Embarked의 결측치는 최빈값으로 대체
em = df['Embarked'].mode()
df['Embarked'].fillna(em[0], inplace =True)
df['Embarked'].isnull().sum()

0

In [19]:
# Cabin 컬럼은 제외
df = df.drop(['Cabin'], axis = 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [21]:
# LabelEncoder()함수를 사용하여 Sex 컬럼을 1과 0으로 인코딩
from sklearn.preprocessing import LabelEncoder

df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,S


In [22]:
# Embarked 도 똑같이 레이블 인코딩
from sklearn.preprocessing import LabelEncoder

df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

In [23]:
# SibSp, Parch를 더해서 FamilySize 컬럼생성
df['FamilySize'] = df['SibSp'] + df['Parch']
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,0,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,2,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,2,0


* 분석 데이터셋 준비

In [28]:
# X와 y로 나누기

X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']]
y = df['Survived']

X

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,3,1,22.000000,7.2500,2,1
1,1,0,38.000000,71.2833,0,1
2,3,0,26.000000,7.9250,2,0
3,1,0,35.000000,53.1000,2,1
4,3,1,35.000000,8.0500,2,0
...,...,...,...,...,...,...
886,2,1,27.000000,13.0000,2,0
887,1,0,19.000000,30.0000,2,0
888,3,0,29.699118,23.4500,2,3
889,1,1,26.000000,30.0000,0,0


In [29]:
# train_test_split()함수를 통해 8:2의 비율로 트레인데이터와 테스트 데이터로 나누기
# train_test_split(X, y test_size = 0.n)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 11)

print(X_train.shape)
print(X_test.shape)

(712, 6)
(179, 6)


In [30]:
# 의사결정모델 학습
dt = DecisionTreeClassifier(random_state = 11)
dt.fit(X_train, y_train)

In [31]:
# 학습된 모델을 통해 X_test 를 입력하여 예측 실시
pred = dt.predict(X_test)

In [32]:
# 정확도 측정
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pred)
print(acc)

0.7877094972067039


In [34]:
# 평가지표 전체 계산
from sklearn.metrics import classification_report

rpt = classification_report(y_test, pred)
print(rpt)

              precision    recall  f1-score   support

           0       0.84      0.83      0.84       118
           1       0.68      0.70      0.69        61

    accuracy                           0.79       179
   macro avg       0.76      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



#### KNN을 통한 분류

In [12]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv')

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [13]:
# 4개의 독립변수에 대해 Min-Max 정규화 실시

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df['sepal_length'] = scaler.fit_transform(df[['sepal_length']])
df['sepal_width'] = scaler.fit_transform(df[['sepal_width']])
df['petal_length'] = scaler.fit_transform(df[['petal_length']])
df['petal_width'] = scaler.fit_transform(df[['sepal_width']])
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625,0.067797,0.625,setosa
1,0.166667,0.416667,0.067797,0.416667,setosa
2,0.111111,0.5,0.050847,0.5,setosa
3,0.083333,0.458333,0.084746,0.458333,setosa
4,0.194444,0.666667,0.067797,0.666667,setosa


In [15]:
# X와 y분리

X = df.drop(['species'], axis = 1 )
y = df['species']

X

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.222222,0.625000,0.067797,0.625000
1,0.166667,0.416667,0.067797,0.416667
2,0.111111,0.500000,0.050847,0.500000
3,0.083333,0.458333,0.084746,0.458333
4,0.194444,0.666667,0.067797,0.666667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.416667
146,0.555556,0.208333,0.677966,0.208333
147,0.611111,0.416667,0.711864,0.416667
148,0.527778,0.583333,0.745763,0.583333


In [17]:
# 8:2 로 분할

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2 , random_state = 11)

print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)


In [18]:
# KNeighborsClassifier 객체 생성

knn = KNeighborsClassifier(n_neighbors = 3) # n = 3으로 지정
knn.fit(X_train, y_train) # 학습 진행

In [19]:
# 예측 진행
pred = knn.predict(X_test)

In [22]:
# 정확도 측정

from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, pred)
print(acc)

0.8333333333333334


In [23]:
# 평가 지표 계산

from sklearn.metrics import classification_report
rpt = classification_report(y_test,pred)
print(rpt)

              precision    recall  f1-score   support

      setosa       1.00      0.89      0.94         9
  versicolor       0.73      0.80      0.76        10
   virginica       0.82      0.82      0.82        11

    accuracy                           0.83        30
   macro avg       0.85      0.84      0.84        30
weighted avg       0.84      0.83      0.84        30



#### SVM

In [26]:
import pandas as pd
import numpy as np
import sklearn

from sklearn import svm
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
## 전처리
# Age 결측은 평균값으로 대체
age_mean = df['Age'].mean()

df['Age'].fillna(age_mean, inplace =True)

In [32]:
# Embarked 컬럼의 결측치를 최빈값으로 대체
em = df['Embarked'].mode()

df['Embarked'].fillna(em[0], inplace =True)


In [33]:
# Cabin 컬럼 제외
df = df.drop(['Cabin'], axis = 1)

In [34]:
# FamilySize 만들기
df['FamilySize'] = df['SibSp'] + df['Parch']

In [36]:
# Sex 컬럼에 대해 원핫 인코딩 실시
onehot_sex = pd.get_dummies(df['Sex'])
df = pd.concat([df, onehot_sex], axis = 1)

In [38]:
onehot_embarked = pd.get_dummies(df['Embarked'])
df = pd.concat([df, onehot_embarked], axis = 1 )

In [39]:
df= df.drop(['Sex', 'Embarked'], axis = 1)

In [40]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,FamilySize,female,male,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,1,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,0,1,0,0,1


In [47]:
# 데이터셋 준비

X = df[['Pclass', 'Age', 'Fare', 'FamilySize', 'female', 'male', 'C', 'Q', 'S']]
y = df['Survived']

# 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 10)

print(X_train.shape)

(623, 9)


In [48]:
# svm 객체 생성
sv = svm.SVC(kernel = 'rbf')
sv.fit(X_train, y_train)

## SVM의 파라미터 조정

sv = svm.SVC(kernel = 'linear', C=1, gamma = 0.1)

sv = svm.SVC(kernel = 'rbf', C=1, gamma = 0.1)

In [49]:
# 예측 실시
pred = sv.predict(X_test)

In [50]:
# 정확도
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, pred)
print(acc)

0.7238805970149254


In [52]:
# 다른 지표
from sklearn.metrics import classification_report

rpt = classification_report(y_test, pred)
print(rpt)

              precision    recall  f1-score   support

           0       0.71      0.96      0.82       174
           1       0.79      0.29      0.42        94

    accuracy                           0.72       268
   macro avg       0.75      0.62      0.62       268
weighted avg       0.74      0.72      0.68       268



#### 로지스틱 회귀

In [54]:
import pandas as pd
import numpy as np
import sklearn

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [55]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df['sepal_length'] = scaler.fit_transform(df[['sepal_length']])
df['sepal_width'] = scaler.fit_transform(df[['sepal_width']])
df['petal_length'] = scaler.fit_transform(df[['petal_length']])
df['petal_width'] = scaler.fit_transform(df[['sepal_width']])

# X와 y분리

X = df.drop(['species'], axis = 1 )
y = df['species']
# 8:2 로 분할

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2 , random_state = 11)

print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)


In [57]:
# LogisticRegression 객체 생성
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [58]:
pred = lr.predict(X_test)

#### 랜덤포레스트 전문

In [63]:
import pandas as pd
import numpy as np
import sklearn

from sklearn import svm
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv')

df.head()

## 전처리
# Age 결측은 평균값으로 대체
age_mean = df['Age'].mean()

df['Age'].fillna(age_mean, inplace =True)
# Embarked 컬럼의 결측치를 최빈값으로 대체
em = df['Embarked'].mode()

df['Embarked'].fillna(em[0], inplace =True)
# Cabin 컬럼 제외
df = df.drop(['Cabin'], axis = 1)
# FamilySize 만들기
df['FamilySize'] = df['SibSp'] + df['Parch']
# Sex 컬럼에 대해 원핫 인코딩 실시
onehot_sex = pd.get_dummies(df['Sex'])
df = pd.concat([df, onehot_sex], axis = 1)
onehot_embarked = pd.get_dummies(df['Embarked'])
df = pd.concat([df, onehot_embarked], axis = 1 )
df= df.drop(['Sex', 'Embarked'], axis = 1)

# 데이터셋 준비

X = df[['Pclass', 'Age', 'Fare', 'FamilySize', 'female', 'male', 'C', 'Q', 'S']]
y = df['Survived']

# 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 11)

print(X_train.shape)

# 모델 생성
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 50, max_depth =3, random_state =20)
rf.fit(X_train, y_train)

pred = rf.predict(X_test)

from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pred)
print(acc)

(712, 4)
0.770949720670391


### 지도학습 - 회귀

#### 단순선형회귀


In [65]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/auto-mpg.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    396 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model-year    398 non-null    int64  
dtypes: float64(4), int64(3)
memory usage: 21.9 KB


In [66]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


In [69]:
# 상관계수 분석
corr = df.corr(method='pearson')
print(corr)

                   mpg  cylinders  displacement  horsepower    weight  \
mpg           1.000000  -0.775396     -0.804203   -0.777575 -0.831741   
cylinders    -0.775396   1.000000      0.950721    0.843751  0.896017   
displacement -0.804203   0.950721      1.000000    0.897787  0.932824   
horsepower   -0.777575   0.843751      0.897787    1.000000  0.864350   
weight       -0.831741   0.896017      0.932824    0.864350  1.000000   
acceleration  0.420289  -0.505419     -0.543684   -0.687241 -0.417457   
model-year    0.579267  -0.348746     -0.370164   -0.420697 -0.306564   

              acceleration  model-year  
mpg               0.420289    0.579267  
cylinders        -0.505419   -0.348746  
displacement     -0.543684   -0.370164  
horsepower       -0.687241   -0.420697  
weight           -0.417457   -0.306564  
acceleration      1.000000    0.288137  
model-year        0.288137    1.000000  


In [70]:
# horsepower에 결측치가있는 2개의 행 삭제
df = df.dropna(axis=0)

In [72]:
# weight와 mpg간의 관계 분석
X = df[['weight']] # X는 여러개가 들어갈 수 있으므로 2차원의 형태로 있어야함
y = df['mpg']

In [74]:
# 데이터셋 나누기
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 10)

print(X_train.shape)

(316, 1)


In [76]:
# LinearRegression 객체 생성
lr = LinearRegression()
lr.fit(X_train, y_train)

In [77]:
pred = lr.predict(X_test)

In [78]:
# 기울기 와 y절편 출력
print(lr.coef_)
print(lr.intercept_)

[-0.00774371]
46.62501834798047


In [79]:
# 결정계수 출력
from sklearn.metrics import r2_score

score = r2_score(y_test, pred)
print(score)

# 선형회귀모델을 학습 데이터를 상대로도 r2 스코어 측정 가능



0.7015633872576372


#### 다중 선형 회귀

In [80]:
# 단순선형회귀와 같음

#### 의사결정나무 예측

In [104]:
import pandas as pd
import numpy as np


df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/housing.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [105]:
df = df.dropna(axis=0)

In [106]:
df = df.drop(['ocean_proximity'], axis = 1)

In [107]:
X = df.drop('median_house_value', axis = 1)
y = df['median_house_value']

from sklearn.tree import DecisionTreeRegressor

In [112]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [113]:
# DeicisionTreeRegressor 객체 생성
dt = DecisionTreeRegressor(max_depth = 3, random_state =42)
dt.fit(X_train, y_train)

In [114]:
pred = dt.predict(X_test)

In [115]:
# 평가 MSE
from sklearn.metrics import mean_squared_error
# dir(sklearn.metrics)
mse = mean_squared_error(y_test, pred)
print(mse)


6793101269.876856


#### 랜덤 포레스트 회귀

In [124]:
import pandas as pd
import numpy as np
import sklearn

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/housing.csv')

In [125]:
from sklearn.ensemble import RandomForestRegressor
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [126]:
df = df.dropna(axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [127]:
df = df.drop('ocean_proximity', axis=1)

In [128]:
X = df.drop('median_house_value', axis=1)
y= df['median_house_value']

In [130]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3 , random_state=42)
print(X_train.shape)
print(y_train.shape)

(14303, 8)
(14303,)


In [133]:
# RandomForestRegressor 객체 생성
rfr = RandomForestRegressor(max_depth = 3, random_state=42)
rfr.fit(X_train, y_train)

In [134]:
pred = rfr.predict(X_test)

In [137]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, pred)
print(mse)

6447828605.376922


### 비지도학습(군집분석)

#### K-means 알고리즘

In [138]:
import pandas as pd
import numpy as np
import sklearn

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv')

In [145]:
from sklearn.cluster import KMeans

In [149]:
# species를 레이블 인코딩
from sklearn.preprocessing import LabelEncoder

df['species'] = LabelEncoder().fit_transform(df['species'])

In [150]:
# x준비

X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

In [155]:
# KMeans 객체 생성
cluster1 = KMeans(n_clusters = 3, n_init=10, max_iter=500, random_state=42)
cluster1.fit(X)

In [157]:
# 결과값을 변수에 저장
cluster_center = cluster1.cluster_centers_
cluster_prediction = cluster1.predict(X)
print(pd.DataFrame(cluster_center))
print(cluster_prediction)

          0         1         2         3
0  5.901613  2.748387  4.393548  1.433871
1  5.006000  3.428000  1.462000  0.246000
2  6.850000  3.073684  5.742105  2.071053
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [158]:
# 예측한 값을 데이터 프레임에 붙이기
df['cluster'] = cluster_prediction
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,cluster
0,5.1,3.5,1.4,0.2,0,1
1,4.9,3.0,1.4,0.2,0,1
2,4.7,3.2,1.3,0.2,0,1
3,4.6,3.1,1.5,0.2,0,1
4,5.0,3.6,1.4,0.2,0,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,2
146,6.3,2.5,5.0,1.9,2,0
147,6.5,3.0,5.2,2.0,2,2
148,6.2,3.4,5.4,2.3,2,2


In [160]:
# kmeans의 적절한 군집 표현 알기
scope = range(1,10)
inertias = []
for k in scope :
  model = KMeans(n_clusters = k, n_init = 10,max_iter=500 ,random_state=42)
  model.fit(X)
  inertias.append(model.inertia_)
  print(k, inertias[k-1])

1 681.3706
2 152.3479517603579
3 78.851441426146
4 57.22847321428572
5 46.446182051282065
6 39.03998724608726
7 34.46949589883801
8 30.1865551948052
9 28.28937085137085


#### 연관분석

In [163]:
from mlxtend.frequent_patterns import apriori, association_rules

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/retail_dataset.csv', sep = ',')

In [164]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


In [168]:
items = set()

In [169]:
for col in df :
  items.update(df[col].unique())

In [170]:
items

{'Bagel',
 'Bread',
 'Cheese',
 'Diaper',
 'Eggs',
 'Meat',
 'Milk',
 'Pencil',
 'Wine',
 nan}