In [5]:
import pandas as pd
import numpy as np


pd.set_option('display.max_rows', 200)    
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 20)


# 1. 데이터 가져오기. 데이터 정보 확인
# 2005년부터 2020년까지의 16년간의 데이터
world_2020 = pd.read_csv('data/projectdata/world-happiness-report.csv')  
world_2020.info() 
world_2020.shape   # 1949행, 11열 

happy20 = pd.read_csv('data/projectdata/2020.csv')
happy20.info()     # 결측치 없음 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      1949 non-null   object 
 1   year                              1949 non-null   int64  
 2   Life Ladder                       1949 non-null   float64
 3   Log GDP per capita                1913 non-null   float64
 4   Social support                    1936 non-null   float64
 5   Healthy life expectancy at birth  1894 non-null   float64
 6   Freedom to make life choices      1917 non-null   float64
 7   Generosity                        1860 non-null   float64
 8   Perceptions of corruption         1839 non-null   float64
 9   Positive affect                   1927 non-null   float64
 10  Negative affect                   1933 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 167.6+ KB
<class 'pa

In [6]:
# 2. 결측치 처리

# 원본데이터를 통한 결측치 분석
# 피처의 값이 하나도 존재하지 않는 경우 ( 나라 : 피처명 ) => 결측치가 존재하는 행을 삭제하면 나라도 삭제되는 문제 발생
# Hongkong S.A.R. of China, Kosovo : Healthy life expectancy at birth
# China, Turkmenistan : Perceptions of corruption
# Somalia, South Sudan : Log GDP per capita, Generosity
# Somaliland region, North Cyprus : Log GDP per capita, Healthy life expectancy at birth, Generosity

# 일년치만 관측된 나라 중 3개의 피처가 결측치인 경우
# Oman(2011), Cuba(2006), Maldives(2018) 
# 중요 6가지 지표 중 (http://naver.me/FGoFBWHm ) 절반 이상이 없는 것은 분석의 의미가 없다고 생각 => 삭제 



# (1) 결측치 개수 확인 
world_2020.isnull().sum()
world_2020.isnull().sum().sum()  # 373 


# (2) 결측치가 3개 이상 있는 행을 삭제  
world_2020 = world_2020.dropna(thresh=9, axis=0)
# thresh=9 : nan이 아닌 값(정상값)이 최소 9개 이상 나와야 함. 
world_2020.isnull().sum().sum()   #239


# (3) 각 나라별 피처의 평균값으로 결측치 대체
world_2020['Log GDP per capita'].fillna(world_2020.groupby('Country name')\
                                        ['Log GDP per capita'].transform('mean'),inplace=True)

world_2020['Social support'].fillna(world_2020.groupby('Country name')\
                                        ['Social support'].transform('mean'),inplace=True)

world_2020['Healthy life expectancy at birth'].fillna(world_2020.groupby('Country name')\
                                        ['Healthy life expectancy at birth'].transform('mean'),inplace=True)
        
world_2020['Freedom to make life choices'].fillna(world_2020.groupby('Country name')\
                                        ['Freedom to make life choices'].transform('mean'),inplace=True)

world_2020['Generosity'].fillna(world_2020.groupby('Country name')\
                                        ['Generosity'].transform('mean'),inplace=True)

world_2020['Perceptions of corruption'].fillna(world_2020.groupby('Country name')\
                                        ['Perceptions of corruption'].transform('mean'),inplace=True)

world_2020['Positive affect'].fillna(world_2020.groupby('Country name')\
                                        ['Positive affect'].transform('mean'),inplace=True)

world_2020['Negative affect'].fillna(world_2020.groupby('Country name')\
                                        ['Negative affect'].transform('mean'),inplace=True)

world_2020.isnull().sum().sum()   # 60




# (4) 나라별 피처의 값이 아예 존재하지 않는 경우 happy20 데이터에서 찾아서 채우기
world_2020.isnull().sum()  
# Log GDP per capita : 7, Healthy life expectancy at birth : 22,
# Generosity : 7, Perceptions of corruption : 24

mask = world_2020['Log GDP per capita'].isnull()
print(world_2020[mask]['Country name'].unique())  #  Somalia , South Sudan 

mask = world_2020['Healthy life expectancy at birth'].isnull()
print(world_2020[mask]['Country name'].unique())  # Hong Kong S.A.R. of China, Kosovo

mask = world_2020['Generosity'].isnull()
print(world_2020[mask]['Country name'].unique())  # Somalia , South Sudan 

mask = world_2020['Perceptions of corruption'].isnull()
print(world_2020[mask]['Country name'].unique())  # China , Turkmenistan


# (4)-1. Somalia, South Sudan => Log GDP per capita, Generosity 구하기 
# Somalia가 happy20 데이터에 없음 => 삭제
mask = world_2020['Country name']=='Somalia'  
world_2020.drop(world_2020[mask].index,inplace=True)
world_2020.isnull().sum().sum()  # 54 (Log GDP per capita, Generosity 각 3개씩 삭제됨)

# 인덱스 재정렬 
world_2020.reset_index(drop=True, inplace=True)  

# South Sudan의 Log GDP per capita, Generosity happy20에서 구해서 결측값 채우기
# Log GDP per capita
mask = happy20['Country name'] =='South Sudan'
SSudan_GDP_20 = happy20[mask]['Logged GDP per capita'].values[0]
SSudan_GDP_20  # 7.425359726

world_2020['Log GDP per capita'].fillna(SSudan_GDP_20,inplace=True)
world_2020.isnull().sum()  

# Generosity
mask = happy20['Country name'] =='South Sudan'
SSudan_Generosity_20 = happy20[mask]['Generosity'].values[0]
SSudan_Generosity_20  # 0.016518548

world_2020['Generosity'].fillna(SSudan_Generosity_20,inplace=True)
world_2020.isnull().sum()  


# (4)-2. Hong Kong S.A.R. of China, Kosovo => Healthy life expectancy at birth

# Hong Kong S.A.R. of China
mask = happy20['Country name'] =='Hong Kong S.A.R. of China'
HKSAR_HLE_20 = happy20[mask]['Healthy life expectancy'].values[0]
HKSAR_HLE_20   # 76.77170563

mask = world_2020['Country name'] =='Hong Kong S.A.R. of China'
world_2020[mask]
world_2020.loc[692:702,'Healthy life expectancy at birth'].replace(np.nan,HKSAR_HLE_20,inplace=True)
world_2020.isnull().sum()  

# Kosovo
mask = happy20['Country name'] =='Kosovo'
KSV_HLE_20 = happy20[mask]['Healthy life expectancy'].values[0]
KSV_HLE_20   # 63.88555527 

mask = world_2020['Country name'] =='Kosovo'
world_2020[mask]
world_2020.loc[894:906,'Healthy life expectancy at birth'].replace(np.nan,KSV_HLE_20,inplace=True)
world_2020.isnull().sum()  


# (4)-3. China , Turkmenistan => Perceptions of corruption
# Turkmenistan
mask = happy20['Country name'] =='Turkmenistan'
T_PC_20 = happy20[mask]['Perceptions of corruption'].values[0]
T_PC_20   # 0.883691847

mask = world_2020['Country name'] =='Turkmenistan'
world_2020[mask].index
world_2020.loc[1729:1739,'Perceptions of corruption'].replace(np.nan,T_PC_20,inplace=True)
world_2020.isnull().sum()  

# China
mask = happy20['Country name'] =='China'
C_PC_20 = happy20[mask]['Perceptions of corruption'].values[0]
C_PC_20   # 0.7539711

mask = world_2020['Country name'] =='China'
world_2020[mask].index
world_2020.loc[338:352,'Perceptions of corruption'].replace(np.nan,C_PC_20,inplace=True)
world_2020.isnull().sum()  

['Somalia' 'South Sudan']
['Hong Kong S.A.R. of China' 'Kosovo']
['Somalia' 'South Sudan']
['China' 'Turkmenistan']


Country name                        0
year                                0
Life Ladder                         0
Log GDP per capita                  0
Social support                      0
Healthy life expectancy at birth    0
Freedom to make life choices        0
Generosity                          0
Perceptions of corruption           0
Positive affect                     0
Negative affect                     0
dtype: int64

In [7]:
###### 회귀분석######\

# 1. 독립속성과 종속속성 나누기
# 1-1. 독립속성 선택 : Country name, year, Life Ladder, Positive affect, Negative affect 제외한 나머지 피처 
X = world_2020[['Log GDP per capita','Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity','Perceptions of corruption']] 
# 1-2. 종속속성 선택
y = world_2020['Life Ladder']


# 2. 정규화하기
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)


# 3. train 데이터와 test 데이터로 구분
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=4)

# 4. 회귀모델 생성
from sklearn import linear_model
lr = linear_model.LinearRegression()

# 5. 학습하기
model = lr.fit(X_train, y_train)

# 6. 평가
# 6-1. R2 출력하기
print(model.score(X_train,y_train))   # 0.7416343112439837
print(model.score(X_test,y_test))     # 0.7446313433457559

# 6-2. RMSE score 출력하기. MSE가 작을수록 좋은 것 
from math import sqrt
from sklearn.metrics import mean_squared_error
y_pre = lr.predict(X_train)
rmse = sqrt(mean_squared_error(y_train,y_pre))
print(rmse)   # 0.5707768339431321

y_pre = lr.predict(X_test)
rmse = sqrt(mean_squared_error(y_test,y_pre))
print(rmse)   # 0.565258320347989



# r2 score 값은 1에 가까울수록 성능이 좋다.
# 학습데이터의 r2score와 test 데이터의 r2score는 값이 비슷한 경우가 학습데이터가 모델을 잘 설명하는 데이터.

0.7416343112439837
0.7446313433457559
0.5707768339431321
0.565258320347989


In [8]:
###### 그룹화하기 ######

# 행복지수 기준으로 내림차순 
world_2020.sort_values(by='Life Ladder', ascending=False,inplace=True)

# 인덱스 재정렬
world_2020.reset_index(drop=True, inplace=True)  

# 1,2,3 으로 그룹화

newlist = []
for idx in world_2020.index :
    if idx <= 635 :
        newlist += ['1']
    elif idx <= 1271 :
        newlist += ['2']
    else :
        newlist += ['3']
world_2020['group'] = newlist
print(world_2020['group'].unique())   # ['1' '2' '3']

['1' '2' '3']


In [9]:
####### 데이터 분리 #######

# 독립속성과 종속속성 나누기
X = world_2020[['Log GDP per capita','Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity','Perceptions of corruption']] 
y = world_2020['group']


# 정규화하기
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)


# train 데이터와 test 데이터로 구분
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=4)
print('train data 개수: ', X_train.shape)   # (1525, 6)
print('test data 개수: ', X_test.shape)     # (382, 6)

train data 개수:  (1525, 6)
test data 개수:  (382, 6)


In [10]:
###### 1. KNN 분류 모형 ######

# KNN 분류 모형 - sklearn 사용
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_hat = knn.predict(X_test)
print(y_hat[0:10])          # ['2' '2' '3' '1' '2' '1' '1' '2' '1' '3']
print(y_test.values[0:10])  # ['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']

# 성능평가하기
from sklearn import metrics
knn_matrix = metrics.confusion_matrix(y_test, y_hat)
print(knn_matrix)

'''
[[107  13   0]
 [ 24  93  17]
 [  4  16 108]]
'''
knn_report = metrics.classification_report(y_test, y_hat)
print(knn_report)
'''
              precision    recall  f1-score   support

           1       0.79      0.89      0.84       120
           2       0.76      0.69      0.73       134
           3       0.86      0.84      0.85       128

    accuracy                           0.81       382
   macro avg       0.81      0.81      0.81       382
weighted avg       0.81      0.81      0.80       382
'''
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("정확도(Accuracy) :", accuracy_score(y_test,y_hat))   # 0.806282722513089

['2' '2' '3' '1' '2' '1' '1' '2' '1' '3']
['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']
[[107  13   0]
 [ 24  93  17]
 [  4  16 108]]
              precision    recall  f1-score   support

           1       0.79      0.89      0.84       120
           2       0.76      0.69      0.73       134
           3       0.86      0.84      0.85       128

    accuracy                           0.81       382
   macro avg       0.81      0.81      0.81       382
weighted avg       0.81      0.81      0.80       382

정확도(Accuracy) : 0.806282722513089


In [11]:
###### 2. SVM 분류 모형 ######

from sklearn import svm
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(X_train, y_train)
y_hat= svm_model.predict(X_test)
print(y_hat[0:10])             # ['3' '2' '3' '1' '2' '1' '1' '1' '1' '3']
print(y_test.values[0:10])     # ['3' '2' '2' '1' '2' '1' '2' '2' '1' '3'] 

# 모형 성능 평가
from sklearn import metrics
svm_matrix = metrics.confusion_matrix(y_test,y_hat)
print(svm_matrix)
'''
[[105  15   0]
 [ 20  93  21]
 [  1  16 111]]
'''
svm_report = metrics.classification_report(y_test, y_hat)
print(svm_report)
'''
              precision    recall  f1-score   support

           1       0.83      0.88      0.85       120
           2       0.75      0.69      0.72       134
           3       0.84      0.87      0.85       128

    accuracy                           0.81       382
   macro avg       0.81      0.81      0.81       382
weighted avg       0.81      0.81      0.81       382
'''
print("정확도(Accuracy) : ", accuracy_score(y_test,y_hat))    #  0.8089005235602095

['3' '2' '3' '1' '2' '1' '1' '1' '1' '3']
['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']
[[105  15   0]
 [ 20  93  21]
 [  1  16 111]]
              precision    recall  f1-score   support

           1       0.83      0.88      0.85       120
           2       0.75      0.69      0.72       134
           3       0.84      0.87      0.85       128

    accuracy                           0.81       382
   macro avg       0.81      0.81      0.81       382
weighted avg       0.81      0.81      0.81       382

정확도(Accuracy) :  0.8089005235602095


In [13]:
###### 3. Decision Tree 분류 모형 #####
from sklearn import tree
tree_model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5)
tree_model.fit(X_train, y_train)
y_hat = tree_model.predict(X_test)
print(y_hat[0:10])           # ['3' '2' '3' '1' '3' '2' '1' '2' '1' '3']
print(y_test.values[0:10])   # ['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']

# 모형 성능 평가
tree_matrix = metrics.confusion_matrix(y_test, y_hat)
print(tree_matrix)
'''
[[ 88  31   1]
 [ 12  98  24]
 [  2  17 109]]
'''
tree_report = metrics.classification_report(y_test, y_hat)
print(tree_report)
'''
              precision    recall  f1-score   support

           1       0.86      0.73      0.79       120
           2       0.67      0.73      0.70       134
           3       0.81      0.85      0.83       128

    accuracy                           0.77       382
   macro avg       0.78      0.77      0.77       382
weighted avg       0.78      0.77      0.77       382
'''
# 정확도
print("정확도(Accuracy) : ", accuracy_score(y_test,y_hat))   #  0.7722513089005235

['3' '2' '3' '1' '3' '2' '1' '2' '1' '3']
['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']
[[ 88  31   1]
 [ 12  98  24]
 [  2  17 109]]
              precision    recall  f1-score   support

           1       0.86      0.73      0.79       120
           2       0.67      0.73      0.70       134
           3       0.81      0.85      0.83       128

    accuracy                           0.77       382
   macro avg       0.78      0.77      0.77       382
weighted avg       0.78      0.77      0.77       382

정확도(Accuracy) :  0.7722513089005235


In [14]:
###### 4. 로지스틱회귀분석 ######

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=4)
lr.fit(X_train,y_train)
y_hat = lr.predict(X_test)
print(y_hat[:10])           # ['3' '2' '3' '1' '2' '1' '1' '1' '1' '3']
print(y_test.values[:10])   # ['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']


# 모형 성능 평가
from sklearn.metrics import confusion_matrix
confmat = confusion_matrix(y_test,y_hat)
print(confmat)
'''
[[103  17   0]
 [ 20  94  20]
 [  1  26 101]]
'''
logistic_report = metrics.classification_report(y_test, y_hat)
print(logistic_report)
'''
              precision    recall  f1-score   support

           1       0.83      0.86      0.84       120
           2       0.69      0.70      0.69       134
           3       0.83      0.79      0.81       128

    accuracy                           0.78       382
   macro avg       0.78      0.78      0.78       382
weighted avg       0.78      0.78      0.78       382
'''
print("정확도(Accuracy) : ", accuracy_score(y_test,y_hat))    # 0.7801047120418848

['3' '2' '3' '1' '2' '1' '1' '1' '1' '3']
['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']
[[103  17   0]
 [ 20  94  20]
 [  1  26 101]]
              precision    recall  f1-score   support

           1       0.83      0.86      0.84       120
           2       0.69      0.70      0.69       134
           3       0.83      0.79      0.81       128

    accuracy                           0.78       382
   macro avg       0.78      0.78      0.78       382
weighted avg       0.78      0.78      0.78       382

정확도(Accuracy) :  0.7801047120418848


In [16]:
###### 5. GradientBoostingClassifier 모델 ######

from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()
clf.fit(X_train,y_train)
y_hat = clf.predict(X_test)
print(y_hat[:10])          # ['3' '2' '3' '1' '2' '1' '1' '2' '1' '3']
print(y_test[:10].values)  # ['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']

# 모형 성능 평가
from sklearn.metrics import confusion_matrix
confmat = confusion_matrix(y_test,y_hat)
print(confmat)
'''
[[100  19   1]
 [ 20  92  22]
 [  1  16 111]]
'''
GBC_report = metrics.classification_report(y_test, y_hat)
print(GBC_report)
'''
              precision    recall  f1-score   support

           1       0.83      0.83      0.83       120
           2       0.72      0.69      0.70       134
           3       0.83      0.87      0.85       128

    accuracy                           0.79       382
   macro avg       0.79      0.80      0.79       382
weighted avg       0.79      0.79      0.79       382
'''

# 정확도 출력하기
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("정확도(Accuracy) :", accuracy_score(y_test,y_hat))   # 0.7931937172774869

['3' '2' '3' '1' '2' '1' '1' '2' '1' '3']
['3' '2' '2' '1' '2' '1' '2' '2' '1' '3']
[[101  18   1]
 [ 20  92  22]
 [  1  16 111]]
              precision    recall  f1-score   support

           1       0.83      0.84      0.83       120
           2       0.73      0.69      0.71       134
           3       0.83      0.87      0.85       128

    accuracy                           0.80       382
   macro avg       0.80      0.80      0.80       382
weighted avg       0.79      0.80      0.79       382

정확도(Accuracy) : 0.7958115183246073
