In [1]:
import numpy  as np
import pandas as pd

print('numpy  version - ' , np.__version__) 
print('pandas version - ' , pd.__version__) 

from   io import StringIO
import missingno as msno
# ml
import sklearn
from   sklearn.datasets import load_iris, load_breast_cancer

print('sklearn version - ' , sklearn.__version__)

from sklearn.model_selection import train_test_split, KFold , StratifiedKFold , cross_val_score, cross_validate, GridSearchCV 
from sklearn.tree            import DecisionTreeClassifier
from sklearn.ensemble        import RandomForestClassifier         
from sklearn.linear_model    import LogisticRegression , LinearRegression


from sklearn.metrics         import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, make_scorer , precision_recall_curve 
from sklearn.impute          import SimpleImputer

from sklearn.preprocessing   import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

from IPython.display import Image

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

numpy  version -  1.20.3
pandas version -  1.3.4
sklearn version -  0.24.2


## 고등학교 진학률 데이터를 활용하여 속성이 비슷한 중학교끼리 클러스터링
- 데이터 인코딩, 지도시각화(위도, 경도, folium)

### 진행절차
#### 01. 데이터 전처리 - (지역, 코드 , 유형, 주야)
#### 02. 군집모형생성 - 분석에 사용할 피처는 과학고, 외고국제고, 자사고 진학률
#### 03. 표준화
#### 04. 모형 객체 생성
#### 05. 모형 학습
#### 06. 예측
#### 07. 예측 결과를 데이터 프레임에 추가
#### 08. 클러스터 값으로 그룹화, 그룹별 내용 출력
#### 09. 지도 그래프 시각화

In [3]:
# Read the graduates report workbook into `school`, then keep an independent
# working copy in `school_frm` so later in-place edits do not touch the raw
# frame.
# NOTE(review): the path is relative to the notebook's working directory; the
# recorded run of this cell failed with FileNotFoundError, so confirm the
# directory (and the 'shcool' spelling of the file name) before re-running.
school = pd.read_excel('2022-01-19/data/middle_shcool_graduates_report.xlsx')
school_frm = school.copy()
# Preview the first row.
school_frm.head(1)

FileNotFoundError: [Errno 2] No such file or directory: '2022-01-19/data/middle_shcool_graduates_report.xlsx'

In [None]:
# Print the column summary, then remove the 'Unnamed: 0' column in place.
# NOTE(review): presumably a saved index column from the spreadsheet - confirm.
school_frm.info()
school_frm.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
print('01. 데이터 전처리 - (지역, 코드 , 유형, 주야)')

# One encoder instance is refit for every column, so after the loop it only
# remembers the classes of the last column ('주야').
label_encoder = LabelEncoder()
for categorical_column in ('지역', '코드', '유형', '주야'):
    school_frm[categorical_column] = label_encoder.fit_transform(
        school_frm[categorical_column]
    )

In [None]:
def feature_scaling(method='None' , input_data = None) :
    """Scale ``input_data`` with the scaler selected by ``method``.

    Parameters
    ----------
    method : str
        ``'standard'`` selects ``StandardScaler`` and ``'minmax'`` selects
        ``MinMaxScaler``. Any other value (including the default ``'None'``)
        selects nothing.
    input_data : array-like
        Data handed to the scaler's ``fit_transform``.

    Returns
    -------
    The scaler's ``fit_transform`` result, or ``None`` when ``method`` is not
    one of the two recognised names.
    """
    if method == 'standard':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    else:
        # Unrecognised method: nothing is scaled and nothing is returned.
        return None
    return scaler.fit_transform(input_data)

In [None]:
print('02. 군집모형생성 - 분석에 사용할 피처는 과학고, 외고국제고, 자사고 진학률')

# Keep only the three columns that are fed to the clustering model.
school_data = school_frm.loc[:, ['과학고', '외고_국제고', '자사고']]
school_data

In [None]:
print('03. 표준화')

# Standardise the selected feature columns with the notebook's
# feature_scaling helper.
school_scaler = feature_scaling('standard', school_data)
school_scaler

In [None]:
print('04. 모형 객체 생성')
print('군집모형 - ')
# Pin random_state so the cluster ids (and therefore the marker colours that
# cid_color assigns on the map) are the same on every run. n_init is spelled
# out because its default differs between scikit-learn releases; 10 is the
# default of the 0.24 release this notebook reports.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

print('05. 모형 학습')
# Fit on the standardised feature matrix.
kmeans.fit(school_scaler)

print('06. 모형 예측')
print('군집예측 - ')
# labels_ holds the cluster id assigned to each training row.
print(kmeans.labels_)

In [None]:
print('07. 예측 결과를 데이터 프레임에 추가')
# Store each row's cluster id from the fitted model in a 'cid' column.
school_frm['cid'] = kmeans.labels_
# Display the frame with the new column.
school_frm

In [None]:
print('08. 클러스터 값으로 그룹화, 그룹별 내용 출력')
# Reduce the scaled features to two principal components.
school_pca = PCA(n_components = 2) 

# Fit the PCA on the scaled features and project them in one step.
school_pca_trans  = school_pca.fit_transform(school_scaler)
school_pca_trans

In [None]:
# Copy each PCA component into its own column of the working frame:
# component 0 -> 'std_pca_x', component 1 -> 'std_pca_y'.
for component_index, column_name in enumerate(('std_pca_x', 'std_pca_y')):
    school_frm[column_name] = school_pca_trans[:, component_index]
school_frm

In [None]:
# Cluster the schools again in the 2-D PCA space.
# The PCA columns are selected by name rather than by position: a positional
# "last two columns" slice silently picks other columns whenever a column is
# appended after them or the cells are run in a different order.
# random_state / n_init are fixed so the result is reproducible and does not
# depend on the scikit-learn release's default.
school_kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
school_kmeans.fit(school_frm[['std_pca_x', 'std_pca_y']])

In [None]:
plt.figure(figsize=(20, 10))

# Schools in PCA space, coloured by the cluster id stored in 'cid'
# (assigned by `kmeans`, the model fitted on the standardised features).
plt.scatter(school_frm['std_pca_x'],
            school_frm['std_pca_y'],
            c=school_frm['cid'])

# Draw the centroids of the same model that produced 'cid'. Its centres live
# in the standardised feature space, so they are projected with the fitted
# PCA to land in the coordinates of the scatter above. (Centres taken from a
# separately fitted model would not correspond to the colours shown.)
cid_centers_2d = school_pca.transform(kmeans.cluster_centers_)
plt.scatter(cid_centers_2d[:, 0],
            cid_centers_2d[:, 1],
            marker='^',
            c=['r', 'b', 'y'],
            s=100)

plt.show()
plt.close()

In [None]:
import folium

# Show the distinct cluster ids present in the 'cid' column.
pd.unique(school_frm['cid'])

In [None]:
print("09. 지도 그래프 시각화")
# Base map that the school markers are added to.
# NOTE(review): the centre coordinates are presumably central Seoul - confirm.
school_map = folium.Map(location=[37.56639984255284, 126.97796323615714],
                        zoom_start = 12,
                        tiles = 'cartodb positron') 

def cid_color(x):
    """Return the folium marker icon for cluster id ``x``.

    Cluster 0 gets a 'darkpurple' icon, 1 an 'orange' icon and 2 a 'pink'
    icon. Any other value matches nothing and yields ``None``.
    """
    cluster_colors = ((0, 'darkpurple'), (1, 'orange'), (2, 'pink'))
    for cluster_id, icon_color in cluster_colors:
        if x == cluster_id:
            return folium.Icon(color=icon_color)
    return None
            

# Place one marker per school: positioned at its latitude/longitude, labelled
# with the school name, and given the icon chosen by cid_color for its cluster.
school_rows = zip(
    school_frm['학교명'],
    school_frm['위도'],
    school_frm['경도'],
    school_frm['cid'],
)
for school_name, latitude, longitude, cluster_id in school_rows:
    school_marker = folium.Marker(
        location=[latitude, longitude],
        popup=school_name,
        icon=cid_color(cluster_id),
    )
    school_map.add_child(school_marker)

# Render the finished map as the cell output.
school_map

In [None]:
# 개원, 개포: 핑크(pink) 마커 = cid 2 클러스터

In [None]:
# 서울대학교 사범대학 부설중학교: 주황(orange) 마커 = cid 1 클러스터