# Import

In [1]:
import os
import re
import sys
from collections import defaultdict

import pandas as pd

# Declare Path

In [2]:
root_dir = '/Users/jake/OneDrive - leverage innovative users/Documents/News_Item/University/'
dirs = [x for x in os.listdir(root_dir) if x != '.DS_Store']
files = {}

# for d in dirs:
d = 'c1_registration'
dir_path = os.path.join(root_dir, d)
files[d] = [os.path.join(root_dir, d, x) for x in os.listdir(dir_path) if x != '.DS_Store' and x.endswith('.tsv')]

files

{'c1_registration': ['/Users/jake/OneDrive - leverage innovative users/Documents/News_Item/University/c1_registration/2018_registration.tsv',
  '/Users/jake/OneDrive - leverage innovative users/Documents/News_Item/University/c1_registration/2017_registration.tsv',
  '/Users/jake/OneDrive - leverage innovative users/Documents/News_Item/University/c1_registration/2016_registration.tsv']}

# Load Files

In [3]:
pd.set_option('display.max_row', 50)
df = defaultdict(dict)
for dir_name, file_list in files.items():
    key = dir_name.split('_')[-1]
    for file in file_list:
        file_name = re.search(r'(\d{4})', file.split('/')[-1]).group(1)
        df[file_name] = pd.read_csv(file, sep='\t')

df

defaultdict(dict,
            {'2018':      기준년도       학교종류 설립구분  지역  상태                학교  입학금 (A)  수업료 (B)  \
             0    2018        대학교   사립  경남  기존     가야대학교(김해) _본교    480.0  6,683.1   
             1    2018        대학교   사립  경기  기존         가천대학교 _본교    742.0  8,219.4   
             2    2018        대학교   사립  강원  기존      가톨릭관동대학교 _본교    616.0    7,145   
             3    2018        대학교   사립  경기  기존        가톨릭대학교 _본교    739.0  6,909.4   
             4    2018        대학교   사립  서울  기존     가톨릭대학교 _제2캠퍼스    739.0    8,910   
             5    2018        대학교   사립  서울  기존     가톨릭대학교 _제3캠퍼스    739.0    6,122   
             6    2018        대학교   사립  서울  기존      감리교신학대학교 _본교    617.0    6,100   
             7    2018        대학교   사립  경기  기존         강남대학교 _본교    750.0  7,383.6   
             8    2018        대학교   국립  강원  기존       강릉원주대학교 _본교      0.0  4,253.3   
             9    2018        대학교   국립  강원  기존    강릉원주대학교 _제2캠퍼스      0.0  4,292.1   
             10   2018      

# Functions

In [4]:
def convert_str2int(d, cols):
    d.iloc[::,6:] = d.iloc[::,6:].applymap(lambda x: pd.to_numeric(x.replace(',', '') if type(x)!=int and type(x)!=float else x))

# Index and Columns

In [5]:
df['2016'].columns

Index(['기준년도', '학교종류', '설립구분', '지역', '상태', '학교', '입학금 (A)', '수업료 (B)',
       '기성회비＊ (C)', '등록금 (D=B+C)', '인문사회', '자연과학', '예체능', '공학', '의학'],
      dtype='object')

In [6]:
df['2017'].columns

Index(['기준년도', '학교종류', '설립구분', '지역', '상태', '학교', '입학금 (A)', '수업료 (B)',
       '등록금 (D=B)', '인문사회', '자연과학', '예체능', '공학', '의학'],
      dtype='object')

In [7]:
# 13 univ
univ_names = ['서울', '연세', '고려', '서강', '성균관', '한양', '중앙', '경희', '한국외국어', '서울시립', '건국', '동국', '홍익']
univ = [u + '대학교 _본교' for u in univ_names]

idx_2016 = df['2016']['학교'].isin(univ)
idx_2017 = df['2017']['학교'].isin(univ)
idx_2018 = df['2018']['학교'].isin(univ)
idx_public1_2018 = df['2018']['설립구분'].str.contains('국립')
idx_public2_2018 = df['2018']['설립구분'].str.contains('공립')
idx_public3_2018 = df['2018']['설립구분'].str.contains('법인')

idx_public_2018 = idx_public1_2018 | idx_public2_2018 | idx_public3_2018

cols = ['학교', '총입학자수', '일반고 학생수', '일반고 비율',
       '과학고 학생수', '과학고 비율', '외고· 국제고 학생수', '외고· 국제고 비율', '예술· 체육고 학생수',
       '예술· 체육고 비율', '산업수요맞춤형 고등학교 학생수', '산업수요맞춤형 고등학교 비율', '특성화고 학생수',
       '특성화고 비율', '자율고 학생수', '자율고 비율', '영재학교 학생수', '영재학교 비율', '검정고시 학생수',
       '검정고시 비율', '그 외 기타 학생수', '그 외 기타 비율']

cols_numeric = [
    '입학금 (A)', '수업료 (B)',
       '등록금 (D=B)', '인문사회', '자연과학', '예체능', '공학', '의학'
]

# Pre-process

In [8]:
# df['2016_submission'][cols_sub] = df['2016_submission'][cols_sub].applymap(lambda x: pd.to_numeric(x.replace(',', '') if type(x)!=int else x))
convert_str2int(df['2016'], cols_numeric)
convert_str2int(df['2017'], cols_numeric)
convert_str2int(df['2018'], cols_numeric)

# Export TSV

In [9]:
# name0 = 'c1-11_public_univ_registration_fee.tsv'
# file0 = os.path.join(root_dir, name0)

# df['2018'].loc[idx_public_2018].sort_values('등록금 (D=B)', ascending=True)[:10].to_csv(file0, sep='\t', index=False)

# -------------------------------------------------------------------

In [10]:
df['2018'].loc[df['2018']['학교'].str.contains('포항')]

Unnamed: 0,기준년도,학교종류,설립구분,지역,상태,학교,입학금 (A),수업료 (B),등록금 (D=B),인문사회,자연과학,예체능,공학,의학
206,2018,대학교,사립,경북,기존,포항공과대학교 _본교,539.0,5580.0,5580.0,0.0,5580.0,0.0,5580.0,0.0


In [11]:
df['2018'].loc[idx_public_2018].sort_values('등록금 (D=B)', ascending=True)

Unnamed: 0,기준년도,학교종류,설립구분,지역,상태,학교,입학금 (A),수업료 (B),등록금 (D=B),인문사회,자연과학,예체능,공학,의학
62,2018,대학교,특별법법인,대구,기존,대구경북과학기술원 _본교,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213,2018,방송통신대학,국립,서울,기존,한국방송통신대학교 _본교,0.0,754.6,754.6,731.2,732.0,0.0,2183.3,0.0
43,2018,대학교,특별법법인,광주,기존,광주과학기술원 _본교,0.0,2060.0,2060.0,0.0,2060.0,0.0,2060.0,0.0
113,2018,대학교,공립,서울,기존,서울시립대학교 _본교,0.0,2390.7,2390.7,2044.0,2409.5,2919.0,2701.0,0.0
95,2018,교육대학,국립,부산,기존,부산교육대학교 _본교,0.0,3024.0,3024.0,3024.0,3024.0,3024.0,0.0,0.0
108,2018,교육대학,국립,서울,기존,서울교육대학교 _본교,0.0,3088.0,3088.0,3088.0,0.0,0.0,0.0,0.0
209,2018,대학교,국립,충북,기존,한국교원대학교 _본교,0.0,3182.8,3182.8,2874.0,3473.1,3799.6,3600.0,0.0
199,2018,교육대학,국립,강원,기존,춘천교육대학교 _본교,0.0,3185.0,3185.0,3185.0,3185.2,3184.8,0.0,0.0
28,2018,교육대학,국립,인천,기존,경인교육대학교 _본교,0.0,3189.0,3189.0,3189.0,0.0,0.0,0.0,0.0
219,2018,대학교,특별법국립,충남,기존,한국전통문화대학교 _본교,0.0,3209.0,3209.0,2713.1,3563.5,3355.5,3425.6,0.0


In [15]:
df['2018'].loc[idx_2018].sort_values('등록금 (D=B)', ascending=True)

Unnamed: 0,기준년도,학교종류,설립구분,지역,상태,학교,입학금 (A),수업료 (B),등록금 (D=B),인문사회,자연과학,예체능,공학,의학
113,2018,대학교,공립,서울,기존,서울시립대학교 _본교,0.0,2390.7,2390.7,2044.0,2409.5,2919.0,2701.0,0.0
110,2018,대학교,국립대법인,서울,기존,서울대학교 _본교,0.0,6011.4,6011.4,5015.3,6110.4,7392.8,5996.0,9896.3
218,2018,대학교,사립,서울,기존,한국외국어대학교 _본교,838.0,7125.4,7125.4,6945.4,7711.0,0.0,8437.0,0.0
31,2018,대학교,사립,서울,기존,경희대학교 _본교,766.0,7804.8,7804.8,6356.7,8419.8,8458.7,8580.0,10802.9
185,2018,대학교,사립,서울,기존,중앙대학교 _본교,824.0,7883.2,7883.2,6812.0,8480.1,8893.4,9036.0,10992.0
74,2018,대학교,사립,서울,기존,동국대학교 _본교,860.0,7939.2,7939.2,6938.0,8383.7,8842.9,9160.0,0.0
105,2018,대학교,사립,서울,기존,서강대학교 _본교,813.0,7951.5,7951.5,7222.9,8436.0,0.0,9386.0,0.0
12,2018,대학교,사립,서울,기존,건국대학교 _본교,787.0,8192.3,8192.3,6781.9,8124.8,9082.8,9218.9,10180.0
242,2018,대학교,사립,서울,기존,홍익대학교 _본교,837.0,8254.0,8254.0,6848.0,6848.0,8984.0,8807.7,0.0
34,2018,대학교,사립,서울,기존,고려대학교 _본교,836.9,8257.7,8257.7,7147.4,8668.5,8928.2,9680.0,12414.0


In [16]:
df['2018'].loc[~idx_2018].sort_values('등록금 (D=B)', ascending=True)

Unnamed: 0,기준년도,학교종류,설립구분,지역,상태,학교,입학금 (A),수업료 (B),등록금 (D=B),인문사회,자연과학,예체능,공학,의학
42,2018,대학교,사립,전남,기존,광주가톨릭대학교 _본교,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,2018,대학교,특별법법인,대구,기존,대구경북과학기술원 _본교,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180,2018,기술대학,사립,서울,기존,정석대학 _본교,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213,2018,방송통신대학,국립,서울,기존,한국방송통신대학교 _본교,0.0,754.6,754.6,731.2,732.0,0.0,2183.3,0.0
187,2018,대학교,사립,경기,기존,중앙승가대학교 _본교,168.0,1760.0,1760.0,1760.0,0.0,0.0,0.0,0.0
152,2018,대학교,사립,전남,기존,영산선학대학교 _본교,0.0,2000.0,2000.0,2000.0,0.0,0.0,0.0,0.0
43,2018,대학교,특별법법인,광주,기존,광주과학기술원 _본교,0.0,2060.0,2060.0,0.0,2060.0,0.0,2060.0,0.0
111,2018,사이버대학(대학),사립,서울,기존,서울디지털대학교 _본교,300.0,2091.8,2091.8,2093.9,0.0,2088.5,0.0,0.0
216,2018,사이버대학(대학),사립,서울,기존,한국열린사이버대학교 _본교,250.0,2120.6,2120.6,2120.6,2120.7,0.0,0.0,0.0
36,2018,사이버대학(대학),사립,서울,기존,고려사이버대학교 _본교,100.0,2184.0,2184.0,2185.5,0.0,0.0,2181.0,0.0


In [17]:
df['2017'].loc[~idx_2017].sort_values('등록금 (D=B)', ascending=True)

Unnamed: 0,기준년도,학교종류,설립구분,지역,상태,학교,입학금 (A),수업료 (B),등록금 (D=B),인문사회,자연과학,예체능,공학,의학
42,2017,대학교,사립,전남,기존,광주가톨릭대학교 _본교,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,2017,대학교,특별법법인,대구,기존,대구경북과학기술원 _본교,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,2017,기술대학,사립,서울,기존,정석대학 _본교,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,2017,방송통신대학,국립,서울,기존,한국방송통신대학교 _본교,7.0,749.9,749.9,724.9,732.0,0.0,2183.3,0.0
190,2017,대학교,사립,경기,기존,중앙승가대학교 _본교,210.0,1760.0,1760.0,1760.0,0.0,0.0,0.0,0.0
155,2017,대학교,사립,전남,기존,영산선학대학교 _본교,150.0,2000.0,2000.0,2000.0,0.0,0.0,0.0,0.0
43,2017,대학교,특별법법인,광주,기존,광주과학기술원 _본교,0.0,2060.0,2060.0,0.0,2060.0,0.0,2060.0,0.0
114,2017,사이버대학(대학),사립,서울,기존,서울디지털대학교 _본교,300.0,2090.0,2090.0,2092.6,0.0,2085.9,0.0,0.0
219,2017,사이버대학(대학),사립,서울,기존,한국열린사이버대학교 _본교,250.0,2119.3,2119.3,2121.7,2112.9,0.0,0.0,0.0
36,2017,사이버대학(대학),사립,서울,기존,고려사이버대학교 _본교,100.0,2185.1,2185.1,2180.0,0.0,0.0,2196.0,0.0
