In [None]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup

import urllib
import tqdm

In [None]:
# AJAX endpoint of the "previous years" statistics page; the cells below
# query it with GET parameters.
FELVI_URL = (
    'http://www.felvi.hu/felveteli/ponthatarok_rangsorok/elmult_evek/'
    '!ElmultEvek/ajax_back.php'
)

# Request headers sent with every call to FELVI_URL (a desktop Chrome
# User-Agent plus matching Accept-* values).
# NOTE(review): purpose of the 'x-prototype.test' header is not visible
# here -- confirm whether the endpoint requires it.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/41.0.2272.118 Safari/537.36'
    ),
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8,hu;q=0.6',
    'x-prototype.test': 'x-prototype.test',
}

In [None]:
# Fetch the filter form and keep its <select> elements in `content`.
session = requests.session()
# Load the public page on the same session before calling the AJAX endpoint.
# NOTE(review): presumably needed to obtain session cookies -- confirm.
session.get('http://www.felvi.hu/felveteli/ponthatarok_rangsorok/elmult_evek/!ElmultEvek/elmult_evek.php?stat=13')
response = session.get(FELVI_URL + '?stat=13', headers=headers)
# Fail right here on an HTTP error instead of leaving `content` undefined
# (or stale from an earlier run) for the cells that iterate over it.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
content = soup.findAll('select')

In [None]:
# For every <select> in `content`: select name -> {option value: option text}.
id_mapping = {
    select.get('name'): {
        option.get('value'): option.getText()
        for option in select.findAll('option')
    }
    for select in content
}

In [None]:
# Display the option text stored under value '0' of the 'sta_kar_id' select
# (the scraping loop below skips this value).
id_mapping['sta_kar_id']['0']

# Get actual data

In [None]:
# Start a fresh session and load the public page once before the data requests.
session = requests.session()
session.get(
    'http://www.felvi.hu/felveteli/ponthatarok_rangsorok/elmult_evek/'
    '!ElmultEvek/elmult_evek.php?stat=13'
)

# Query-parameter template. The scraping loop overwrites 'sta_kar_id',
# 'sta_ev' and 'oldal' before each request; the other entries stay fixed.
data = {
    'rendez_mit': 'sta_sorrend',
    'rendez_hogy': 'ASC',
    'sta_ev': '0',
    # 'sta_int_id': '7',  (disabled parameter, kept for reference)
    'sta_kar_id': '10',
    'stat': '13',
    'oldal': '1',
    'elsokor': '',
}

# Filled by the scraping loop: faculty label -> DataFrame of scraped rows.
DATA = {}

In [None]:
for f_id, faculty in id_mapping['sta_kar_id'].iteritems():
    if f_id == '0':
        continue
    print faculty,
    faculty_data = pd.DataFrame()
    data['sta_kar_id'] = f_id
    for y_id, year in id_mapping['sta_ev'].iteritems():
        if y_id == '0':
            continue
        print year,
        data['sta_ev'] = y_id.encode('utf-8')
        for page in xrange(1, 100):
            data['oldal'] = page
            print page,
            encoded = urllib.urlencode(data) + '&php_self=/felvi/statisztika//elmult_evek.php'
            response = session.get(FELVI_URL + '?' + encoded, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                content = soup.find('table', attrs ={'class': 'tblc'})
                if content is None:
                    break
                df = pd.io.html.read_html(
                        io=content.prettify().encode('utf-8'), 
                        encoding='utf-8'
                )[0]
                df = df[2:]
                faculty_data = pd.concat((
                    faculty_data,
                    df
                ))
        
    DATA[faculty] = faculty_data
    print
    print 'Done.' 

In [None]:
BASE_URI = u'./data/karok/'
DATA_len = len(DATA)

for i, (faculty, df) in enumerate(DATA.iteritems()):
    print u'Saving {} ({}/{})...'.format(faculty, i+1, DATA_len),
    df.to_excel(BASE_URI + faculty + '.xls')
    print 'Done.'

In [None]:
allin = pd.DataFrame()
faculties = sorted(DATA.keys())
for i, faculty in enumerate(faculties):
    print 'Concatting {}/{}...'.format(i+1, DATA_len),
    allin = pd.concat((DATA[faculty], allin))
    print 'Done.'

print 'Saving...',
allin.to_excel(BASE_URI + 'allin.xlsx')
print 'Done.'

# Scraping training locations (képzési helyek)

In [None]:
# Search endpoint; `request` POSTs the form below to it.
# (Rebinds the FELVI_URL name used by the earlier scraping cells.)
FELVI_URL = 'http://www.felvi.hu/felveteli/szakok_kepzesek'

# work schedule codes, sent as the 'elj' form field
eljs = [
    'N',
    'L',
    'E',
]

# major names
majors = [
    u'építőmérnök',
    u'kertészmérnöki',
    u'környezetgazdálkodási agrármérnök',
    u'környezetmérnök',
    u'környezettan',
    u'mezőgazdasági mérnöki',
    u'természetvédelmi mérnöki',
    u'vegyészmérnök',
]

# format of the POSTed form (template values)
data = {
    'elj': 'N',
    'mehet': '1',
    'reszletese': '',
    'szaknev': 'vegyészmérnök',
    'intId': '999999',
    'oldal': '0',
    'limit': '1000',
    'dom': '0',
}

In [None]:
def request(major):
    print major, 'download process started'
    print
    content = {}
    for elj in eljs:
        print 'Request started'
        data['elj'] = elj
        response = requests.post(FELVI_URL, data=data)
        if response.status_code == 200:
            print 'Page downloaded:', elj
            soup = BeautifulSoup(response.content)
            content[elj] = soup.find(id='talalatilista').find('table')
            print 'Content extracted'
    print 'Done.'
    return content

In [None]:
# Download the tables per major: major name -> result of request(major).
# NOTE(review): the [:1] slice restricts the run to the first major --
# presumably a leftover debugging limit; confirm before a full run.
content = {major: request(major) for major in majors[:1]}

In [None]:
# Take the first <table> inside the element with id 'talalatilista'.
# NOTE(review): `soup` here is the notebook-level name, last assigned by the
# scraping cells further up; the `soup` created inside request() is local to
# that function and not visible here. Confirm which page this should parse.
a = soup.find(id='talalatilista').find('table')

In [None]:
# read_html returns a list of DataFrames; take the first one so that `df` is
# a DataFrame (as in the scraping loop) and row slicing such as df[2:] acts
# on table rows instead of on the list.
df = pd.io.html.read_html(io=a.prettify().encode('utf-8'), encoding='utf-8')[0]

In [None]:
# Python 2 only: reload `sys` and switch the interpreter-wide default string
# encoding to UTF-8.
# NOTE(review): this sits mid-notebook, so on a top-to-bottom run only the
# cells after it execute with the changed default; it is a process-wide
# setting that also affects every imported library -- confirm it is needed.
import sys
reload(sys) 
sys.setdefaultencoding("utf-8")

In [None]:
# Display `df` from position 2 onward -- the same `[2:]` slice the scraping
# loop applies to each parsed page (presumably to skip header rows; confirm).
df[2:]

---

# Aggregating data

## Preparing df

In [None]:
# Name the eight scraped columns, then derive typed helper columns.
allin.columns = ['institute', 'period', 'major', 'types',
                 'applied', 'first', 'accepted', 'threshold']

# 'n.i.' thresholds are treated as missing. Assign the result back instead of
# calling replace(inplace=True) on an attribute-accessed column, which is not
# guaranteed to write through to `allin`.
allin['threshold'] = allin['threshold'].replace('n.i.', np.nan)

# Year is the integer in front of the first '/' of the period string.
allin['year'] = allin['period'].map(lambda period: int(period.split('/')[0]))

# 'types' is read positionally: characters 0, 1, 2 -> level, form, finance.
for position, part in enumerate(['level', 'form', 'finance']):
    allin[part] = allin['types'].str[position]

# Builtin `float` instead of the `np.float` alias: same float64 dtype, but the
# alias no longer exists in current NumPy releases.
for col in ['applied', 'first', 'accepted', 'threshold']:
    allin[col] = allin[col].astype(float)

allin.head()

## Aggregating

In [None]:
# Grouping keys for each aggregate table, from coarsest to finest.
_by_major = ['year', 'major']
_by_level = _by_major + ['level']
_by_institute = _by_level + ['institute']

groups = [
    list(_by_major),
    list(_by_level),
    _by_level + ['form'],
    _by_level + ['finance'],
    _by_level + ['form', 'finance'],
    list(_by_institute),
    _by_institute + ['form'],
    _by_institute + ['finance'],
    _by_institute + ['form', 'finance'],
]

# Aggregation per value column: counts are summed, the threshold is averaged.
stats = {
    'applied': 'sum',
    'first': 'sum',
    'accepted': 'sum',
    'threshold': 'mean',
}

## Saving

In [None]:
# Write one sheet per grouping into a single workbook.
with pd.ExcelWriter('./data/stats2017.xlsx') as writer:
    for group in tqdm.tqdm(groups):
        # Sheet name: first two letters of every key, upper-cased, joined by '_'.
        sheetname = '_'.join(key[:2].upper() for key in group)
        aggregated = allin.groupby(group).agg(stats).reset_index()
        aggregated.to_excel(writer, sheet_name=sheetname, encoding='utf-8', index=False)

## Loading & Reaggregating

In [None]:
# Read every sheet of the stats workbook back: sheet name -> DataFrame.
# (Rebinds DATA, which earlier held the per-faculty scraped frames.)
infile = './data/stats2017.xlsx'
DATA = {}
with pd.ExcelFile(infile) as reader:
    for sheet in reader.sheet_names:
        # ExcelFile.parse takes the sheet positionally, so this does not
        # depend on the `sheetname=` keyword of read_excel, which newer pandas
        # renamed to `sheet_name=` and later dropped.
        DATA[sheet] = reader.parse(sheet)

In [None]:
# Reverse lookup for sheet-name tokens: two-letter upper-case prefix -> column.
sheet_mapping = dict(
    (name[:2].upper(), name)
    for name in ('year', 'major', 'level', 'institute', 'form', 'finance')
)

In [None]:
# Re-aggregate the 'turizmus*' majors under unified names and write them to
# their own workbook, one sheet per grouping read back into DATA.
with pd.ExcelWriter('./data/stats2017turizmus.xlsx') as writer:
    for sheetname, df in tqdm.tqdm(DATA.items()):
        if sheetname == u'YE_MA':
            # This sheet has no 'LE' (level) token, and the relabelling below
            # is keyed on the level column.
            continue
        # Translate the sheet-name tokens back to column names. The lookup
        # dict is `sheet_mapping`; the name `mapping` is not defined anywhere
        # in the notebook and raised NameError.
        group = [sheet_mapping[token] for token in sheetname.split('_')]

        turizmus = df.major.str.startswith('turizmus')
        level_m = df.level == 'M'
        level_af = df.level.isin(['A', 'F'])

        # Unify the major name per level (modifies the frame held in DATA).
        df.loc[level_m & turizmus, 'major'] = 'turizmus-menedzsment'
        df.loc[level_af & turizmus, 'major'] = 'turizmus-vendéglátás'

        (df.loc[(level_af | level_m) & turizmus]
           .groupby(group, as_index=False)
           .agg(stats)
           .to_excel(writer, sheet_name=sheetname, encoding='utf-8', index=False))