In [1]:
import pandas as pd
import numpy as np
import urllib
import requests
import json
import re
from bs4 import BeautifulSoup
import threading
import gc
import time

from requests import ConnectionError, ReadTimeout

class Request(object):
    """Retrying wrapper around a ``requests``-style session.

    ``get`` and ``post`` forward to the wrapped session, add a ``Host`` header
    derived from the URL, and try again whenever the session raises
    ``ConnectionError`` or ``ReadTimeout``.
    """

    # Browser-like headers used when the caller passes no ``headers``.
    # NOTE(review): 'Accept' is sent as '* / *' (with spaces), unchanged from the
    # previous code; the conventional wildcard is '*/*' - confirm the intent.
    DEFAULT_HEADERS = {
        'Accept': '* / *',
        'Accept-Language': 'zh-TW, zh; q=0.9, en-US; q=0.8, en; q=0.7, zh-CN; q=0.6',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36',
    }

    def __init__(self, request_session):
        """Keep the session object whose ``get``/``post`` perform the actual I/O."""
        self.request_session = request_session

    @classmethod
    def _headers_for(cls, url, headers):
        """Return a new header dict for *url*; the caller's dict is never mutated."""
        # Local import: the documented replacement for the deprecated
        # urllib.request.splittype/splithost helpers used previously.
        from urllib.parse import urlsplit

        source = cls.DEFAULT_HEADERS if headers is None else headers
        # Drop any existing Host spelling so exactly one Host header is sent.
        prepared = {key: value for key, value in source.items()
                    if key.lower() != 'host'}
        host = urlsplit(url).netloc
        if host:
            prepared['Host'] = host
        return prepared

    def _send(self, verb, url, headers, timeout, allow_redirects,
              max_retries, retry_delay, echo_url=False, **extra):
        """Call ``session.<verb>`` until it succeeds or the retry budget is spent."""
        send = getattr(self.request_session, verb)
        prepared = self._headers_for(url, headers)
        failures = 0
        while True:
            if echo_url:
                print(url)
            try:
                return send(url, headers=prepared, timeout=timeout,
                            allow_redirects=allow_redirects, **extra)
            # Two separate clauses so the log prefix names the error kind.
            except ConnectionError as ce:
                print('ConnectionError: ' + str(ce))
                last_error = ce
            except ReadTimeout as rte:
                print('ReadTimeout: ' + str(rte))
                last_error = rte
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise last_error
            if retry_delay:
                time.sleep(retry_delay)

    def get(self, url, headers=None, timeout=10, allow_redirects=False,
            max_retries=None, retry_delay=0):
        """Issue a GET request, retrying on connection errors and read timeouts.

        Args:
            url: Target URL; it is printed before every attempt.
            headers: Optional header mapping. ``None`` selects
                ``DEFAULT_HEADERS``. The mapping is copied, never modified.
            timeout: Forwarded to the session.
            allow_redirects: Forwarded to the session.
            max_retries: Number of retries after the first failure. ``None``
                (default) retries until the request succeeds, as before.
            retry_delay: Seconds to sleep between attempts (default: none).

        Returns:
            Whatever the wrapped session's ``get`` returns.

        Raises:
            ConnectionError, ReadTimeout: Only when ``max_retries`` is set and
                exhausted; the last error is re-raised.
        """
        return self._send('get', url, headers, timeout, allow_redirects,
                          max_retries, retry_delay, echo_url=True)

    def post(self, url, data, headers=None, timeout=10, allow_redirects=False,
             max_retries=None, retry_delay=0):
        """Issue a POST request with *data*; same retry contract as ``get``.

        Unlike ``get`` the URL is not printed.
        """
        return self._send('post', url, headers, timeout, allow_redirects,
                          max_retries, retry_delay, data=data)

    def get_cookie_dict(self):
        """Return the session's cookies as a plain dict."""
        return requests.utils.dict_from_cookiejar(self.request_session.cookies)

    def clear_cookie(self):
        """Remove every cookie stored on the session."""
        self.request_session.cookies.clear()

In [ ]:
# Accumulator for the scraped college rows; starts as an empty frame with no columns.
df_colleges = pd.DataFrame()

In [ ]:
# Walk the paginated ranking listing and record, for every college row, its
# name, the link to its detail page and the link to its specialty page.
page_size = 20
# One wrapper is reused for all pages; cookies are cleared after each request
# so every page is still fetched without state from the previous one.
req = Request(requests.Session())
for page in range(0, 60):
    url = f'https://gaokao.chsi.com.cn/zyk/pub/myd/specAppraisalTop.action?xlcc=bk&start={page*page_size}'

    res = req.get(url, timeout=20)
    req.clear_cookie()

    soup = BeautifulSoup(res.text, 'html.parser')
    query_result = soup.find(id='queryResult')
    if query_result is None:
        # Without the result table there is nothing to parse on this page.
        print(f'No queryResult element on page {page}, skipping')
        continue

    page_rows = []
    for item in query_result.find_all('tr'):
        college_link = item.find('a', title='点击查看院校信息')
        if college_link is None:
            # Rows without the college link (e.g. header rows) carry no data.
            continue
        specialty_link = item.find('a', attrs={'class': 'check_detail'})
        page_rows.append({
            'Name': college_link.string.strip(),
            'DetailUrl': college_link['href'],
            'SpecialtyDetailUrl': specialty_link['href'],
        })

    # DataFrame.append no longer exists in pandas 2.x; add one page at a time
    # so progress survives an interruption of the crawl.
    if page_rows:
        page_frame = pd.DataFrame(page_rows)
        if df_colleges.empty:
            df_colleges = page_frame
        else:
            df_colleges = pd.concat([df_colleges, page_frame], ignore_index=True)

In [ ]:
# Show the collected college table as the cell output.
df_colleges

In [ ]:
# Persist the college table without the index column.
# NOTE(review): no visible cell creates the SUSTC_Journals directory - confirm it exists.
df_colleges.to_csv('SUSTC_Journals/本科院校名录.csv', index=False)

In [ ]:
# Fresh session wrapper used by the specialty-satisfaction crawl.
req = Request(requests.Session())

In [ ]:
# Accumulator for the per-specialty satisfaction rows; starts empty with no columns.
df_specialties = pd.DataFrame()

In [ ]:
%%time
# For every college, fetch its specialty page and record one row per specialty
# with the satisfaction figures of each category.

# Column prefixes (pinyin) for the four satisfaction tables of a row, in page
# order: overall, school-running, teaching, employment.
satisfy_cats = ['Zonghe', 'Banxue', 'Jiaoxue', 'Jiuye']

# Colleges that already have rows are skipped, so the cell can be re-run after
# an interruption. The frame has no 'CollegeName' column before the first row
# is stored, hence the explicit column check.
if 'CollegeName' in df_specialties.columns:
    done_colleges = set(df_specialties['CollegeName'])
else:
    done_colleges = set()

for _, college in df_colleges.iterrows():
    college_name = college['Name']
    if college_name in done_colleges:
        continue
    url = 'https://gaokao.chsi.com.cn' + college['SpecialtyDetailUrl']
    res = req.get(url, timeout=40)

    soup = BeautifulSoup(res.text, 'html.parser')
    query_result = soup.find(id='queryResult')
    if query_result is None:
        # Not marked as done, so a later duplicate entry or re-run retries it.
        print(f'No queryResult element for {college_name}, skipping')
        continue

    college_rows = []
    for item in query_result.find_all('tr', align='left'):
        td = item.find('td', attrs={'class': 'first_td'})
        if td is None:
            continue
        row = {'CollegeName': college_name, 'Specialty': td.string.strip()}

        query_subs = item.find_all('table', attrs={'class': 'zymydMoreTable'})

        # zip stops at the shorter sequence, so an unexpected extra table
        # cannot index past the known categories.
        for satisfy_cat, sub in zip(satisfy_cats, query_subs):
            avg_rank = sub.find('span', attrs={'class': 'avg_rank'})
            if avg_rank is not None and avg_rank.string is not None:
                row[f'{satisfy_cat}_avg_rank'] = avg_rank.string.strip()

            vote_num_detail = sub.find('span', attrs={'class': 'vote_num_detail'})
            if vote_num_detail is not None and vote_num_detail.string is not None:
                row[f'{satisfy_cat}_vote_num'] = vote_num_detail.string.strip()

            sub_ratings = sub.find_all('div', attrs={'class': 'progress_bar'})
            for sub_index, sub_rating in enumerate(sub_ratings):
                # The inline style looks like 'name: value%'; keep the value
                # with its last character removed. Bars are labelled 5 stars
                # first, counting down.
                row[f'{satisfy_cat}_{5-sub_index}_star_percent'] = sub_rating['style'].split(':')[1].strip()[:-1]

        college_rows.append(row)

    # DataFrame.append no longer exists in pandas 2.x; add one college at a
    # time so progress is kept if the crawl is interrupted.
    if college_rows:
        college_frame = pd.DataFrame(college_rows)
        if df_specialties.empty:
            df_specialties = college_frame
        else:
            df_specialties = pd.concat([df_specialties, college_frame], ignore_index=True)
    done_colleges.add(college_name)

In [ ]:
# Show the collected specialty-satisfaction table as the cell output.
df_specialties

In [ ]:
# Persist the specialty-satisfaction table without the index column.
# NOTE(review): no visible cell creates the SUSTC_Journals directory - confirm it exists.
df_specialties.to_csv('SUSTC_Journals/本科院校专业满意度.csv', index=False)

In [ ]:
# List the distinct specialty names that were scraped.
df_specialties['Specialty'].unique()

# 高考网专业分数线

In [2]:
# Session wrapper and column names for the specialty admission-score crawl.
req = Request(requests.Session())
# grab_grades reads only the first seven names, so 'Level' is currently unused.
# NOTE(review): a later cell rebinds `col_names` to a five-name list; re-run
# this cell before calling grab_grades again.
col_names = ['Specialty Name', 'College Name', 'Average Grade', 'Max Grade', 'Region', 'Category', 'Year', 'Level']


In [3]:
def grab_grades(year, start_page=7401, end_page=15000):
    """Scrape specialty admission scores for *year* and checkpoint them to CSV.

    Pages ``start_page`` .. ``end_page - 1`` of
    ``http://college.gaokao.com/spepoint/y{year}/p{page}`` are fetched in
    order, pausing five seconds after each page. The site's total page count
    is read from the first page that exposes it, and the crawl stops once that
    page has been passed. Collected rows are de-duplicated and written to
    ``SUSTC_Journals/本科院校专业录取分数线{year}_{page}.csv`` every 200 pages,
    on the site's last page, and once more at the end for any remaining rows.

    Args:
        year: Year segment of the URL and of the output file names.
        start_page: First page to fetch. The default is the previously
            hard-coded value.
        end_page: Exclusive upper bound of the page range. The default is the
            previously hard-coded value.

    Returns:
        None. The results are the CSV files.
    """
    import os

    # Fail fast on a missing output directory instead of after 200 pages.
    os.makedirs('SUSTC_Journals', exist_ok=True)

    # Snapshot the first seven global column names, so rebinding `col_names`
    # in another cell cannot change a crawl that is already running.
    columns = list(col_names[:7])
    # This function is run from several threads at once; each call gets its
    # own session rather than sharing (and clearing cookies on) one object.
    session_req = Request(requests.Session())
    # NOTE(review): 'sz*' means "s followed by any number of z", so it matches
    # every class containing an 's' - confirm this is the intended filter.
    row_class = re.compile('sz*')
    checkpoint_every = 200

    total_page = None  # unknown until a page shows the pager
    rows = []
    last_page = None

    for page in range(start_page, end_page):
        if total_page is not None and page > total_page:
            break
        url = f'http://college.gaokao.com/spepoint/y{year}/p{page}'
        res = session_req.get(url, timeout=20)
        session_req.clear_cookie()
        soup = BeautifulSoup(res.text, 'html.parser')
        last_page = page

        if total_page is None:
            pager = soup.find(id='qx')
            if pager is not None:
                try:
                    # The text before #pagenum is split on '/' and on the
                    # page character; the number in between is the page count.
                    total_page = int(pager.find(id='pagenum').previous_sibling.string.split('/')[1].split('页')[0])
                except (AttributeError, IndexError, ValueError):
                    # Pager not in the expected shape: keep crawling and try
                    # again on the next page.
                    total_page = None

        for item in soup.find_all('tr', attrs={'class': row_class}):
            cells = item.find_all('td')
            # zip tolerates rows with fewer cells; missing columns stay empty.
            rows.append({name: cell.string for name, cell in zip(columns, cells)})

        if (page % checkpoint_every == 0) or (page == total_page):
            df_grade = pd.DataFrame(rows, columns=columns).drop_duplicates()
            df_grade.to_csv(f'SUSTC_Journals/本科院校专业录取分数线{year}_{page}.csv', index=False)
            rows = []
            del df_grade
            gc.collect()

        time.sleep(5)

    # Rows gathered after the last checkpoint used to be dropped when the
    # range ended between checkpoints; write them out as well.
    if rows:
        df_grade = pd.DataFrame(rows, columns=columns).drop_duplicates()
        df_grade.to_csv(f'SUSTC_Journals/本科院校专业录取分数线{year}_{last_page}.csv', index=False)

In [4]:
%%time
# One worker thread per year from 2009 through 2016, each running grab_grades.
threads = [
    threading.Thread(target=grab_grades, name=f'Thread{year}', args=(year,))
    for year in range(2009, 2017)
]

# Start the workers in year order ...
for worker in threads:
    worker.start()

# ... and block until every one of them has finished.
for worker in threads:
    worker.join()

http://college.gaokao.com/spepoint/y2009/p7401
http://college.gaokao.com/spepoint/y2010/p7401http://college.gaokao.com/spepoint/y2011/p7401

http://college.gaokao.com/spepoint/y2012/p7401
http://college.gaokao.com/spepoint/y2013/p7401
http://college.gaokao.com/spepoint/y2014/p7401
http://college.gaokao.com/spepoint/y2015/p7401
http://college.gaokao.com/spepoint/y2016/p7401
http://college.gaokao.com/spepoint/y2016/p7402
http://college.gaokao.com/spepoint/y2013/p7402
http://college.gaokao.com/spepoint/y2015/p7402
http://college.gaokao.com/spepoint/y2014/p7402
http://college.gaokao.com/spepoint/y2009/p7402
http://college.gaokao.com/spepoint/y2012/p7402
http://college.gaokao.com/spepoint/y2011/p7402
http://college.gaokao.com/spepoint/y2010/p7402


In [ ]:
# NOTE(review): no visible cell assigns a top-level `df_grade`; grab_grades only
# builds its data in local variables, so this presumably raises NameError
# unless the name was defined elsewhere.
df_grade

# 高考网地区批次线

In [ ]:
# Session wrapper and column names for the regional batch-line crawl.
# NOTE(review): this rebinds the `req` and `col_names` globals that
# grab_grades also reads.
req = Request(requests.Session())
col_names = ['Year', 'Region', 'Category', 'Batch Name', 'Min Grade']

In [ ]:
def grab_batch_grades(start_page=1, end_page=600):
    """Scrape the regional batch score lines and checkpoint them to CSV.

    Pages ``start_page`` .. ``end_page - 1`` of
    ``http://college.gaokao.com/areapoint/p{page}`` are fetched in order,
    pausing ten seconds after each page. The site's total page count is read
    from the first page that exposes it, and the crawl stops once that page
    has been passed. Collected rows are de-duplicated and written to
    ``SUSTC_Journals/地区批次录取分数线_{page}.csv`` every 50 pages, on the
    site's last page, and once more at the end for any remaining rows.

    Args:
        start_page: First page to fetch. The default is the previously
            hard-coded value.
        end_page: Exclusive upper bound of the page range. The default is the
            previously hard-coded value.

    Returns:
        None. The results are the CSV files.
    """
    import os

    # Fail fast on a missing output directory instead of after 50 pages.
    os.makedirs('SUSTC_Journals', exist_ok=True)

    # Snapshot the first five global column names for the whole crawl.
    columns = list(col_names[:5])
    # NOTE(review): 'sz*' means "s followed by any number of z", so it matches
    # every class containing an 's' - confirm this is the intended filter.
    row_class = re.compile('sz*')
    checkpoint_every = 50

    total_page = None  # unknown until a page shows the pager
    rows = []
    last_page = None

    for page in range(start_page, end_page):
        if total_page is not None and page > total_page:
            break
        url = f'http://college.gaokao.com/areapoint/p{page}'
        res = req.get(url, timeout=20)
        req.clear_cookie()
        soup = BeautifulSoup(res.text, 'html.parser')
        last_page = page

        if total_page is None:
            pager = soup.find(id='qx')
            if pager is not None:
                try:
                    # The text before #pagenum is split on '/' and on the
                    # page character; the number in between is the page count.
                    total_page = int(pager.find(id='pagenum').previous_sibling.string.split('/')[1].split('页')[0])
                except (AttributeError, IndexError, ValueError):
                    # Pager not in the expected shape: keep crawling and try
                    # again on the next page.
                    total_page = None

        for item in soup.find_all('tr', attrs={'class': row_class}):
            cells = item.find_all('td')
            # zip tolerates rows with fewer cells; missing columns stay empty.
            rows.append({name: cell.string for name, cell in zip(columns, cells)})

        if (page % checkpoint_every == 0) or (page == total_page):
            df_grade = pd.DataFrame(rows, columns=columns).drop_duplicates()
            df_grade.to_csv(f'SUSTC_Journals/地区批次录取分数线_{page}.csv', index=False)
            rows = []
            del df_grade
            gc.collect()

        time.sleep(10)

    # Rows gathered after the last checkpoint used to be dropped when the
    # range ended between checkpoints; write them out as well.
    if rows:
        df_grade = pd.DataFrame(rows, columns=columns).drop_duplicates()
        df_grade.to_csv(f'SUSTC_Journals/地区批次录取分数线_{last_page}.csv', index=False)

In [ ]:
%%time
# Run the regional batch-line crawl; its results are the CSV files it writes.
grab_batch_grades()