In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pyspark.sql import SparkSession, functions as F

In [2]:
# Reuse the active Spark session if one exists; otherwise create one with default settings.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the preprocessed ticket dataset from Parquet (path is relative to the notebook).
df = spark.read.parquet('../../数据/ticket_preprocessed.parquet')
# Print a preview of the first rows.
df.show()
# Total row count; shown as the cell result.
df.count()

+--------+-------------------+----------------+----------+--------+----------+------------------+----------+----------+----------+------------------+----------+----------+--------+
|班次代码|           发车时间|      乘车站名称|到达站名称|座位类型|乘车站省份|乘车站行政地理分区|乘车站经度|乘车站纬度|到达站省份|到达站行政地理分区|到达站经度|到达站纬度|    距离|
+--------+-------------------+----------------+----------+--------+----------+------------------+----------+----------+----------+------------------+----------+----------+--------+
|  KS1057|2020-05-07 09:00:00|苏州北广场汽车站|  常熟南站|       1|    江苏省|              华东|120.608475| 31.330946|    江苏省|              华东| 120.74239| 31.628862|  35.478|
|  KS3197|2020-05-07 10:50:00|        苏州南站|  常熟南站|       1|    江苏省|              华东|120.638145|  31.27728|    江苏省|              华东| 120.74239| 31.628862|  40.325|
|  GT1001|2020-05-07 17:40:00|          沙溪站|    太仓站|       1|    湖南省|              华中|109.902885| 26.756468|    江苏省|              华东| 121.19665| 31.510124| 1216.72|
|  GT1001|2020-05-07 17:40:00|          沙溪

38448537

## 计算所有时段的人口流动

In [4]:
def get_provincial_flows(df):
    """Count inflow, outflow and internal trips per province.

    Parameters
    ----------
    df : Spark DataFrame
        Must expose the columns ``乘车站省份`` (boarding-station province) and
        ``到达站省份`` (arrival-station province). Every row is counted once.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``省份`` with three integer columns:
        ``流入人次`` (rows arriving from another province),
        ``流出人次`` (rows departing to another province) and
        ``内部流动人次`` (rows that start and end in the same province).
        A province missing from one of the three counts gets 0 there.
    """
    flow_columns = ['流入人次', '流出人次', '内部流动人次']

    # Rows whose boarding and arrival provinces differ; built once and
    # reused for both the inflow and the outflow count.
    cross_province = df[df.乘车站省份 != df.到达站省份]

    inflow = cross_province.groupBy('到达站省份').count().toPandas()
    inflow.columns = ['省份', '流入人次']

    outflow = cross_province.groupBy('乘车站省份').count().toPandas()
    outflow.columns = ['省份', '流出人次']

    internal_flow = df[df.乘车站省份 == df.到达站省份].groupBy('乘车站省份').count().toPandas()
    internal_flow.columns = ['省份', '内部流动人次']

    provinces = (
        inflow
        .merge(outflow, how='outer', on='省份')
        .merge(internal_flow, how='outer', on='省份')
        .fillna(0)
    )
    # The outer merges introduce NaN (hence float columns) wherever a province
    # is absent from one side, so all three count columns are cast back to
    # int -- including the inflow column.
    provinces[flow_columns] = provinces[flow_columns].astype(int)
    provinces.set_index('省份', inplace=True)

    return provinces

In [5]:
# Region name -> province-level divisions belonging to that region.
sections = {
    '华北': ['北京市', '天津市', '河北省', '山西省', '内蒙古自治区'],
    '东北': ['黑龙江省', '吉林省', '辽宁省'],
    '华东': ['上海市', '江苏省', '浙江省', '安徽省', '江西省', '山东省', '福建省', '台湾省'],
    '华中': ['河南省', '湖北省', '湖南省'],
    '华南': ['广东省', '广西壮族自治区', '海南省', '香港特别行政区', '澳门特别行政区'],
    '西南': ['重庆市', '四川省', '贵州省', '云南省', '西藏自治区'],
    '西北': ['陕西省', '甘肃省', '青海省', '宁夏回族自治区', '新疆维吾尔自治区']
}

# Reverse lookup: province-level division -> region name.
inv_sections = dict(
    (member, region)
    for region, members in sections.items()
    for member in members
)

In [6]:
# Download the maigoo.com page that the province indicators are scraped from.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get('https://www.maigoo.com/news/480543.html', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')
# Every `.md_td` element that directly follows another `.md_td`, minus the
# first four matches (presumably the table header -- TODO confirm).
provinces_info = soup.select('.md_td+ .md_td')[4:]

In [7]:
# Download the Chinese Wikipedia list of population by province-level division.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get('https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD%E5%90%84%E7%9C%81%E7%BA%A7%E8%A1%8C%E6%94%BF%E5%8C%BA%E4%BA%BA%E5%8F%A3%E5%88%97%E8%A1%A8', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')
# Rows of the second <tbody> on the page, minus the first three rows
# (presumably header/summary rows -- TODO confirm against the live page).
provinces_population = soup.find_all('tbody')[1].find_all('tr')[3:]

In [8]:
# Province name -> [GDP, GDP growth, income], with population appended last.
infos = {}

# The scraped indicator cells are consumed in consecutive groups of four:
# name, GDP, GDP growth, income. Exactly 31 groups are read.
for index in range(31):
    start = 4 * index
    name, gdp, gdp_growth, income = (cell.text for cell in provinces_info[start:start + 4])
    # Replace the scraped name with the first full province name that contains it;
    # keep the scraped name unchanged when nothing matches.
    name = next((full_name for full_name in inv_sections if name in full_name), name)
    infos[name] = [float(value) for value in (gdp, gdp_growth, income)]

# Population table: the 2nd and 3rd <td> of each row hold the name and the population.
for row in provinces_population:
    name, population = (cell.text.strip() for cell in row.find_all('td')[1:3])
    infos[name].append(float(population))

In [9]:
def _get_info_field(province, position):
    """Return ``infos[province][position]``, or ``None`` when it is unavailable.

    ``None`` is returned both when ``province`` has no entry in ``infos`` and
    when its entry is too short to contain ``position`` (for example, an entry
    to which no population figure was appended).
    """
    try:
        return infos[province][position]
    except (KeyError, IndexError):
        return None


def get_gdp(province):
    """Return the GDP stored for ``province`` (field 0), or ``None`` if missing."""
    return _get_info_field(province, 0)


def get_gdp_growth(province):
    """Return the GDP growth stored for ``province`` (field 1), or ``None`` if missing."""
    return _get_info_field(province, 1)


def get_income(province):
    """Return the income figure stored for ``province`` (field 2), or ``None`` if missing."""
    return _get_info_field(province, 2)


def get_population(province):
    """Return the population stored for ``province`` (field 3), or ``None`` if missing."""
    return _get_info_field(province, 3)

In [10]:
# Province-level flows over the whole dataset, enriched with the region and
# the scraped indicators looked up by province name (the index).
provinces = get_provincial_flows(df)
provinces = provinces.assign(**{
    column: provinces.index.map(lookup)
    for column, lookup in (
        ('行政地理分区', lambda name: inv_sections[name]),
        ('GDP', get_gdp),
        ('GDP增速', get_gdp_growth),
        ('人均可支配收入', get_income),
        ('总人口', get_population),
    )
})
provinces

Unnamed: 0_level_0,流入人次,流出人次,内部流动人次,行政地理分区,GDP,GDP增速,人均可支配收入,总人口
省份,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
西藏自治区,18782,32626,645169,西南,1902.74,7.8,21744.0,364.8
北京市,75044,1384,2,华北,36102.6,1.2,69434.0,2189.3
辽宁省,291651,991011,3090902,东北,25115.0,0.6,32738.0,4259.1
浙江省,680803,791388,5920387,华东,64613.0,3.6,52397.0,6456.8
广西壮族自治区,216988,18221,66689,华南,22156.69,3.7,24562.0,5012.7
海南省,47080,18073,500617,华南,5532.39,3.5,27904.0,1008.1
重庆市,281509,12641,16265,西南,25002.79,3.9,30824.0,3205.4
香港特别行政区,19230,0,0,华南,,,,
河北省,204812,16447,85,华北,36206.9,3.9,27136.0,7461.0
福建省,197091,30554,118007,华东,43903.89,3.3,37202.0,4154.0


In [11]:
# Regional totals: sum every remaining column per region. GDP growth and
# income are excluded from the totals.
sections = (
    provinces
    .drop(columns=['GDP增速', '人均可支配收入'])
    .groupby('行政地理分区')
    .sum()
)
sections

Unnamed: 0_level_0,流入人次,流出人次,内部流动人次,GDP,总人口
行政地理分区,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
东北,1000666,1323181,3775803,51124.82,9851.4
华东,3215002,2850248,13738098,387437.57,42347.0
华中,623306,1109148,2191655,140222.02,22356.4
华北,705625,114070,295904,121404.96,16933.4
华南,648723,262055,1852621,138450.02,18622.1
西北,608104,657982,3746397,55922.61,10352.8
西南,834147,1318889,5212486,117852.79,20514.8


## 观察不同时段的数据缺失情况

In [12]:
# Aggregate expression: the distinct 'year-month' labels of the departure time.
# Months are not zero-padded (labels look like '2020-6').
get_valid_times = F.collect_set(
    F.concat_ws('-', F.year('发车时间'), F.month('发车时间'))
)
# One row per boarding province, holding the labels in which it has records.
valid_times = (
    df.groupBy('乘车站省份')
    .agg(get_valid_times.alias('有效时期'))
    .toPandas()
    .set_index('乘车站省份')
)
valid_times

Unnamed: 0_level_0,有效时期
乘车站省份,Unnamed: 1_level_1
西藏自治区,"[2020-6, 2020-1, 2020-2, 2020-5, 2020-3, 2020-..."
北京市,"[2020-6, 2020-2, 2020-5, 2020-3, 2021-3, 2020-..."
辽宁省,"[2021-5, 2020-6, 2020-1, 2020-2, 2021-2, 2020-..."
浙江省,"[2020-6, 2020-1, 2020-2, 2019-12, 2020-5, 2020..."
广西壮族自治区,"[2020-6, 2020-1, 2020-2, 2020-5, 2020-3, 2021-..."
海南省,"[2020-6, 2020-2, 2020-5, 2020-3, 2020-4, 2021-..."
重庆市,"[2020-6, 2020-2, 2020-5, 2020-3, 2021-3, 2020-..."
河北省,"[2020-6, 2020-2, 2020-5, 2020-3, 2021-3, 2020-..."
福建省,"[2019-9, 2020-1, 2020-2, 2019-12, 2020-6, 2020..."
湖南省,"[2021-5, 2020-6, 2020-1, 2020-2, 2021-2, 2020-..."


In [13]:
# For each 'year-month' label, count how many provinces have records in it.
times = {}
for province in valid_times.index:
    for time in valid_times.loc[province, '有效时期']:
        times[time] = times.get(time, 0) + 1
times

{'2020-6': 30,
 '2020-1': 17,
 '2020-2': 30,
 '2020-5': 30,
 '2020-3': 30,
 '2020-4': 30,
 '2021-3': 31,
 '2021-4': 31,
 '2021-5': 7,
 '2021-2': 2,
 '2021-1': 1,
 '2020-7': 5,
 '2019-12': 7,
 '2019-11': 3,
 '2019-9': 2,
 '2019-7': 1,
 '2020-8': 4,
 '2020-9': 4}

上面单元格的输出表示各时段有数据记录的省份个数。

2020年2～6月、2021年3～4月的数据比较全，绝大部分省份（30～31个）在这些时段有数据记录；2020年1月次之，有17个省份有数据记录；其余时段则缺失严重，只有个位数的省份在这些时段有数据记录。

接下来我们具体观察这些时段上各省份数据的缺失情况。

In [14]:
# Province x period presence matrix, restricted to periods covered by more
# than 15 provinces. The frame is created as boolean from the start rather
# than as an all-NaN object frame passed through `.fillna(False)`, which
# relies on implicit object-to-bool downcasting.
valid_times_map = pd.DataFrame(
    False,
    index=valid_times.index,
    columns=sorted(time for time, n in times.items() if n > 15),
)
for province in valid_times.index:
    for time in valid_times.loc[province, '有效时期']:
        # Periods filtered out above are not columns and are ignored.
        if time in valid_times_map.columns:
            valid_times_map.loc[province, time] = True
valid_times_map

Unnamed: 0_level_0,2020-1,2020-2,2020-3,2020-4,2020-5,2020-6,2021-3,2021-4
乘车站省份,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
西藏自治区,True,True,True,True,True,True,True,True
北京市,False,True,True,True,True,True,True,True
辽宁省,True,True,True,True,True,True,True,True
浙江省,True,True,True,True,True,True,True,True
广西壮族自治区,True,True,True,True,True,True,True,True
海南省,False,True,True,True,True,True,True,True
重庆市,False,True,True,True,True,True,True,True
河北省,False,True,True,True,True,True,True,True
福建省,True,True,True,True,True,True,True,True
湖南省,True,True,True,True,True,True,True,True


可以观察到，2020年1月的缺失省份还是比较多，且无规律，我们选择不用。此外，天津市在2020年2～6月无出发记录，在2021年3～4月有出发记录。

最终，我们选取2020年2～6月、2021年3～4月为主要分析时段。接下来我们将各时段的人口流动数据存储下来以便后续分析。

## 计算特定时段的人口流动

### 2020年2～6月

In [15]:
# Departures from 2020-02-01 (inclusive) up to 2020-07-01 (exclusive).
df_1 = df.where(
    (df['发车时间'] >= '2020-02-01') & (df['发车时间'] < '2020-07-01')
)

In [16]:
# Province-level flows for the df_1 period, enriched with the region and the
# scraped indicators looked up by province name, then saved to CSV.
provinces = get_provincial_flows(df_1)
provinces = provinces.assign(**{
    column: provinces.index.map(lookup)
    for column, lookup in (
        ('行政地理分区', lambda name: inv_sections[name]),
        ('GDP', get_gdp),
        ('GDP增速', get_gdp_growth),
        ('人均可支配收入', get_income),
        ('总人口', get_population),
    )
})
provinces.to_csv('../../数据/provincial_flows_1.csv')
provinces

Unnamed: 0_level_0,流入人次,流出人次,内部流动人次,行政地理分区,GDP,GDP增速,人均可支配收入,总人口
省份,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
西藏自治区,15543,29667,518929,西南,1902.74,7.8,21744.0,364.8
北京市,55507,1109,2,华北,36102.6,1.2,69434.0,2189.3
辽宁省,239272,773623,2207806,东北,25115.0,0.6,32738.0,4259.1
浙江省,488617,770677,5910976,华东,64613.0,3.6,52397.0,6456.8
广西壮族自治区,165728,12629,38914,华南,22156.69,3.7,24562.0,5012.7
海南省,42608,17886,500617,华南,5532.39,3.5,27904.0,1008.1
重庆市,224305,11493,16265,西南,25002.79,3.9,30824.0,3205.4
香港特别行政区,5435,0,0,华南,,,,
河北省,157311,16101,1,华北,36206.9,3.9,27136.0,7461.0
福建省,167044,30520,117765,华东,43903.89,3.3,37202.0,4154.0


In [17]:
# Regional totals for the df_1 period: sum every remaining column per region
# (GDP growth and income are excluded), then save to CSV.
sections = (
    provinces
    .drop(columns=['GDP增速', '人均可支配收入'])
    .groupby('行政地理分区')
    .sum()
)
sections.to_csv('../../数据/sectional_flows_1.csv')
sections

Unnamed: 0_level_0,流入人次,流出人次,内部流动人次,GDP,总人口
行政地理分区,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
东北,814438,1089444,2879432,51124.82,9851.4
华东,2450021,2336036,12794253,387437.57,42347.0
华中,469079,597393,916505,140222.02,22356.4
华北,541632,75668,174902,121404.96,16933.4
华南,470774,193816,1554371,138450.02,18622.1
西北,498612,557090,3156073,55922.61,10352.8
西南,654034,1049143,4107806,117852.79,20514.8


### 2021年3～4月

In [18]:
# Departures from 2021-03-01 (inclusive) up to 2021-05-01 (exclusive).
df_2 = df.where(
    (df['发车时间'] >= '2021-03-01') & (df['发车时间'] < '2021-05-01')
)

In [19]:
# Province-level flows for the df_2 period, enriched with the region and the
# scraped indicators looked up by province name, then saved to CSV.
provinces = get_provincial_flows(df_2)
provinces = provinces.assign(**{
    column: provinces.index.map(lookup)
    for column, lookup in (
        ('行政地理分区', lambda name: inv_sections[name]),
        ('GDP', get_gdp),
        ('GDP增速', get_gdp_growth),
        ('人均可支配收入', get_income),
        ('总人口', get_population),
    )
})
provinces.to_csv('../../数据/provincial_flows_2.csv')
provinces

Unnamed: 0_level_0,流入人次,流出人次,内部流动人次,行政地理分区,GDP,GDP增速,人均可支配收入,总人口
省份,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
西藏自治区,2969,2959,125087,西南,1902.74,7.8,21744.0,364.8
北京市,15428,275,0,华北,36102.6,1.2,69434.0,2189.3
辽宁省,47830,217355,882940,东北,25115.0,0.6,32738.0,4259.1
浙江省,118908,9661,8470,华东,64613.0,3.6,52397.0,6456.8
广西壮族自治区,49720,5591,27745,华南,22156.69,3.7,24562.0,5012.7
海南省,4469,187,0,华南,5532.39,3.5,27904.0,1008.1
重庆市,54268,1148,0,西南,25002.79,3.9,30824.0,3205.4
香港特别行政区,13795,0,0,华南,,,,
河北省,44869,346,84,华北,36206.9,3.9,27136.0,7461.0
福建省,25436,31,225,华东,43903.89,3.3,37202.0,4154.0


In [20]:
# Regional totals for the df_2 period: sum every remaining column per region
# (GDP growth and income are excluded), then save to CSV.
sections = (
    provinces
    .drop(columns=['GDP增速', '人均可支配收入'])
    .groupby('行政地理分区')
    .sum()
)
sections.to_csv('../../数据/sectional_flows_2.csv')
sections

Unnamed: 0_level_0,流入人次,流出人次,内部流动人次,GDP,总人口
行政地理分区,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
东北,164942,226314,896214,51124.82,9851.4
华东,469885,209820,690530,387437.57,42347.0
华中,130637,509505,1273802,140222.02,22356.4
华北,155375,1020,282,121404.96,16933.4
华南,175453,68238,298205,138450.02,18622.1
西北,103830,98436,589621,55922.61,10352.8
西南,169862,256651,1100297,117852.79,20514.8


In [21]:
# Stop the Spark session created at the top of the notebook.
spark.stop()