# 第五次课后练习 之一

**负责助教：朱轩宇**

<span style="color:red; font-weight:bold;">请将作业文件命名为 第五次课后练习+姓名+学号.ipynb, 例如 第五次课后练习+张三+1000000000.ipynb</span>

<span style="color:red; font-weight:bold;">在作业过程中觉得有心得或者自己拓展学习到有价值内容的，可以在文件名最后加一个#号。例如第五次课后练习+张三+1000000000+#.ipynb</span>

<span style="color:red; font-weight:bold;">本次课同时发布课后练习和选做题，提交时请注意区分提交通道</span>

# 第零部分 代码理解

请认真阅读代码，理解代码的功能，先写出预想的结果。运行并检验结果是否如预期。如果不如预期，请分析理解其中的原因

## **0.1** 多进程编程，进程池，进程间通讯
    阅读理解下面代码，观察四次运行的结果，解释出现这个结果的原因。

下面这段代码使用了multiprocessing模块来实现多进程任务处理。主要功能是通过多个工作进程（CustomWorker）从任务队列中获取任务，处理任务后将结果放入结果队列。主进程负责管理任务队列、结果队列以及进程池的创建和销毁。

In [31]:
%%writefile multiprocessing_script.py
import multiprocessing
import time

#`Manager`类是Python `multiprocessing`模块中的一个工具，用于创建可以在多个进程之间
#共享的对象（如列表、字典、队列等），简化了进程间数据共享和通信的复杂性。
from multiprocessing import Pool, Manager

# Encapsulates one worker's logic: pull tasks from a queue, process them and
# push the outcome to a result queue until a 'TERMINATE' sentinel arrives.
class CustomWorker:
    """Queue-driven worker that either "encrypts" or hashes each task.

    Args:
        worker_id: Identifier stored on every result this worker emits.
        task_queue: Object with a blocking ``get()`` returning tasks.
        result_queue: Object with ``put()`` that receives result dicts.
        config: Mapping with ``'key'`` (secret appended to tasks) and
            ``'mode'`` (``'encrypt'`` or ``'hash'``).
    """

    def __init__(self, worker_id, task_queue, result_queue, config):
        self.worker_id = worker_id
        self.task_queue = task_queue
        self.result_queue = result_queue
        # Double-underscore names are name-mangled to _CustomWorker__*.
        self.__secret_key = config['key']
        self.__mode = config['mode']

    def run(self):
        """Process tasks until the ``'TERMINATE'`` sentinel is received.

        Successful tasks produce ``{'worker', 'result', 'task'}``; any
        exception produces ``{'worker', 'error', 'task'}`` instead and the
        loop keeps going.
        """
        while True:
            # Reset per iteration so the error report below never refers to
            # an unbound name (or a stale task) when get() itself fails.
            task = None
            try:
                task = self.task_queue.get()
                if task == 'TERMINATE':
                    break
                result = self.__process(task)
                self.result_queue.put({'worker': self.worker_id, 'result': result, 'task': task})
            except Exception as e:
                self.result_queue.put({'worker': self.worker_id, 'error': str(e), 'task': task})

    # Applies a different operation to the task depending on this worker's mode.
    def __process(self, task):
        """Return the processed task.

        Raises:
            ValueError: If the configured mode is neither 'encrypt' nor 'hash'.
        """
        if self.__mode == 'encrypt':
            return f"{task}_{self.__secret_key}"   # append the secret key
        elif self.__mode == 'hash':
            return hash(task + self.__secret_key)  # built-in hash of task + key
        else:
            raise ValueError("Invalid mode")

# Builds a CustomWorker for the current process and runs it.
# The configuration depends on whether the process ID is even or odd.
def worker_process(task_queue, result_queue):
    """Run a CustomWorker bound to the given queues in the current process.

    The worker ID is the first element of the current process's
    ``_identity`` tuple (a private multiprocessing attribute). Even IDs use
    'encrypt' mode, odd IDs use 'hash' mode; the key is ``KEY<id>``.
    """
    ident = multiprocessing.current_process()._identity[0]
    if ident % 2 == 0:
        mode = 'encrypt'
    else:
        mode = 'hash'
    settings = {'key': f"KEY{ident}", 'mode': mode}
    CustomWorker(ident, task_queue, result_queue, settings).run()

def main():
    """Fill the task queue, start the pool, collect results and print them.

    ``worker_process`` is passed as the pool *initializer*, so every pool
    process runs it on start-up and consumes tasks until it reads one
    'TERMINATE' sentinel.
    """
    pool_size = 2        # number of pool processes
    expected_tasks = 5   # number of tasks to process

    manager = Manager()
    task_queue = manager.Queue()    # tasks to be processed
    result_queue = manager.Queue()  # outcomes reported by the workers

    # Enqueue all the work first ...
    for index in range(expected_tasks):
        task_queue.put(f"task_{index}")

    # ... followed by one termination sentinel per pool process (each worker
    # consumes exactly one). A multiprocessing.Event could be used instead.
    for _ in range(pool_size):
        task_queue.put("TERMINATE")

    # Create and start the process pool.
    pool = Pool(
        processes=pool_size,
        initializer=worker_process,
        initargs=(task_queue, result_queue),
    )

    # Collect exactly one outcome per task; errors are printed, not kept.
    results = []
    for _ in range(expected_tasks):
        outcome = result_queue.get()
        if 'error' in outcome:
            print(f"Error from worker {outcome['worker']} processing {outcome['task']}: {outcome['error']}")
            continue
        results.append(outcome)

    # Wait for all pool processes to finish.
    pool.close()
    pool.join()

    print(f"Final results: {results}")

if __name__ == '__main__':
    main()

Overwriting multiprocessing_script.py


In [32]:
import subprocess
import sys

# Run the script four times, each time in a fresh interpreter, and show what
# it wrote to stdout and stderr. sys.executable is used instead of "python"
# so the child runs under the same interpreter as this notebook.
for i in range(4):
    print(f"Running subprocess {i}")
    result = subprocess.run(
        [sys.executable, "multiprocessing_script.py"],
        capture_output=True, text=True)
    print("STDOUT:", result.stdout)  # task processing results
    print(" ")
    print("STDERR:", result.stderr)  # any error output
    print(" ")

Running subprocess 0
STDOUT: Final results: [{'worker': 3, 'result': 358975780942431332, 'task': 'task_0'}, {'worker': 3, 'result': 187842445111551740, 'task': 'task_2'}, {'worker': 2, 'result': 'task_1_KEY2', 'task': 'task_1'}, {'worker': 3, 'result': 8371617324782176147, 'task': 'task_3'}, {'worker': 2, 'result': 'task_4_KEY2', 'task': 'task_4'}]

 
STDERR: 
 
Running subprocess 1
STDOUT: Final results: [{'worker': 3, 'result': 3337686437969000200, 'task': 'task_0'}, {'worker': 2, 'result': 'task_1_KEY2', 'task': 'task_1'}, {'worker': 3, 'result': -2690394115718592284, 'task': 'task_2'}, {'worker': 2, 'result': 'task_3_KEY2', 'task': 'task_3'}, {'worker': 3, 'result': 3201207304949967064, 'task': 'task_4'}]

 
STDERR: 
 
Running subprocess 2
STDOUT: Final results: [{'worker': 3, 'result': 5509128965142684649, 'task': 'task_1'}, {'worker': 2, 'result': 'task_0_KEY2', 'task': 'task_0'}, {'worker': 3, 'result': 3581438424763580629, 'task': 'task_2'}, {'worker': 2, 'result': 'task_3_KEY2

首先multiprocessing_script.py程序不会报错error，所以result.stderr始终为“”。multiprocessing_script.py自身就是2个进程的，处理5个任务。偶数id的进程使用 encrypt 模式，拼接任务字符串与密钥。奇数id的进程使用 hash 模式，对任务字符串和密钥进行哈希操作。

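每次运行脚本时，都会创建一个新的 Python 解释器实例，并独立地执行 multiprocessing_script.py。由于每次运行都会创建新的 task_queue 和 result_queue，并且有独立的进程池（进程之间不共享内存），因此每次运行的结果都是独立的。又由于multiprocessing_script.py中的两个工作进程并行地从同一个任务队列取任务，所以任务的分配和结果的输出顺序不稳定；而hash值不同是因为Python默认对字符串启用哈希随机化（每次启动解释器使用不同的随机种子，可用环境变量PYTHONHASHSEED固定），同一字符串在不同运行中的hash()结果不同。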

ID 1 被 Manager() 启动的管理器服务进程占用（它是主进程创建的第一个子进程），所以任务分别由 id 为 2、3 的进程处理。

不稳定的原因：偶数进程有开销小的调度优势（hash很慢），在大部分情况抢占所有任务，奇数就没有要消费的任务

## **0.2** 协程
    阅读理解下面代码，观察运行的结果，解释结果的原因

In [24]:
import time
 
# A consumer identified by `name`.
# Because the body contains `yield`, calling it returns a generator.
def consumer(name):
    """Coroutine that announces itself, then reports every value sent to it.

    Each value received through ``send`` is printed incremented by one.
    """
    print(f'{name}  准备吃包子啦！,呼吁店小二')
    while True:
        received = yield  # suspended here until the next send()
        number = received + 1
        print(f'包子 {number} 来了,被 {name} 吃了！')
 
# A producer (the shop) identified by `name`, serving two customers c1 and c2.
def producer(name, c1, c2, rounds=3, delay=1):
    """Prime both consumer generators, then send them one item per round.

    Args:
        name: Name printed when production starts.
        c1: First consumer generator (not yet started).
        c2: Second consumer generator (not yet started).
        rounds: Number of items to produce; defaults to the original 3.
        delay: Seconds to sleep before each item; defaults to the original 1.
    """
    next(c1)  # advance c1 to its first yield so it can accept send()
    next(c2)  # same for c2
    print(f'{name} 开始准备做包子啦！')
    for i in range(rounds):
        time.sleep(delay)
        print(f'做了第{i+1}包子，分成两半,你们一人一半')
        # Each customer receives the same index, c1 first.
        for customer in (c1, c2):
            customer.send(i)
        print('------------------------------------')
 
c1=consumer('张三') #把函数变成一个生成器
c2=consumer('李四')
producer('店小二',c1,c2)

张三  准备吃包子啦！,呼吁店小二
李四  准备吃包子啦！,呼吁店小二
店小二 开始准备做包子啦！
做了第1包子，分成两半,你们一人一半
包子 1 来了,被 张三 吃了！
包子 1 来了,被 李四 吃了！
------------------------------------
做了第2包子，分成两半,你们一人一半
包子 2 来了,被 张三 吃了！
包子 2 来了,被 李四 吃了！
------------------------------------
做了第3包子，分成两半,你们一人一半
包子 3 来了,被 张三 吃了！
包子 3 来了,被 李四 吃了！
------------------------------------


#### 结果：
```python
张三  准备吃包子啦！,呼吁店小二

李四  准备吃包子啦！,呼吁店小二

店小二 开始准备做包子啦！

（1s）

做了第1包子，分成两半,你们一人一半

包子 1 来了,被 张三 吃了！

包子 1 来了,被 李四 吃了！

------------------------------------

（1s）

做了第2包子，分成两半,你们一人一半

包子 2 来了,被 张三 吃了！

包子 2 来了,被 李四 吃了！

------------------------------------

（1s）

做了第3包子，分成两半,你们一人一半

包子 3 来了,被 张三 吃了！

包子 3 来了,被 李四 吃了！

------------------------------------
```

分析：调用producer后，首先执行两个next，使生成器consumer('张三')， consumer('李四')先运行到第一个yield处，故先输出两行“准备吃包子啦！,呼吁店小二”并等待在producer里接着send，然后producer中继续执行，输出“店小二 开始准备做包子啦！”后在循环中"输出做了第i+1包子，分成两半,你们一人一半",并send(i)到这两个consumer的yield处，此时consumer内部baozi被赋值i，输出“包子 i+1 来了,被 张三/李四 吃了！”，并接着执行到下一次yield处等待send。循环三次得到如上结果。

## **0.3** 正则表达式、网络编程和任务调度

    当运行此程序时，最终会输出哪些任务执行信息？说明具体输出顺序和原因

    解释正则表达式^(\d+):(.+)$的作用及其在代码中的具体应用场景

In [None]:
import re
import socket
import threading
from datetime import datetime, timedelta
import time

'''
TaskScheduler 类
功能：实现一个简单的任务调度器，支持添加延迟任务并按时执行。

关键点：
使用线程锁 (threading.Lock) 保证任务列表的线程安全。
使用 datetime 和 timedelta 计算任务的执行时间。
使用后台线程 (daemon=True) 持续检查并执行到期的任务。
'''
           
class TaskScheduler:
    """Minimal delayed-task scheduler polled by a background thread.

    Tasks are stored as ``(execute_time, command)`` tuples in ``self.tasks``;
    every access to that list happens while holding ``self.lock``.
    """

    def __init__(self):
        self.lock = threading.Lock()  # guards self.tasks
        self.tasks = []               # pending (execute_time, command) tuples

    def add_task(self, delay, command):
        """Schedule ``command`` to run ``delay`` seconds from now."""
        with self.lock:
            due_at = datetime.now() + timedelta(seconds=delay)
            self.tasks.append((due_at, command))
            print(f"Scheduled: {command} at {due_at.strftime('%H:%M:%S')}")

    def start(self):
        """Start the polling loop in a daemon thread."""
        runner = threading.Thread(target=self._run, daemon=True)
        runner.start()

    def _run(self):
        """Forever: pull out due tasks, "execute" them, then sleep 0.5 s."""
        while True:
            now = datetime.now()
            due, pending = [], []
            with self.lock:
                # Split the list in one pass, keeping insertion order in both halves.
                for entry in self.tasks:
                    bucket = due if entry[0] <= now else pending
                    bucket.append(entry)
                self.tasks = pending

            # The printed time is the poll time `now`, not the moment each
            # task actually runs.
            stamp = now.strftime('%H:%M:%S')
            for _, command in due:
                print(f"Executing: {command} at {stamp}")
                time.sleep(1)  # simulated task duration

            time.sleep(0.5)  # polling interval

            
'''
handle_client 函数
功能：处理客户端连接，解析客户端发送的命令并添加到任务调度器中。

关键点：
使用正则表达式 (re.compile) 解析客户端发送的命令。
具体命令格式为 ? 
将解析后的任务添加到任务调度器中。
'''
def handle_client(conn, scheduler):
    """Read commands from one client connection and schedule them.

    Each received chunk is stripped and matched against ``<digits>:<text>``;
    on a match the digits become the delay in seconds and the text becomes
    the command passed to ``scheduler.add_task``. Anything else is reported
    as invalid. The connection is always closed when the loop ends.

    Args:
        conn: Connected socket to read from.
        scheduler: Object exposing ``add_task(delay, command)``.
    """
    # ^(\d+) : leading integer (delay), then ':', then (.+)$ the rest (command).
    pattern = re.compile(r'^(\d+):(.+)$')
    try:
        while True:
            data = conn.recv(1024).decode()
            if not data:  # empty read: the peer closed the connection
                break
            match = pattern.match(data.strip())
            if match:
                delay = int(match.group(1))
                cmd = match.group(2)
                scheduler.add_task(delay, cmd)
            else:
                print(f"Invalid command: {data}")
    finally:
        # Release the socket even if recv/decode/add_task raises.
        conn.close()

'''
server 函数
功能：启动服务器，监听客户端连接并为每个客户端创建一个线程处理请求。

关键点：
使用 socket 创建 TCP 服务器。
为每个客户端连接创建一个新线程，调用 handle_client 处理客户端请求。
'''
def server():
    """Start a scheduler and a TCP server that feeds it client commands.

    Listens on localhost:65432 and hands every accepted connection to
    ``handle_client`` in its own thread. Never returns.
    """
    scheduler = TaskScheduler()
    scheduler.start()

    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with listener:
        listener.bind(('localhost', 65432))
        listener.listen()
        print("Server started")
        while True:
            conn, addr = listener.accept()
            print(f"Connected by {addr}")
            # One handler thread per client connection.
            handler = threading.Thread(target=handle_client, args=(conn, scheduler))
            handler.start()

'''
4. 主程序
功能：启动服务器并模拟客户端发送任务。

关键点：
使用 threading.Thread 启动服务器线程。
模拟客户端发送任务到服务器。
'''
if __name__ == "__main__":
    threading.Thread(target=server, daemon=True).start()  # run the server in a daemon thread
    time.sleep(1)  # give the server time to start listening
    
    # Simulated client: connects, sends one message, and closes the socket.
    def client(msg):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:  # client socket
            s.connect(('localhost', 65432))  # connect to the server
            s.sendall(msg.encode())  # send the task message
    
    client("3:Task1")  # Task1 with a 3-second delay
    time.sleep(1)
    client("2:Task2")  # Task2 with a 2-second delay
    client("1:Task3")  # Task3 with a 1-second delay
    
    time.sleep(10)  # keep the main thread alive while the daemon threads work

Server started
Connected by ('127.0.0.1', 59923)
Scheduled: Task1 at 14:29:29
Connected by ('127.0.0.1', 59924)
Scheduled: Task2 at 14:29:29
Connected by ('127.0.0.1', 59925)
Scheduled: Task3 at 14:29:28
Executing: Task3 at 14:29:28
Executing: Task1 at 14:29:29
Executing: Task2 at 14:29:29


输出：

Server started

Connected by ('127.0.0.1', 59923)

Scheduled: Task1 at 14:29:29

Connected by ('127.0.0.1', 59924)

Scheduled: Task2 at 14:29:29

Connected by ('127.0.0.1', 59925)

Scheduled: Task3 at 14:29:28

Executing: Task3 at 14:29:28

Executing: Task1 at 14:29:29

Executing: Task2 at 14:29:29


创建服务器server线程并启动，并启动了任务调度器scheduler，然后服务器绑定到本地地址和端口并监听。并启动服务器，监听客户端。而在main中，3个客户端连接到服务器（输出Connected）并发送任务消息："3:Task1"，"2:Task2"，"1:Task3"，此时服务器为每个客户端创建一个线程handle_client并处理任务。然后
```python
pattern = re.compile(r'^(\d+):(.+)$')
```
这行的意思是限定开头（^）到结尾（$），分成两组，第一个为匹配的整数(":"前面只有一个整数)，然后匹配":"，第二个匹配:后面的内容（即“Task{i}”）,这里目的是在后面的match中找出任务的延长时间delay和任务名称cmd。然后在任务调度器中加入任务（此时时间+延长时间后放入任务列表）。故已启动的任务调度器scheduler会在持续运行的run函数中（每过0.5s就检查一次），对已到期的任务会具体执行（每次任务花费1s）。

假设初始时间为0。由于Task1先放入但延迟3s执行（第3s才执行），故先scheduled task1，然后等待1s后放入Task2和Task3（第1s输出），再输出scheduled task2, task3。而Task2延迟2s运行（第3s才执行），Task3延迟1s运行（第2s才执行）,故先execute task3，然后执行这个任务1s后到第3s，这1s依次execute task1, task2（因为FIFO先到先得策略）。

优先调度最早到期的策略，到期时间基本相同(<0.5s)则FIFO先到先得策略。

## **0.4** 网络爬虫

    阅读下面代码，解释输出

In [27]:
import requests                # 用于发送HTTP请求
from bs4 import BeautifulSoup  # 用于解析HTML文档
import re                      # 用于正则表达式操作
import threading               # 用于多线程编程

# 定义需要爬取的URL列表
urls = ["https://www.pku.edu.cn/", "https://its.pku.edu.cn/", "https://eecs.pku.edu.cn/"]

# Crawl a single URL.
def crawl(url, timeout=10):
    """Fetch ``url``, print and persist its cookies and whitespace-normalized text.

    Cookies are appended to ``cookies.txt`` and the page text to ``text.txt``
    (both written as UTF-8 so non-ASCII pages do not depend on the platform's
    default encoding).

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled server would block the calling thread forever.
    """
    print(f"Crawling {url}")
    response = requests.get(url, timeout=timeout)
    response.encoding = response.apparent_encoding  # decode using the detected encoding
    html = response.text

    soup = BeautifulSoup(html, 'html.parser')

    cookies = response.cookies  # cookies returned by the server
    print(cookies)

    # Append "<url>: <cookies>" to the cookie log.
    with open('cookies.txt', 'a', encoding='utf-8') as f:
        f.write(url + ': ' + str(cookies) + '\n')

    # Collapse every run of whitespace (spaces, tabs, newlines) into one space.
    text = re.sub(r'\s+', ' ', soup.get_text())
    print(text)

    # Append the extracted text to the text log.
    with open('text.txt', 'a', encoding='utf-8') as f:
        f.write(text + '\n')

# One thread per URL, each running crawl() on its own address.
threads = [threading.Thread(target=crawl, args=(target,)) for target in urls]

# Launch every thread ...
for crawler in threads:
    crawler.start()

# ... then block until all of them have finished.
for crawler in threads:
    crawler.join()


Crawling https://www.pku.edu.cn/Crawling https://its.pku.edu.cn/

Crawling https://eecs.pku.edu.cn/
<RequestsCookieJar[<Cookie JSESSIONID=ADE5B3F0CFF600D6DF4325BFDA31D372 for its.pku.edu.cn/>]>
<RequestsCookieJar[]>
 北京大学网络服务 - 首页 CARSI 燕云直播 邮箱 门户 客户端 网费充值 CARSI 燕云直播 邮箱 门户 客户端 网费充值 北京大学网络服务 62751023 网络服务网络 信息服务信息 校园卡 高性能计算高性能 机房上机上机 联系我们 计算中心中心 查看IP 忘记密码 断开连接 选择断开 新学期快乐！ More > 新春快乐！ More > 新年快乐！ More > 第四届北京大学信息安全综合能力竞赛 More > 计算中心创新技术手段保障开学典礼网络服务，用心用情提升师生体验 More > More > 通知公告 03-07Gaussian 16、GaussView 6上线正版软件平台new 03-06校园智能体开发团队招募new 02-24关于校园无线网络设备维护的通知new 02-21关于畅春新园光纤改造的通知 新闻动态 03-11计算中心与教务部举行调研交流会 03-06党委宣传部与计算中心举行工作协调推进会 03-01数智化赋能出版，助力教学科研创新——出版社与计算中心开展联学共建活动 12-132024“京华杯”信息安全综合能力竞赛闭幕式暨颁奖典礼举行 2024“京华杯”信息安全综合能力竞赛闭幕式暨颁奖典礼举行 2024年11月29日下午，2024年“京华杯”信息安全综合能力竞赛在北京大学哲学楼201报告厅举行闭幕式暨颁奖典礼。 更多新闻 网络故障报修 CARSI eduroam 桌面防病毒 客户端 漏洞盒子平台 国家信息安全漏洞库 补天漏洞响应平台 教育漏洞报告平台 断开全部连接 关闭 
 北京大学信息科学技术学院 X 北京大学 院内门户 English 旧版网站 | 导航 X 学院首页 新闻动态 讲座信息 毕业合影 学院概况 院长寄语 学院简介 相关委员会 机构设置 师资队伍 简介 基础实验教学研究中心 计算机学院

输出：

cookies.txt:
```txt
https://www.pku.edu.cn/: <RequestsCookieJar[]>

https://eecs.pku.edu.cn/: <RequestsCookieJar[]>

https://its.pku.edu.cn/: <RequestsCookieJar[<Cookie JSESSIONID=0517C73EC9920A46CC66E1AF00520E0E for its.pku.edu.cn/>]>
```
text.txt：
很长不复制了。

分析：

爬虫多线程地爬取"https://www.pku.edu.cn/", "https://its.pku.edu.cn/", "https://eecs.pku.edu.cn/"这三个url，每个线程发送HTTP GET请求后获取网页的HTML内容，再用BeautifulSoup解析并赋值给soup。

首先打印服务器的cookie（response.cookies）,并在
```python
with open('cookies.txt', 'a') as f
```
中把这些cookie追加地写入文件cookies.txt（即如果文件已经存在，文件指针将会放在文件的结尾，新的内容将会被写入到已有内容之后，故依次输出url（顺序不稳定，因为这是多线程的并行））。然后
```python
text = re.sub(r'\s+', ' ', soup.get_text())
```
把soup的文本内容中的连续的 空格或换行或制表符等空白字符 替换成一个空格（使输出结果更有可读性），并同理输出并追加地写入文件text.txt。最后等待所有线程结束。

（注：如果不加锁的话，可能出现一个线程的部分数据与另一个线程的部分数据交错写入文件。只是这里write的内容短所以没有出现。）

print并不是原子操作：它会把内容和结尾的换行符分两次写入标准输出，所以多线程下的输出可能交织（例如上面两条“Crawling ...”被打印在了同一行）。

# 第一部分 基础练习

## **1.1** 进程的创建

In [4]:
%%writefile multiprocessing_script_task1_1.py
# Imports
import multiprocessing
import time


def _announce(label, interval):
    """Print a start marker, sleep ``interval`` seconds, print an end marker."""
    print(f'执行{label}')
    time.sleep(interval)
    print(f'end {label}')


# Functions executed by the child processes
def work1(interval):
    """First job: report start/end around a sleep of ``interval`` seconds."""
    _announce('work1', interval)


def work2(interval):
    """Second job: report start/end around a sleep of ``interval`` seconds."""
    _announce('work2', interval)


if __name__ == "__main__":
    print('执行主进程')
    # Create the process objects: work1 sleeps 3 s, work2 sleeps 1 s.
    p1 = multiprocessing.Process(target=work1, args=(3,))
    p2 = multiprocessing.Process(target=work2, args=(1,))
    # Start both, then wait for both to finish.
    for proc in (p1, p2):
        proc.start()
    for proc in (p1, p2):
        proc.join()
    print('主进程结束')

Overwriting multiprocessing_script_task1_1.py


In [5]:
import subprocess
import sys

# Run the generated script with the current interpreter (rather than whatever
# "python" resolves to on PATH) and check that every expected marker was printed.
result = subprocess.run(
    [sys.executable, "multiprocessing_script_task1_1.py"],
    capture_output=True, text=True)
for marker in ("执行主进程", "执行work1", "执行work2",
               "end work1", "end work2", "主进程结束"):
    assert marker in result.stdout


## **1.2** 进程池

In [16]:
%%writefile multiprocessing_script_task1_2.py
import multiprocessing


def square(x):
    """Return ``x`` multiplied by itself."""
    return x * x


if __name__ == '__main__':
    # A pool of 4 processes; map() returns the results in input order.
    with multiprocessing.Pool(4) as pool:
        output = pool.map(square, range(5))
    print(output)  # expected: [0, 1, 4, 9, 16]

Overwriting multiprocessing_script_task1_2.py


In [17]:
import subprocess
import sys

# Run the generated script with the current interpreter (rather than whatever
# "python" resolves to on PATH) and compare its exact output.
result = subprocess.run(
    [sys.executable, "multiprocessing_script_task1_2.py"],
    capture_output=True, text=True)
assert result.stdout == "[0, 1, 4, 9, 16]\n"

## **1.3** 进程消息传递

In [22]:
%%writefile multiprocessing_script_task1_3.py
import multiprocessing

def consumer(q):
    """Print every item taken from ``q`` until the 'STOP' sentinel is read."""
    # iter(callable, sentinel) keeps calling q.get() until it returns 'STOP'.
    for item in iter(q.get, 'STOP'):
        print(f"消费: {item}")

def producer(q):
    """Put three products on ``q`` followed by the 'STOP' sentinel."""
    products = [f"产品{index}" for index in range(3)]
    for entry in products + ['STOP']:
        q.put(entry)

if __name__ == '__main__':
    # Shared queue connecting the producer process to the consumer process.
    q = multiprocessing.Queue()
    workers = [
        multiprocessing.Process(target=role, args=(q,))
        for role in (producer, consumer)
    ]
    # Start the producer first, then the consumer; wait for both in that order.
    for proc in workers:
        proc.start()
    for proc in workers:
        proc.join()

Overwriting multiprocessing_script_task1_3.py


In [23]:
import subprocess
import sys

# Run the generated script with the current interpreter (rather than whatever
# "python" resolves to on PATH) and check that all three products were consumed.
result = subprocess.run(
    [sys.executable, "multiprocessing_script_task1_3.py"],
    capture_output=True, text=True)
for index in range(3):
    assert f"消费: 产品{index}" in result.stdout


## **1.4** 线程的创建和传参

创建两个线程，分别计算`x`, `y`的和与积，存入`result`中

In [25]:
import threading

x, y = 3, 4

# Shared dictionary the two worker threads write their answers into.
result = {}


def worker1(x, y):
    """Store the sum of ``x`` and ``y`` under the 'add' key."""
    result.update(add=x + y)


def worker2(x, y):
    """Store the product of ``x`` and ``y`` under the 'multiply' key."""
    result.update(multiply=x * y)


# Both threads receive the same operands.
operands = (x, y)
t1 = threading.Thread(target=worker1, args=operands)
t2 = threading.Thread(target=worker2, args=operands)
for thread in (t1, t2):
    thread.start()
for thread in (t1, t2):
    thread.join()

assert result == {'add': 7, 'multiply': 12}

## **1.5** 线程锁

在以下代码用`#######`包含的部分中，在合适的位置添加和应用线程锁，保证`counter`线程安全

In [26]:
import threading
import time
import random

counter = []
lock = threading.Lock()  # 创建锁对象

##########################################
# TODO: add and apply a lock
def worker():
    """Append 100 entries to the shared ``counter`` list, raising each from 1 to 2."""
    # The lock is held for the whole loop, so no other thread can append
    # between this thread's append and its ``counter[-1] += 1``.
    with lock:
        for _ in range(100):
            counter.append(1)
            time.sleep(random.random() * 0.01)  # random pause of under 10 ms
            counter[-1] += 1  # bump the last element of the list
##########################################

threads = [threading.Thread(target=worker) for _ in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()

assert counter == [2] * 1000

## **1.6** 生成器与yield

In [27]:
def coroutine_example():
    """Yield 0, 1 and 5 in turn, printing the value received after each of the first two.

    The first value sent in must be 3 (checked with an assertion).
    """
    first = yield 0
    print(f'Received value: {first}')
    assert first == 3
    second = yield 1
    print(f'Received value: {second}')
    # Final yield: hands 5 back to the caller of the second send().
    yield 5


c = coroutine_example()
next(c)    # run to the first yield
c.send(3)  # send 3; the generator advances to `yield 1`
assert c.send(4) == 5

Received value: 3
Received value: 4


## **1.7** 基于协程求平均数

In [28]:
def average():
    """Coroutine yielding the running mean of all numbers sent so far.

    The first ``next()`` yields ``None``; every ``send(number)`` afterwards
    returns the mean including that number.
    """
    running_sum = 0.0  # sum of the received numbers
    seen = 0           # how many numbers were received
    current = None     # latest mean, None before any number arrives
    while True:
        number = yield current
        seen += 1
        running_sum += number
        current = running_sum / seen
 
# Drives a running-average generator by sending it numbers.
def sender(generator):
    """Prime ``generator``, then send 10, 20, 30, 40 and check each reply."""
    print(next(generator))  # start the generator and show its first yield
    checks = ((10, 10), (20, 15), (30, 20), (40, 25))
    for value, expected in checks:
        assert generator.send(value) == expected
 
 
g = average()
sender(g)

None


## **1.8** 正则表达式

In [29]:
import re
text = "My phone number is 13345678901 and 15592883746"
# Mobile number: exactly 11 digits, starting with 1 and a second digit 3-9.
# The lookarounds reject 11-digit runs that are part of a longer digit string.
pattern = r'(?<!\d)1[3-9]\d{9}(?!\d)'
matches = re.findall(pattern, text)
assert matches == ['13345678901', '15592883746']

## **1.9** 网络编程

In [43]:
%%writefile socket_server.py
# Server side
import socket

# Create the server socket, bind it to localhost:9999 and listen with a
# backlog of 5 pending connections.
server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host, port = 'localhost', 9999
server_socket.bind((host, port))
server_socket.listen(5)

print("服务器已启动，等待连接...")

while True:
    # Accept the next client connection.
    client_socket, addr = server_socket.accept()
    # The with-block closes the connection even if sending fails.
    with client_socket:
        print(f"接收到来自 {addr} 的连接")

        # Send the welcome message; sendall() keeps writing until every byte
        # is sent, whereas send() may transmit only part of the buffer.
        message = "欢迎！"
        client_socket.sendall(message.encode('utf-8'))

Overwriting socket_server.py


In [44]:
%%writefile socket_client.py
# Client side
import socket

# Create the client socket; the with-block closes it even if connecting or
# receiving raises.
with socket.socket() as client_socket:
    # Connect to the server.
    client_socket.connect(('localhost', 9999))

    # Receive the welcome message (up to 1024 bytes) and show it.
    message = client_socket.recv(1024).decode('utf-8')
    print(message)

Overwriting socket_client.py


In [45]:
import subprocess
import sys
import time

# Start the server with the current interpreter, capturing its output.
server = subprocess.Popen(
    [sys.executable, "socket_server.py"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True
)

try:
    # Give the server some time to start listening.
    time.sleep(1)

    # Run the client and capture what it prints.
    client = subprocess.run(
        [sys.executable, "socket_client.py"],
        capture_output=True,
        text=True
    )
finally:
    # Always stop the server, even if the client run raised; communicate()
    # reaps the process and closes its pipes.
    server.kill()
    server.communicate()

print(client.stdout)
assert "欢迎！" in client.stdout

欢迎！



# 第二部分 进阶练习

## **2.1** 正则表达式提取参考文献信息

    运用正则表达式从下面参考文献中提取作者列表，文章名称，发表时间。
    结果存为JSON格式文件。

## 参考文献

1. Agarwal, Nitin, Ravi Shankar Reddy, Kiran Gvr, and Carolyn Penstein Rosé. 2011. "Towards multi-document summarization of scientific articles: making interesting comparisons with SciSumm." In Proceedings of the Workshop on Automatic Summarization for Different Genres, Media, and Languages, pages 8-15, Portland, Oregon. Association for Computational Linguistics.

2. Beltagy, Iz, Kyle Lo, and Arman Cohan. 2019. "SciBERT: A pretrained language model for scientific text." In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 3615-3620, Hong Kong, China. Association for Computational Linguistics.

3. Beltagy, Iz, Matthew E Peters, and Arman Cohan. 2020. "Longformer: The long-document transformer." arXiv preprint arXiv:2004.05150.

4. Bornmann, Lutz, and Rüdiger Mutz. 2015. "Growth rates of modern science: A bibliometric analysis based on the number of publications and cited references." Journal of the Association for Information Science and Technology, 66(11): 2215-2222.

5. Brown, Tom, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. 2020. "Language models are few-shot learners." In Advances in Neural Information Processing Systems, volume 33, pages 1877-1901.

6. Cohan, Arman, Franck Dernoncourt, Doo Soon Kim, Trung Bui, Seokhwan Kim, Walter Chang, and Nazli Goharian. 2018. "A discourse-aware attention model for abstractive summarization of long documents." In Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers), pages 615-621, New Orleans, Louisiana. Association for Computational Linguistics.

7. Cohan, Arman, Guy Feigenblat, Dayne Freitag, Tirthankar Ghosal, Drahomira Herrmannova, Petr Knoth, Kyle Lo, Philipp Mayr, Michal Shmueli-Scheuer, Anita de Waard, and Lucy Lu Wang. 2022. "Overview of the third workshop on scholarly document processing." In Proceedings of the Third Workshop on Scholarly Document Processing, pages 1-6, Gyeongju, Republic of Korea. Association for Computational Linguistics.

8. Cohan, Arman, Sergey Feldman, Iz Beltagy, Doug Downey, and Daniel Weld. 2020. "SPECTER: Document-level representation learning using citation-informed transformers." In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 2270-2282, Online. Association for Computational Linguistics.

9. DeYoung, Jay, Iz Beltagy, Madeleine van Zuylen, Bailey Kuehl, and Lucy Lu Wang. 2021. "MS²: Multi-document summarization of medical studies." In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 7494-7513, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.

10. Erkan, Günes, and Dragomir R Radev. 2004. "LexRank: Graph-based lexical centrality as salience in text summarization." Journal of artificial intelligence research, 22: 457-479.

11. Fabbri, Alexander, Irene Li, Tianwei She, Suyi Li, and Dragomir Radev. 2019. "Multi-news: A large-scale multi-document summarization dataset and abstractive hierarchical model." In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 1074-1084, Florence, Italy. Association for Computational Linguistics.

12. Hallgren, Kevin A. 2012. "Computing inter-rater reliability for observational data: an overview and tutorial." Tutorials in quantitative methods for psychology, 8(1): 23-34.

13. Hossain, MD Zakir, Ferdous Sohel, Mohd Fairuz Shiratuddin, and Hamid Laga. 2019. "A comprehensive survey of deep learning for image captioning." ACM Computing Surveys, 51(6): 1-36.

14. Izacard, Gautier, and Edouard Grave. 2021. "Leveraging passage retrieval with generative models for open domain question answering." In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 874-880, Online. Association for Computational Linguistics.

15. Jaidka, Kokil, Christopher Khoo, and Jin-Cheon Na. 2013a. "Deconstructing human literature reviews–a framework for multi-document summarization." In Proceedings of the 14th European Workshop on Natural Language Generation, pages 125-135, Sofia, Bulgaria. Association for Computational Linguistics.

16. Jaidka, Kokil, Christopher SG Khoo, and Jin-Cheon Na. 2013b. "Literature review writing: how information is selected and transformed." In Aslib Proceedings, volume 65, pages 303-325.

17. Jaidka, Kokil, Christopher SG Khoo, and Jin-Cheon Na. 2019. "Characterizing human summarization strategies for text reuse and transformation in literature review writing." Scientometrics, 121(3): 1563-1582.

18. Ji, Ziwei, Nayeon Lee, Rita Frieske, Tiezheng Yu, Dan Su, Yan Xu, Etsuko Ishii, Yejin Bang, Andrea Madotto, and Pascale Fung. 2022. "Survey of hallucination in natural language generation." ACM Computing Surveys. Just Accepted.

19. Jiao, Licheng, and Jin Zhao. 2019. "A survey on the new generation of deep learning in image processing." IEEE Access, 7: 172231-172263.

20. Khan, Khalid S, Regina Kunz, Jos Kleijnen, and Gerd Antes. 2003. "Five steps to conducting a systematic review." Journal of the royal society of medicine, 96(3): 118-121.

21. Laga, Hamid. 2019. "A survey on deep learning architectures for image-based depth reconstruction." arXiv preprint arXiv:1906.06113.

22. Laskar, Md Tahmid Rahman, Enamul Hoque, and Jimmy Xiangji Huang. 2022. "Domain adaptation with pre-trained transformers for query-focused abstractive text summarization." Computational Linguistics, 48(2): 279-320.

23. LeCun, Yann, Bernhard Boser, John S Denker, Donnie Henderson, Richard E Howard, Wayne Hubbard, and Lawrence D Jackel. 1989. "Backpropagation applied to handwritten zip code recognition." Neural computation, 1(4): 541-551.

24. Lewis, Mike, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. 2020. "BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension." In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 7871-7880, Online. Association for Computational Linguistics.

25. Lin, Chin-Yew. 2004. "ROUGE: A package for automatic evaluation of summaries." In Text Summarization Branches Out, pages 74-81, Barcelona, Spain. Association for Computational Linguistics.

26. Liu, Ruijun, Yuqian Shi, Changjiang Ji, and Ming Jia. 2019. "A survey of sentiment analysis based on transfer learning." IEEE Access, 7: 85401-85412.

27. Lo, Kyle, Lucy Lu Wang, Mark Neumann, Rodney Kinney, and Daniel Weld. 2020. "S2ORC: The semantic scholar open research corpus." In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 4969-4983, Online. Association for Computational Linguistics.

28. Lu, Yao, Yue Dong, and Laurent Charlin. 2020. "Multi-XScience: A large-scale dataset for extreme multi-document summarization of scientific articles." In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 8068-8074, Online.


In [1]:
text = """
1. Agarwal, Nitin, Ravi Shankar Reddy, Kiran Gvr, and Carolyn Penstein Rosé. 2011. "Towards multi-document summarization of scientific articles: making interesting comparisons with SciSumm." In Proceedings of the Workshop on Automatic Summarization for Different Genres, Media, and Languages, pages 8-15, Portland, Oregon. Association for Computational Linguistics.

2. Beltagy, Iz, Kyle Lo, and Arman Cohan. 2019. "SciBERT: A pretrained language model for scientific text." In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 3615-3620, Hong Kong, China. Association for Computational Linguistics.

3. Beltagy, Iz, Matthew E Peters, and Arman Cohan. 2020. "Longformer: The long-document transformer." arXiv preprint arXiv:2004.05150.

4. Bornmann, Lutz, and Rüdiger Mutz. 2015. "Growth rates of modern science: A bibliometric analysis based on the number of publications and cited references." Journal of the Association for Information Science and Technology, 66(11): 2215-2222.

5. Brown, Tom, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. 2020. "Language models are few-shot learners." In Advances in Neural Information Processing Systems, volume 33, pages 1877-1901.

6. Cohan, Arman, Franck Dernoncourt, Doo Soon Kim, Trung Bui, Seokhwan Kim, Walter Chang, and Nazli Goharian. 2018. "A discourse-aware attention model for abstractive summarization of long documents." In Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers), pages 615-621, New Orleans, Louisiana. Association for Computational Linguistics.

7. Cohan, Arman, Guy Feigenblat, Dayne Freitag, Tirthankar Ghosal, Drahomira Herrmannova, Petr Knoth, Kyle Lo, Philipp Mayr, Michal Shmueli-Scheuer, Anita de Waard, and Lucy Lu Wang. 2022. "Overview of the third workshop on scholarly document processing." In Proceedings of the Third Workshop on Scholarly Document Processing, pages 1-6, Gyeongju, Republic of Korea. Association for Computational Linguistics.

8. Cohan, Arman, Sergey Feldman, Iz Beltagy, Doug Downey, and Daniel Weld. 2020. "SPECTER: Document-level representation learning using citation-informed transformers." In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 2270-2282, Online. Association for Computational Linguistics.

9. DeYoung, Jay, Iz Beltagy, Madeleine van Zuylen, Bailey Kuehl, and Lucy Lu Wang. 2021. "MS²: Multi-document summarization of medical studies." In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 7494-7513, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.

10. Erkan, Günes, and Dragomir R Radev. 2004. "LexRank: Graph-based lexical centrality as salience in text summarization." Journal of artificial intelligence research, 22: 457-479.

11. Fabbri, Alexander, Irene Li, Tianwei She, Suyi Li, and Dragomir Radev. 2019. "Multi-news: A large-scale multi-document summarization dataset and abstractive hierarchical model." In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 1074-1084, Florence, Italy. Association for Computational Linguistics.

12. Hallgren, Kevin A. 2012. "Computing inter-rater reliability for observational data: an overview and tutorial." Tutorials in quantitative methods for psychology, 8(1): 23-34.

13. Hossain, MD Zakir, Ferdous Sohel, Mohd Fairuz Shiratuddin, and Hamid Laga. 2019. "A comprehensive survey of deep learning for image captioning." ACM Computing Surveys, 51(6): 1-36.

14. Izacard, Gautier, and Edouard Grave. 2021. "Leveraging passage retrieval with generative models for open domain question answering." In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 874-880, Online. Association for Computational Linguistics.

15. Jaidka, Kokil, Christopher Khoo, and Jin-Cheon Na. 2013a. "Deconstructing human literature reviews–a framework for multi-document summarization." In Proceedings of the 14th European Workshop on Natural Language Generation, pages 125-135, Sofia, Bulgaria. Association for Computational Linguistics.

16. Jaidka, Kokil, Christopher SG Khoo, and Jin-Cheon Na. 2013b. "Literature review writing: how information is selected and transformed." In Aslib Proceedings, volume 65, pages 303-325.

17. Jaidka, Kokil, Christopher SG Khoo, and Jin-Cheon Na. 2019. "Characterizing human summarization strategies for text reuse and transformation in literature review writing." Scientometrics, 121(3): 1563-1582.

18. Ji, Ziwei, Nayeon Lee, Rita Frieske, Tiezheng Yu, Dan Su, Yan Xu, Etsuko Ishii, Yejin Bang, Andrea Madotto, and Pascale Fung. 2022. "Survey of hallucination in natural language generation." ACM Computing Surveys. Just Accepted.

19. Jiao, Licheng, and Jin Zhao. 2019. "A survey on the new generation of deep learning in image processing." IEEE Access, 7: 172231-172263.

20. Khan, Khalid S, Regina Kunz, Jos Kleijnen, and Gerd Antes. 2003. "Five steps to conducting a systematic review." Journal of the royal society of medicine, 96(3): 118-121.

21. Laga, Hamid. 2019. "A survey on deep learning architectures for image-based depth reconstruction." arXiv preprint arXiv:1906.06113.

22. Laskar, Md Tahmid Rahman, Enamul Hoque, and Jimmy Xiangji Huang. 2022. "Domain adaptation with pre-trained transformers for query-focused abstractive text summarization." Computational Linguistics, 48(2): 279-320.

23. LeCun, Yann, Bernhard Boser, John S Denker, Donnie Henderson, Richard E Howard, Wayne Hubbard, and Lawrence D Jackel. 1989. "Backpropagation applied to handwritten zip code recognition." Neural computation, 1(4): 541-551.

24. Lewis, Mike, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. 2020. "BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension." In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 7871-7880, Online. Association for Computational Linguistics.

25. Lin, Chin-Yew. 2004. "ROUGE: A package for automatic evaluation of summaries." In Text Summarization Branches Out, pages 74-81, Barcelona, Spain. Association for Computational Linguistics.

26. Liu, Ruijun, Yuqian Shi, Changjiang Ji, and Ming Jia. 2019. "A survey of sentiment analysis based on transfer learning." IEEE Access, 7: 85401-85412.

27. Lo, Kyle, Lucy Lu Wang, Mark Neumann, Rodney Kinney, and Daniel Weld. 2020. "S2ORC: The semantic scholar open research corpus." In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 4969-4983, Online. Association for Computational Linguistics.

28. Lu, Yao, Yue Dong, and Laurent Charlin. 2020. "Multi-XScience: A large-scale dataset for extreme multi-document summarization of scientific articles." In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 8068-8074, Online.
"""

In [5]:
import re
import json

# Split the raw text into individual reference entries. Each entry begins on
# a new line with its ordinal followed by a dot (e.g. "\n12."), so that
# prefix is used as the delimiter and is dropped from the entries.
pattern_split = r'\n\d+\.'
references_list = re.split(pattern_split, text)
references_list = [ref for ref in references_list if ref.strip()]  # drop empty entries
print(references_list)

# Extract the author list, publication year and title from every entry.
results = []
for ref in references_list:
    # Authors are everything before the first ". <year>"; the year may carry
    # a one-letter suffix such as "2013a".
    author_year_match = re.search(r"(.+?)\. (\d{4}[a-z]?)", ref)
    if author_year_match:
        authors = author_year_match.group(1).strip()
        year = author_year_match.group(2)
    else:
        authors = "未找到作者"
        year = "未找到年份"

    # The title is the text enclosed in double quotes.
    title_match = re.search(r'"(.+)"', ref)
    title = title_match.group(1) if title_match else "未找到标题"

    results.append({
        "作者列表": authors,
        "发表时间": year,
        "文章名称": title
    })

# Serialise to JSON; ensure_ascii=False keeps non-ASCII characters readable
# in the output instead of escaping them as \uXXXX.
results_json = json.dumps(results, ensure_ascii=False, indent=4)

# Write with an explicit UTF-8 encoding. Because the JSON contains raw
# non-ASCII characters, relying on the platform default encoding can raise
# UnicodeEncodeError or produce a file that the UTF-8 read below cannot decode.
with open('references.json', 'w', encoding='utf-8') as f:
    f.write(results_json)

# Read the file back and verify the round trip plus the expected content.
with open('references.json', 'r', encoding='utf-8') as f:
    loaded_results = json.load(f)
assert loaded_results == results
assert len(results) == 28
assert results[2] == {'作者列表': 'Beltagy, Iz, Matthew E Peters, and Arman Cohan', '发表时间': '2020', '文章名称': 'Longformer: The long-document transformer.'}

[' Agarwal, Nitin, Ravi Shankar Reddy, Kiran Gvr, and Carolyn Penstein Rosé. 2011. "Towards multi-document summarization of scientific articles: making interesting comparisons with SciSumm." In Proceedings of the Workshop on Automatic Summarization for Different Genres, Media, and Languages, pages 8-15, Portland, Oregon. Association for Computational Linguistics.\n', ' Beltagy, Iz, Kyle Lo, and Arman Cohan. 2019. "SciBERT: A pretrained language model for scientific text." In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 3615-3620, Hong Kong, China. Association for Computational Linguistics.\n', ' Beltagy, Iz, Matthew E Peters, and Arman Cohan. 2020. "Longformer: The long-document transformer." arXiv preprint arXiv:2004.05150.\n', ' Bornmann, Lutz, and Rüdiger Mutz. 2015. "Growth rates of modern science: A bibliometric analysis based on the number of p