In [2]:
import time
from pathlib import Path

import pandas as pd

In [None]:
# Location of the paper/question-id mapping CSV, relative to the working directory.
paper_question_path = Path('datasets/HLE-bio-paper-questionid.csv')
# Load the whole CSV into a DataFrame using pandas' default parsing options.
paper_question_df = pd.read_csv(filepath_or_buffer=paper_question_path)

In [36]:
# Path.name is the final path component (the file name);
# Path.stem is that same component with its last suffix removed.
(paper_question_path.name, paper_question_path.stem)

('HLE-bio-paper-questionid.csv', 'HLE-bio-paper-questionid')

In [37]:
# Swap the file extension: with_suffix() returns a new path object
# and leaves paper_question_path itself unchanged.
markdown_path = paper_question_path.with_suffix('.md')
markdown_path

PosixPath('datasets/HLE-bio-paper-questionid.md')

In [None]:
# Distinct questionid values, in order of first appearance.
# Series.unique() returns a numpy.ndarray here, which hints that pandas is
# built on top of numpy; ndarray.tolist() converts it to a plain Python list.
questionid_column = paper_question_df['questionid']
unique_question_ids = questionid_column.unique()
unique_question_id_list = unique_question_ids.tolist()
(type(unique_question_ids), type(unique_question_id_list))

(numpy.ndarray, list)

In [7]:
# Number of distinct questionid values; missing values are not counted.
num = paper_question_df['questionid'].nunique(dropna=True)
num

74

In [9]:
# Preview the first five rows to check which columns were loaded.
paper_question_df.head(n=5)

Unnamed: 0,id,questionid,parsed_path
0,19269100,6701ca6bb773dec162bde23c,/mnt/seed20_bio/bio/datahub/corpus/HLE/HLE-pap...
1,MA20190102211802450847,66ff063787bfb80443d02df6,/mnt/seed20_bio/bio/datahub/corpus/HLE/HLE-pap...
2,72042926,671aa91f6a8e7c23ccbf902b,/mnt/seed20_bio/bio/datahub/corpus/HLE/HLE-pap...
3,MA20200721082727551672,671cbba2784e3ba406eefb34,/mnt/seed20_bio/bio/datahub/corpus/HLE/HLE-pap...
4,37253800,67155ca2675b57d8bbc9144d,/mnt/seed20_bio/bio/datahub/corpus/HLE/HLE-pap...


In [None]:
# Look up the records for one questionid with boolean indexing: comparing a
# column to a value yields a boolean mask, and indexing with that mask keeps
# only the rows where it is True.
target_id = '6701ca6bb773dec162bde23c'
is_target = paper_question_df['questionid'] == target_id

# Plain [] indexing and .loc both accept the same mask.
target_records = paper_question_df[is_target]
target_records2 = paper_question_df.loc[is_target]

# The two selections have the same type, and DataFrame.equals() confirms that
# they hold the same values.
assert type(target_records) is type(target_records2)
assert target_records.equals(target_records2)

In [None]:
# id() returns an object's identity (its memory address in CPython).
# Showing both identities tells us whether the two selections are the same
# object or two separate objects.
tuple(id(records) for records in (target_records, target_records2))

(5297383824, 5297382912)

In [None]:
# Prefer itertuples() over iterrows(): it is faster because it yields a
# namedtuple per row instead of building a Series.
# Each iteration style below yields a different row type.
simple_df = pd.DataFrame({'id': [0, 1], 'name': ['a', 'b']})

row_iterators = (
    simple_df.itertuples(name=None),                    # plain tuples
    simple_df.itertuples(),                             # namedtuples called "Pandas"
    (series for _, series in simple_df.iterrows()),     # Series (index label dropped)
)

# Print the type of every row produced by each iteration style, in order.
for rows in row_iterators:
    for row in rows:
        print(type(row))

<class 'tuple'>
<class 'tuple'>
<class 'pandas.core.frame.Pandas'>
<class 'pandas.core.frame.Pandas'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [None]:
# Benchmark pandas row iteration: iterrows() vs itertuples() vs itertuples(name=None).
# Intervals are measured with time.perf_counter(), a monotonic high-resolution
# clock intended for timing; time.time() is a wall clock that can jump when the
# system time is adjusted.
# `time` and `pd` come from the notebook's import cell.

# Number of rows to iterate over; lower it for a quicker run.
N_ROWS = 100_000

# Two integer columns with N_ROWS rows each.
df = pd.DataFrame({
    "id": range(N_ROWS),
    "value": range(N_ROWS),
})

# Elapsed seconds per strategy, kept so the results can be compared afterwards.
timings = {}

# iterrows(): yields (index, Series) pairs.
start = time.perf_counter()
for idx, row in df.iterrows():
    incremented = row["id"] + 1
timings["iterrows"] = time.perf_counter() - start
print(f"iterrows:  {timings['iterrows']:.4f} seconds")

# itertuples(): yields namedtuples, so columns are read as attributes.
start = time.perf_counter()
for row in df.itertuples():
    incremented = row.id + 1
timings["itertuples"] = time.perf_counter() - start
print(f"itertuples: {timings['itertuples']:.4f} seconds")

# itertuples(name=None): yields plain tuples, so columns are read by position.
start = time.perf_counter()
for row in df.itertuples(name=None):
    incremented = row[1] + 1   # row[0] is the index, row[1] is the "id" column
timings["itertuples(name=None)"] = time.perf_counter() - start
print(f"itertuples(name=None): {timings['itertuples(name=None)']:.4f} seconds")


iterrows:  0.8428 seconds
itertuples: 0.0296 seconds
itertuples(name=None): 0.0163 seconds
