In [1]:
import pandas as pd
import torch
import time
from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
from tqdm import tqdm  # 导入tqdm库

In [None]:
import sys

# Report the interpreter binary and the module search path used by this kernel,
# one per line.
print(sys.executable, sys.path, sep='\n')

/root/miniconda3/bin/python
['/root/autodl-tmp/FA_test_A100', '/root/miniconda3/lib/python38.zip', '/root/miniconda3/lib/python3.8', '/root/miniconda3/lib/python3.8/lib-dynload', '', '/root/miniconda3/lib/python3.8/site-packages']


In [9]:
!pip list

Package    Version
---------- -------
pip        23.2.1
setuptools 68.0.0
tqdm       4.66.1
wheel      0.38.4


In [2]:
# Pick the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the benchmark configurations from the Excel workbook.
df = pd.read_excel('test_dim_data.xlsx', engine='openpyxl')

# Accumulator for the measurements: one independent, initially empty list per
# output column.
results = {
    column: []
    for column in (
        'batchsize',
        'nheads',
        'headdim',
        'seqlen',
        'average_time',
        'all_times',
    )
}

In [3]:
# Display the selected torch device (set in the setup cell) as the cell output.
device

device(type='cuda')

In [4]:
# Benchmark flash_attn_func once per configuration (row) of the input sheet.
# For every row: one untimed warm-up call, then 100 timed calls on fresh random
# fp16 inputs; the fastest and slowest samples are dropped before averaging.
for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows"):
    batch_size = int(row['batchsize'])
    nheads = int(row['nheads'])
    headdim = int(row['headdim'])
    seqlen = int(row['seqlen'])

    # Tensor layout passed to flash_attn_func: (batch, seqlen, nheads, headdim).
    shape = (batch_size, seqlen, nheads, headdim)

    times = []

    # GPU warm-up: run the kernel once so one-time initialization costs are not
    # part of the measurements. Inputs are generated directly on the device in
    # fp16, avoiding a CPU generation + host-to-device copy + cast per tensor.
    q = torch.randn(shape, device=device, dtype=torch.float16)
    k = torch.randn(shape, device=device, dtype=torch.float16)
    v = torch.randn(shape, device=device, dtype=torch.float16)
    flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False)
    torch.cuda.synchronize()  # make sure the warm-up has actually finished

    for _ in range(100):
        q = torch.randn(shape, device=device, dtype=torch.float16)
        k = torch.randn(shape, device=device, dtype=torch.float16)
        v = torch.randn(shape, device=device, dtype=torch.float16)

        # CUDA work is queued asynchronously: drain the queue first so the
        # input generation above is not counted in the timed interval.
        torch.cuda.synchronize()

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False)
        torch.cuda.synchronize()  # wait for the attention kernel to complete
        end_time = time.perf_counter()

        times.append(end_time - start_time)

    times_sorted = sorted(times)
    times_filtered = times_sorted[1:-1]  # drop the minimum and maximum sample
    average_time = sum(times_filtered) / len(times_filtered)

    # Record the configuration together with its measurements.
    results['batchsize'].append(batch_size)
    results['nheads'].append(nheads)
    results['headdim'].append(headdim)
    results['seqlen'].append(seqlen)
    results['average_time'].append(average_time)
    results['all_times'].append(times_filtered)  # filtered samples, sorted ascending




Processing rows: 100%|█████████████████████████████████████████████████████| 16/16 [09:47<00:00, 36.75s/it]


In [5]:
# Turn the collected measurements into a table and write it to a new Excel
# workbook, omitting the index column.
results_df = pd.DataFrame(data=results)
results_df.to_excel(excel_writer='test_dim_data_results_FA2.xlsx', index=False)