In [1]:
import pandas as pd
import glob
import os
import warnings
# NOTE(review): with no category/module argument this suppresses every warning
# for the rest of the session, including pandas warnings about chained
# assignment. Consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [2]:
# Load every exported Jira CSV whose file name starts with `file_prefix`.
file_prefix = 'done_'
file_pattern = os.path.join("../data/jira_trem/", f'{file_prefix}*.csv')
# glob returns paths in arbitrary, OS-dependent order; sort them so the
# concatenated row order is reproducible between runs and machines.
csv_files = sorted(glob.glob(file_pattern))
if not csv_files:
    # pd.concat([]) would otherwise fail with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError(f'No CSV files match {file_pattern!r}')

dataframes = [pd.read_csv(file) for file in csv_files]
df = pd.concat(dataframes, ignore_index=True)

# Order issues chronologically. A stable sort keeps rows that share the same
# 'Created' timestamp in (sorted) file order instead of an arbitrary one.
df['Created'] = pd.to_datetime(df['Created'])
df = df.sort_values(by='Created', ascending=True, kind='mergesort')

# Keep only issues that have an assignee, a description, a sprint and logged time.
df = df.dropna(subset=['Assignee Id', 'Description', 'Sprint', 'Time Spent'])
# Rename the Jira export columns to short names.
df = df.rename(columns={
    'Custom field (Story Points)': 'storypoint',
    'Issue key': 'issuekey',
    'Summary': 'title',
    'Description': 'description'
})
# Keep only the columns that are used downstream. The explicit copy makes
# `data` independent of `df`, so later column assignments on `data` are not
# chained assignments on a slice.
selected_cols = ['issuekey', 'title', 'description', 'storypoint']
data = df[selected_cols].copy()

# Define the story-point bucketing function
def map_story_points(value):
    """Round a raw story-point estimate up to the nearest allowed bucket.

    The buckets are 1, 2, 3, 5, 8, 13 and 21. The first bucket that is
    greater than or equal to ``value`` is returned as a float.

    Returns:
        The bucket as a float, or the integer sentinel ``-1`` when ``value``
        is above 21 or compares False against every bucket (e.g. NaN).
    """
    for ceiling in (1, 2, 3, 5, 8, 13, 21):
        if value <= ceiling:
            return float(ceiling)
    # Nothing matched: too large, or not comparable (NaN).
    return -1

# Bucket the raw story points. `assign` builds a new frame instead of writing
# a column into `data`, which was derived from `df` by column selection.
data = data.assign(storypoint=data['storypoint'].apply(map_story_points))
# -1 is the sentinel map_story_points returns for values it cannot bucket.
data = data[data['storypoint'] != -1]

# Renumber rows 0..n-1 after filtering.
data = data.reset_index(drop=True)
data

Unnamed: 0,issuekey,title,description,storypoint
0,SAME-20,New architecture document v1,"each module description, integration points, t...",3.0
1,SAME-28,Integrating the frontend part for the i18n ser...,Specs: todo,3.0
2,SAME-38,Sign In - federated user,*AS A* Federated user\n\n*I WANT TO* Sign In\n...,13.0
3,SAME-40,Forgot password- non-federated users,*AS A* Non-federated user\n\n*I WANT TO* reset...,5.0
4,SAME-41,Legal Notice link,*AS A* User of this application\n\n*I WANT* to...,1.0
...,...,...,...,...
1098,SAME-12997,Updating the background page (Connected MFG),Can you please ask the team to update the back...,1.0
1099,SAME-13001,[BE] Update the static-resource-cache and widg...,"Currently, the images for static-resource-cach...",1.0
1100,SAME-13003,[CNUNV] [MES16] Bedford - SAML Integration wit...,Ticket cloned from 42Q Jira by Luis (DSR-4810)...,2.0
1101,SAME-13021,[Auth0] Test and fix the SAML signout flow,The SAML signout flow has been disabled for a ...,1.0


In [3]:
def split_data(data, train_end=0.6, val_end=0.8):
    """Tag each row of ``data`` as 'train', 'val' or 'test' by row position.

    Rows are labelled in their current order: the first ``train_end`` fraction
    is 'train', rows up to the ``val_end`` fraction are 'val', and the rest
    are 'test'. Cut points are truncated with ``int``, so rounding remainders
    fall into the later partitions.

    The labels are assigned positionally, so the result does not depend on
    the frame's index (it may be non-contiguous, unsorted or duplicated).

    Args:
        data: DataFrame to label. It is modified in place: a 'split_mark'
            column is added or overwritten.
        train_end: Cumulative fraction of rows at which 'train' ends.
        val_end: Cumulative fraction of rows at which 'val' ends.

    Returns:
        The same DataFrame object, with the 'split_mark' column set.

    Raises:
        ValueError: If not ``0 <= train_end <= val_end <= 1``.
    """
    if not 0 <= train_end <= val_end <= 1:
        raise ValueError(
            f'expected 0 <= train_end <= val_end <= 1, '
            f'got train_end={train_end}, val_end={val_end}'
        )

    num_rows = len(data)
    train_split = int(num_rows * train_end)
    val_split = int(num_rows * val_end)

    # Build one label per row; assigning a list sets values by position,
    # not by index label.
    marks = (
        ['train'] * train_split
        + ['val'] * (val_split - train_split)
        + ['test'] * (num_rows - val_split)
    )
    data['split_mark'] = marks
    return data

# Label each row with its train/val/test partition, then write the labelled
# frame to disk without the index column.
data = split_data(data)
data.to_csv(path_or_buf='../data/mes.csv', index=False)