In [6]:
import pandas as pd
import glob
import os
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas diagnostics (e.g. about
# assigning into a slice of a DataFrame) -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [7]:
# Load every Jira CSV export and combine them into a single frame.
file_prefix='Jira'
file_pattern = os.path.join("./data/jira/", f'{file_prefix}*.csv')
# glob returns matches in an arbitrary, filesystem-dependent order; sorting the
# paths makes the concatenated row order identical on every run.
csv_files = sorted(glob.glob(file_pattern))
if not csv_files:
    # Fail with an actionable message; pd.concat on an empty list only reports
    # "No objects to concatenate".
    raise FileNotFoundError(f'No CSV files match {file_pattern!r}')

dataframes = [pd.read_csv(file) for file in csv_files]
df = pd.concat(dataframes, ignore_index=True)

# Order the issues chronologically by creation time. mergesort is a stable
# algorithm, so issues sharing the same timestamp keep their load order.
df['Created'] = pd.to_datetime(df['Created'])
df = df.sort_values(by='Created', ascending=True, kind='mergesort')

# Drop issues that lack an assignee, a description, a sprint or logged time.
df = df.dropna(subset=['Assignee Id', 'Description', 'Sprint', 'Time Spent'])
# Rename some of the columns to short names.
df = df.rename(columns={
    'Custom field (Story Points)': 'storypoint',
    'Issue key': 'issuekey',
    'Summary': 'title',
    'Description': 'description',
    'Time Spent': 'timespent'
})
# Select the useful columns. The explicit copy makes `data` an independent
# frame, so later column assignments do not operate on a slice of `df`.
selected_cols = ['issuekey', 'title', 'description', 'storypoint', 'timespent']
data = df[selected_cols].copy()

In [8]:
# Print a summary of the selected columns: dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4671 entries, 5048 to 0
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   issuekey     4671 non-null   object 
 1   title        4671 non-null   object 
 2   description  4671 non-null   object 
 3   storypoint   1197 non-null   float64
 4   timespent    4671 non-null   float64
dtypes: float64(2), object(3)
memory usage: 219.0+ KB


In [9]:
# Define the mapping function
def map_story_points(value):
    """Snap a story-point estimate up to the nearest value on a fixed scale.

    The scale is 1, 2, 3, 5, 8, 13, 21. The result is the smallest scale value
    that is greater than or equal to ``value``, returned as a float; anything
    at or below 1 maps to 1.0.

    Returns -1 when ``value`` is greater than 21. NaN also yields -1, because
    every ``<=`` comparison against NaN is false.
    """
    for ceiling in (1.0, 2.0, 3.0, 5.0, 8.0, 13.0, 21.0):
        if value <= ceiling:
            return ceiling
    # Nothing on the scale is large enough (or value is NaN): sentinel result.
    return -1

# Replace each story-point value with its bucket from map_story_points,
# applied element by element.
data['storypoint'] = data['storypoint'].map(map_story_points)
# The filter on the -1 sentinel and the index reset below are currently
# disabled, so rows mapped to -1 are written to the CSV as well.
#data = data[data['storypoint'] != -1]

#data = data.reset_index(drop=True)
# Persist the result without the index column.
data.to_csv('./data/mes_all.csv', index=False)

In [10]:
def split_data(data):
    """Tag each row as 'train', 'val' or 'test' by its position in the frame.

    The first 60% of the rows (in their current order) are marked 'train',
    the next 20% 'val' and the remainder 'test'. The marks are written to a
    new ``split_mark`` column.

    The assignment is positional, so it does not depend on the index labels:
    frames whose index is non-default, unsorted or has gaps (e.g. after
    sorting or dropping rows without resetting the index) are split correctly.

    Parameters:
        data: pandas DataFrame to annotate. It is modified in place.

    Returns:
        The same DataFrame object, with the ``split_mark`` column added.
    """
    num_rows = len(data)
    train_split = int(num_rows * 0.6)
    val_split = int(num_rows * 0.8)
    # Build one mark per row in row order; a plain list is assigned by
    # position rather than aligned on index labels.
    marks = (
        ['train'] * train_split                  # first 60% -> train
        + ['val'] * (val_split - train_split)    # 60% to 80% -> val
        + ['test'] * (num_rows - val_split)      # after 80% -> test
    )
    data['split_mark'] = marks
    return data

#data = split_data(data)
#data.to_csv('./data/mes_all.csv', index=False)