In [1]:
import pandas as pd
import string, re, nltk
import spacy
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# RegexpTokenizer
regexp = RegexpTokenizer("[\w']+")

# 转换为小写
def convert_to_lowercase(text):
    return text.lower()

# 去除文本两边空格
def strip_text(text):
    return text.strip()

# 移除标点符号
def remove_punctuation(text):
    punct_str = string.punctuation
    punct_str = punct_str.replace("'", "")
    return re.sub(f"[{re.escape(punct_str)}]", " ", text)

# 移除标题的标点符号
def remove_summary_punctuation(text):
    punct_str = string.punctuation
    punct_str = punct_str.replace("'", "")
    punct_str = punct_str.replace("_", "")
    return re.sub(f"[{re.escape(punct_str)}]", " ", text)

# 移除数字token
def remove_number_token(text):
    words = text.split()
    # 过滤掉纯数字的词
    filtered_words = [word for word in words if not re.match(r'^\d+$', word)]
    # 将词按空格合并成句子
    combined_sentence = ' '.join(filtered_words).strip()
    return combined_sentence

# 移除html标签
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(' ', text)

# 移除表情
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags = re.UNICODE)
    return emoji_pattern.sub(' ', text)

# 移除http链接
def remove_http(text):
    http = "https?://\S+|www\.\S+" # matching strings beginning with http (but not just "http")
    pattern = r"({})".format(http) # creating pattern
    return re.sub(pattern, " ", text)

# Dictionary of acronyms
acronyms_url = './data/english_acronyms.json'
acronyms_dict = pd.read_json(acronyms_url, typ = 'series')
acronyms_list = list(acronyms_dict.keys())

# remove html tags
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(' ', text)

# 移除文本中包含的image tag
def remove_image_tags(text):
    # Define the regular expression pattern to match the image tags
    pattern = re.compile(r'!.*?!')
    cleaned_text = pattern.sub(' ', text)
    return cleaned_text

# 移除文本中{}的内容
def remove_bracket(text):
    pattern = re.compile(r'\{.*?\}')
    return pattern.sub(' ', text)

# 移除文本中||的内容
def remove_table(text):
    pattern = re.compile(r'\|.*?\|')
    return pattern.sub(' ', text)

# 移除文本中**的内容
def remove_star(text):
    pattern = re.compile(r'\*.*?\*')
    return pattern.sub(' ', text)

# convert contractions in a text
def convert_acronyms(text):
    words = []
    for word in regexp.tokenize(text):
        if word in acronyms_list:
            words = words + acronyms_dict[word].split()
        else:
            words = words + word.split()
    
    text_converted = " ".join(words)
    return text_converted

# Dictionary of contractions
contractions_url = './data/english_contractions.json'
contractions_dict = pd.read_json(contractions_url, typ = 'series')
# List of contractions
contractions_list = list(contractions_dict.keys())

# convert contractions in a text
def convert_contractions(text):
    words = []
    for word in regexp.tokenize(text):
        if word in contractions_list:
            words = words + contractions_dict[word].split()
        else:
            words = words + word.split()
    
    text_converted = " ".join(words)
    return text_converted

# 移除停用词
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    
    # 使用正则表达式 tokenizer 处理缩写和标点
    tokenizer = RegexpTokenizer(r'\w+\'?\w+|\w+')
    words = tokenizer.tokenize(text)
    
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Stemming 词干提取，stemming. 如 "running", "runner" 会被转换成 "run".
stemmer = PorterStemmer()
def text_stemmer(text):
    text_stem = " ".join([stemmer.stem(word) for word in regexp.tokenize(text)])
    return text_stem

# Lemmatization 词形还原. 如Better被还原为good
spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
#lemmatizer = WordNetLemmatizer()
def text_lemmatizer(text):
    text_spacy = " ".join([token.lemma_ for token in spacy_lemmatizer(text)])
    #text_wordnet = " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(text)]) # regexp.tokenize(text)
    return text_spacy
    #return text_wordnet

# 移除非字母的词
def discard_non_alpha(text):
    word_list_non_alpha = [word for word in regexp.tokenize(text) if word.isalpha()]
    text_non_alpha = " ".join(word_list_non_alpha)
    return text_non_alpha

# 根据词性过滤单词, 如过滤连词(conjunctions), 介词(prepositions)。保留名词(nouns)、形容词和动词
def keep_pos(text):
    tokens = regexp.tokenize(text)
    tokens_tagged = nltk.pos_tag(tokens)
    #keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
    keep_tags = ['JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'FW', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    keep_words = [x[0] for x in tokens_tagged if x[1] in keep_tags]
    return " ".join(keep_words)

# Additional stopwords
alphabets = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
prepositions = ["about", "above", "across", "after", "against", "among", "around", "at", "before", "behind", "below", "beside", "between", "by", "down", "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on", "out", "over", "through", "to", "toward", "under", "up", "with"]
prepositions_less_common = ["aboard", "along", "amid", "as", "beneath", "beyond", "but", "concerning", "considering", "despite", "except", "following", "like", "minus", "onto", "outside", "per", "plus", "regarding", "round", "since", "than", "till", "underneath", "unlike", "until", "upon", "versus", "via", "within", "without"]
coordinating_conjunctions = ["and", "but", "for", "nor", "or", "so", "and", "yet"]
correlative_conjunctions = ["both", "and", "either", "or", "neither", "nor", "not", "only", "but", "whether", "or"]
subordinating_conjunctions = ["after", "although", "as", "as if", "as long as", "as much as", "as soon as", "as though", "because", "before", "by the time", "even if", "even though", "if", "in order that", "in case", "in the event that", "lest", "now that", "once", "only", "only if", "provided that", "since", "so", "supposing", "that", "than", "though", "till", "unless", "until", "when", "whenever", "where", "whereas", "wherever", "whether or not", "while"]
others = ["ã", "å", "ì", "û", "ûªm", "ûó", "ûò", "ìñ", "ûªre", "ûªve", "ûª", "ûªs", "ûówe"]
custom_words = ["thank", "hi", "hello", "regard", "issue", "please", "cc", "uat"]
additional_stops = alphabets + prepositions + prepositions_less_common + coordinating_conjunctions + correlative_conjunctions + subordinating_conjunctions + others + custom_words

def remove_additional_stopwords(text):
    return " ".join([word for word in regexp.tokenize(text) if word not in additional_stops])

def clean_text(text):
    # 按换行符分割文本
    lines = text.split('\n')
    # 过滤以#开始的句子
    filtered_lines = [line for line in lines if not line.startswith(('#', '*'))]
    # 将句子按空格合并为一个句子
    combined_sentence = ' '.join(filtered_lines).strip()
    return combined_sentence

def clean_log(text):
    # 按换行符分割文本
    lines = text.split('\n')
    # 正则表达式匹配时间日期格式
    date_pattern = r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
    # 过滤以时间日期开头的句子
    filtered_lines = [line for line in lines if not re.match(date_pattern, line)]
    # 将句子按空格合并为一个句子
    combined_sentence = ' '.join(filtered_lines).strip()
    return combined_sentence

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# 加载训练好的模型
model = BertForSequenceClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')

# 创建分类pipeline
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, truncation=True, max_length=512)

# 过滤掉不是句子的文本行
# {'sentence': 0, 'meaningless': 1, 'url': 2, 'greeting': 3, 'log': 4, 'shell': 5, 'image_tag': 6}
def filter_not_sentence(text):
    sentences = []
    formated_text = text.replace('\r\n', '\n').replace('\r', '\n')
    for sentence in formated_text.split('\n\n'):
        result = classifier(sentence)[0]
        if result['label'] == 'LABEL_0':
            sentences.append(sentence)

    return ' '.join(sentences) if len(sentences) != 0 else ''

# 处理标题
def summary_normalizer(text):
    text = strip_text(text)
    text = convert_to_lowercase(text)
    text = remove_summary_punctuation(text)
    text = remove_stopwords(text)
    # text = remove_number_token(text)
    text = discard_non_alpha(text)
    text = keep_pos(text)
    text = remove_additional_stopwords(text)
    text = strip_text(text)
    text = re.sub(' +', ' ', text)
    return text

# 处理内容
def description_normalizer(text):
    text = filter_not_sentence(text)
    text = clean_text(text)
    text = clean_log(text)
    text = convert_to_lowercase(text)
    text = strip_text(text)
    text = re.sub('\n', ' ', text) # converting text to one line
    text = re.sub('\[.*?\]', ' ', text) # removing square brackets
    text = remove_http(text)
    text = remove_image_tags(text)
    text = remove_bracket(text)
    text = remove_table(text)
    text = remove_star(text)
    text = remove_punctuation(text)
    text = remove_html(text)
    text = remove_emoji(text)
    text = convert_acronyms(text)
    text = convert_contractions(text)
    text = remove_stopwords(text)
    text = text_lemmatizer(text)
    text = discard_non_alpha(text)
    text = keep_pos(text)
    text = remove_additional_stopwords(text)
    text = re.sub(' +', ' ', text)  # replace multiple spaces with a single space
    text = strip_text(text)
    return text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import pandas as pd
import glob
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE, ADASYN
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# 读取数据并进行初步处理
file_prefix='JIRA_operation_'
file_pattern = os.path.join("./data/jira2/", f'{file_prefix}*.csv')
csv_files = glob.glob(file_pattern)

dataframes = []
for file in csv_files:
    dataframes.append(pd.read_csv(file))
df = pd.concat(dataframes, ignore_index=True)

cols = ['Summary', 'Issue key', 'Issue id', 'Issue Type', 'Status', 'Priority', 'Resolution', 'Assignee Id', 'Reporter Id', 
            'Creator Id', 'Created', 'Resolved', 'Fix versions', 'Components', 'Description', 'Environment','Custom field (Requested From:)','Custom field ([CHART] Date of First Response)',
            'Parent', 'Status Category', 'Status Category Changed']
df = df[cols]

df = df.rename(columns={
    'Custom field (Requested From:)': 'Request From',
    'Custom field ([CHART] Date of First Response)': 'First Response'
})
# 时间转换
df['Created'] = pd.to_datetime(df['Created'])
df['Resolved'] = pd.to_datetime(df['Resolved'])
df['First Response'] = pd.to_datetime(df['First Response'])
df['Status Category Changed'] = pd.to_datetime(df['Status Category Changed'])

# 时间间隔转换为小时
df['Response Time'] = (df['First Response'] - df['Created']).dt.total_seconds() / 60 / 60
df['Resolved Time'] = (df['Resolved'] - df['Created']).dt.total_seconds() / 60 / 60
df['Description'] = df['Description'].fillna('')
df['Summary Length'] = df['Summary'].str.len()
df['Description Length'] = df['Description'].str.len()
df['Parent'] = df['Parent'].fillna(0).astype(int)
df['Components'] = df['Components'].fillna('Unknown')
df['Assignee Id'] = df['Assignee Id'].fillna('Unknown')

df = df[df['Status'].isin(['Closed', 'Resolved'])]
df = df[df['Resolution'].isin(['Done'])]
df.drop(columns=['Environment', 'Fix versions', 'Request From', 'First Response', 'Response Time', 'Resolution', 'Status Category', 'Status', 'Status Category Changed'], inplace=True)
df.dropna(subset=['Description', 'Resolved Time'], inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df['parent'] = df['Parent'].apply(lambda x: 1 if x != 0 else 0)

# 编码分类特征和标准化数值特征
encoder = OneHotEncoder()
X_cat = encoder.fit_transform(df[['Issue Type', 'Reporter Id', 'Components']]).toarray()
numeric_features = df[['Summary Length', 'Description Length']]
scaler = StandardScaler()
X_numeric = scaler.fit_transform(numeric_features)

# 文本预处理（根据实际需要进行进一步优化）
df['normalized_summary'] = df['Summary'].apply(summary_normalizer)
df['normalized_description'] = df['Description'].apply(description_normalizer)
df['normalized_text'] = df['normalized_summary'] + ' ' + df['normalized_description']

  df['Created'] = pd.to_datetime(df['Created'])
  df['Resolved'] = pd.to_datetime(df['Resolved'])
  df['Status Category Changed'] = pd.to_datetime(df['Status Category Changed'])


In [3]:
# 文本特征提取
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
X_text = tfidf.fit_transform(df['normalized_text']).toarray()

y_priority = df['Priority']
X_combined = np.hstack((X_text, X_cat, X_numeric, df['parent'].values.reshape(-1, 1)))

# 编码目标变量
le = LabelEncoder()
y_priority = le.fit_transform(df['Priority'])

# 分割数据集，确保数据平衡
stratified_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in stratified_split.split(X_combined, y_priority):
    X_train, X_test, y_train, y_test = X_combined[train_index], X_combined[test_index], y_priority[train_index], y_priority[test_index]
    break

# 使用ADASYN处理数据不平衡问题
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# 计算类别权重
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_resampled), y=y_train_resampled)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# 定义XGBoost分类器
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', scale_pos_weight=class_weights)

# 设置参数网格
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 0.8, 0.9, 1.0]
}

# 自定义评分函数，重点关注highest和high优先级的准确性
def weighted_accuracy(y_true, y_pred):
    score = 0
    for true, pred in zip(y_true, y_pred):
        if true == pred:
            if true in [le.transform(['Highest'])[0]]:
                score += 3  
            elif true in [le.transform(['High'])[0]]:
                score += 2
            else:
                score += 1
    return score / len(y_true)

scorer = make_scorer(weighted_accuracy, greater_is_better=True)

# 使用GridSearchCV进行超参数调优
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring=scorer, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# 输出最佳参数和得分
print(f"Best: {grid_search.best_score_} using {grid_search.best_params_}")

# 使用最佳参数评估模型
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 打印分类报告
print(classification_report(y_test, y_pred, target_names=le.classes_))

Fitting 3 folds for each of 144 candidates, totalling 432 fits


Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



Best: 1.600552093367473 using {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.9}
              precision    recall  f1-score   support

        High       0.30      0.14      0.20        69
     Highest       0.57      0.20      0.30        20
         Low       0.00      0.00      0.00         3
      Medium       0.81      0.93      0.86       338

    accuracy                           0.76       430
   macro avg       0.42      0.32      0.34       430
weighted avg       0.71      0.76      0.72       430



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# 使用SMOTETomek进行过采样和欠采样
smt = SMOTETomek(random_state=42)
X_train_smt, y_train_smt = smt.fit_resample(X_train, y_train)

# 模型融合
xgb_clf = xgb.XGBClassifier(learning_rate=0.1, max_depth=9, n_estimators=200, subsample=0.7, scale_pos_weight=class_weights)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
lr_clf = LogisticRegression(class_weight='balanced', max_iter=200)
svc_clf = SVC(probability=True, class_weight='balanced')

voting_clf = VotingClassifier(estimators=[
    ('xgb', xgb_clf),
    ('rf', rf_clf),
    ('lr', lr_clf),
    ('svc', svc_clf)
], voting='soft')

voting_clf.fit(X_train_smt, y_train_smt)

# 评估模型
y_pred = voting_clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


Parameters: { "scale_pos_weight" } are not used.



              precision    recall  f1-score   support

        High       0.50      0.07      0.13        69
     Highest       0.44      0.20      0.28        20
         Low       1.00      0.33      0.50         3
      Medium       0.81      0.98      0.89       338

    accuracy                           0.79       430
   macro avg       0.69      0.40      0.45       430
weighted avg       0.74      0.79      0.73       430

