In [1]:
import numpy as np
import pandas as pd
import glob
import os
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this hides all warning categories, including deprecation
# notices — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Allow up to 100 columns to be shown when a frame is displayed.
pd.set_option('display.max_columns', 100)

In [None]:
# Collect every JIRA export whose file name starts with the prefix and stack
# the files into a single frame with a fresh 0..n-1 index.
file_prefix = 'JIRA_'
file_pattern = os.path.join("./data/jira_par/", f'{file_prefix}*.csv')
# glob returns paths in arbitrary order; sort so the row order is reproducible.
csv_files = sorted(glob.glob(file_pattern))
if not csv_files:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files match {file_pattern!r}")

dataframes = [pd.read_csv(file) for file in csv_files]
df = pd.concat(dataframes, ignore_index=True)
df.head()

In [3]:
def _non_empty_values(row):
    """Return the values of one row with NaN and empty strings dropped."""
    return [x for x in row if pd.notna(x) and x != '']


# Merge every column whose name starts with 'Labels' into one list per row.
label_columns = [col for col in df.columns if col.startswith('Labels')]
df['Label_List'] = df[label_columns].apply(_non_empty_values, axis=1)

# Merge every column whose name starts with 'Components' into one list per row.
# This must read component_columns; reading label_columns here would make
# Component_List an exact copy of Label_List.
component_columns = [col for col in df.columns if col.startswith('Components')]
df['Component_List'] = df[component_columns].apply(_non_empty_values, axis=1)

In [None]:
# Columns carried forward into the analysis (raw JIRA fields plus the two
# list columns built above).
cols = [
    'Summary',
    'Issue id',
    'Issue Type',
    'Status',
    'Priority',
    'Resolution',
    'Assignee Id',
    'Reporter Id',
    'Creator Id',
    'Created',
    'Resolved',
    'Affects versions',
    'Components',
    'Description',
    'Custom field (Product Owner (PO))',
    'Status Category',
    'Custom field (Requested From:)',
    'Label_List',
    'Component_List',
]
df = df.loc[:, cols]
df.head()

In [5]:
# Give the two custom fields shorter names, then drop rows that lack an
# assignee, a description or a product owner.
df = (
    df
    .rename(columns={
        'Custom field (Product Owner (PO))': 'Product Owner',
        'Custom field (Requested From:)': 'Requested From',
    })
    .dropna(subset=['Assignee Id', 'Description', 'Product Owner'])
)

# Parse the timestamp columns.
for timestamp_column in ('Created', 'Resolved'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

# Character lengths of the title and of the body.
for text_column in ('Summary', 'Description'):
    df[f'{text_column} Length'] = df[text_column].str.len()

# Week / month features are currently disabled:
#df['Week'] = df['Created'].dt.isocalendar().week
#df['Month'] = df['Created'].dt.month

# Creation day encoded as the integer YYYYMMDD.
created_day = df['Created'].dt.strftime('%Y%m%d')
df['Create Date'] = created_day.astype(int)

In [None]:
# Keep issues whose status category is not 'To Do' and whose summary is at
# least 10 characters long.
not_todo = df['Status Category'] != 'To Do'
long_enough_summary = df['Summary Length'] >= 10
data = df[not_todo & long_enough_summary]
data.shape

In [7]:
# Keep only the columns that are used from here on.
selected_columns = [
    'Summary',
    'Description',
    'Assignee Id',
    'Reporter Id',
    'Components',
    'Priority',
    'Issue Type',
    'Create Date',
]
data = data.loc[:, selected_columns]

In [None]:
# Number of issues per assignee.
data['Assignee Id'].value_counts()

In [None]:
# Summary of the columns, dtypes and non-null counts of the prepared frame.
data.info()

In [None]:
# Display the prepared frame.
data

文本处理

In [11]:
import string, re, nltk
import spacy
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [12]:
# Shared tokenizer: a token is a run of word characters and apostrophes,
# so "don't" stays a single token.
# The pattern is a raw string because "\w" inside a plain string literal is an
# invalid escape sequence that newer Python versions warn about.
regexp = RegexpTokenizer(r"[\w']+")

# Convert to lowercase
def convert_to_lowercase(text):
    """Return ``text`` with all cased characters lower-cased."""
    lowered = text.lower()
    return lowered

# Trim whitespace from both ends of the text
def strip_text(text):
    """Return ``text`` without leading or trailing whitespace."""
    trimmed = text.strip()
    return trimmed

# Remove punctuation
def remove_punctuation(text):
    """Replace every ASCII punctuation character except the apostrophe with a space.

    Each punctuation character becomes exactly one space, so the length of the
    text is unchanged.
    """
    to_blank = [ch for ch in string.punctuation if ch != "'"]
    table = str.maketrans({ch: " " for ch in to_blank})
    return text.translate(table)

# Remove punctuation from a summary (title)
def remove_summary_punctuation(text):
    """Replace ASCII punctuation with spaces, keeping apostrophes and underscores.

    Each removed character becomes exactly one space.
    """
    preserved = ("'", "_")
    table = str.maketrans(
        {ch: " " for ch in string.punctuation if ch not in preserved}
    )
    return text.translate(table)

# Remove tokens that consist only of digits
def remove_number_token(text):
    """Drop whitespace-separated tokens made up solely of digits.

    The surviving tokens are re-joined with single spaces, so any original
    run of whitespace collapses to one space.
    """
    kept_tokens = []
    for token in text.split():
        if re.fullmatch(r'\d+', token):
            continue  # purely numeric token
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Remove HTML tags
# NOTE: an identical definition of remove_html appears again further down in
# this cell.
def remove_html(text):
    """Replace every ``<...>`` span (shortest match, no newlines inside) with a space."""
    return re.sub(r'<.*?>', ' ', text)

# Remove emoji
# Each range is listed explicitly. A single catch-all range running from
# U+24C2 to U+1F251 would also cover CJK ideographs, Hangul and most of the
# Basic Multilingual Plane, and would therefore delete ordinary non-Latin text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # emoji variation selector
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Replace each run of emoji characters in ``text`` with a single space.

    Non-emoji text, including CJK and other non-Latin scripts, is left as is.
    """
    return _EMOJI_PATTERN.sub(' ', text)

# Remove http links
def remove_http(text):
    """Replace ``http://``, ``https://`` and ``www.`` URLs with a single space.

    A URL extends up to the next whitespace character. The bare word "http"
    is not matched.
    """
    # Raw string: "\S" and "\." in a plain literal are invalid escape sequences.
    url_pattern = r"(https?://\S+|www\.\S+)"
    return re.sub(url_pattern, " ", text)

# Acronym lookup loaded from a local JSON file as a pandas Series
# (index = acronym, value = expansion text).
acronyms_url = './data/english_acronyms.json'
acronyms_dict = pd.read_json(acronyms_url, typ='series')
# Plain list of the known acronyms.
acronyms_list = acronyms_dict.index.tolist()

# remove html tags
# NOTE: remove_html is also defined earlier in this cell with the same body;
# this later definition is the one that stays bound to the name.
def remove_html(text):
    """Replace every ``<...>`` span (shortest match, no newlines inside) with a space."""
    tag_pattern = r'<.*?>'
    without_tags = re.sub(tag_pattern, ' ', text)
    return without_tags

# Remove image tags embedded in the text
def remove_image_tags(text):
    """Replace every ``!...!`` span with a single space.

    The match is the shortest run between two exclamation marks on the same
    line, so any text enclosed by two '!' characters is removed as well.
    """
    return re.sub(r'!.*?!', ' ', text)

# Remove {...} spans from the text
def remove_bracket(text):
    """Replace every ``{...}`` span (shortest match) with a single space."""
    brace_span = r'\{.*?\}'
    return re.sub(brace_span, ' ', text)

# Remove |...| spans from the text
def remove_table(text):
    """Replace every ``|...|`` span (shortest match) with a single space."""
    cell_span = r'\|.*?\|'
    return re.sub(cell_span, ' ', text)

# Remove *...* spans from the text
def remove_star(text):
    """Replace every ``*...*`` span (shortest match) with a single space."""
    star_span = r'\*.*?\*'
    return re.sub(star_span, ' ', text)

# Expand acronyms in a text
def convert_acronyms(text):
    """Replace each token that is a known acronym with its expansion.

    The text is tokenized with the shared ``regexp`` tokenizer. Tokens found
    in ``acronyms_dict`` are replaced by the words of their expansion, and all
    other tokens are kept. The result is joined with single spaces.
    """
    known_acronyms = acronyms_dict.index  # hashed lookup instead of a list scan
    words = []
    for word in regexp.tokenize(text):
        if word in known_acronyms:
            # extend in place; rebuilding the list on every token is quadratic
            words.extend(acronyms_dict[word].split())
        else:
            words.append(word)
    return " ".join(words)

# Contraction lookup loaded from a local JSON file as a pandas Series
# (index = contraction, value = expansion text).
contractions_url = './data/english_contractions.json'
contractions_dict = pd.read_json(contractions_url, typ='series')
# Plain list of the known contractions.
contractions_list = contractions_dict.index.tolist()

# Expand contractions in a text
def convert_contractions(text):
    """Replace each token that is a known contraction with its expansion.

    The text is tokenized with the shared ``regexp`` tokenizer. Tokens found
    in ``contractions_dict`` are replaced by the words of their expansion, and
    all other tokens are kept. The result is joined with single spaces.
    """
    known_contractions = contractions_dict.index  # hashed lookup instead of a list scan
    words = []
    for word in regexp.tokenize(text):
        if word in known_contractions:
            # extend in place; rebuilding the list on every token is quadratic
            words.extend(contractions_dict[word].split())
        else:
            words.append(word)
    return " ".join(words)

# Remove stop words
# Lazily filled cache so the stop-word corpus is read and the tokenizer is
# built once, not on every call (the function is applied to every row).
_STOPWORD_RESOURCES = {}


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split into word tokens (an apostrophe is kept only between
    word characters), and tokens whose lower-cased form is an NLTK English
    stop word are removed. The remaining tokens are joined with single
    spaces, so punctuation outside the tokens is dropped.
    """
    if not _STOPWORD_RESOURCES:
        _STOPWORD_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        # Tokenizer that handles contractions and punctuation.
        _STOPWORD_RESOURCES['tokenizer'] = RegexpTokenizer(r'\w+\'?\w+|\w+')

    stop_words = _STOPWORD_RESOURCES['stop_words']
    words = _STOPWORD_RESOURCES['tokenizer'].tokenize(text)

    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Stemming: reduce each word to its stem with the Porter stemmer
# (e.g. "running" -> "run").
stemmer = PorterStemmer()
def text_stemmer(text):
    """Return ``text`` with every ``regexp`` token replaced by its Porter stem."""
    stems = (stemmer.stem(token) for token in regexp.tokenize(text))
    return " ".join(stems)

# Lemmatization: map each token to its dictionary form using spaCy.
# The parser and NER pipeline components are disabled when the model is loaded.
spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
# A WordNet-based alternative is currently disabled:
#lemmatizer = WordNetLemmatizer()
def text_lemmatizer(text):
    """Return the spaCy lemmas of ``text`` joined with single spaces."""
    doc = spacy_lemmatizer(text)
    lemmas = [token.lemma_ for token in doc]
    #text_wordnet = " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(text)]) # regexp.tokenize(text)
    return " ".join(lemmas)

# Remove tokens that are not purely alphabetic
def discard_non_alpha(text):
    """Keep only the ``regexp`` tokens for which ``str.isalpha()`` is true."""
    alphabetic_tokens = filter(str.isalpha, regexp.tokenize(text))
    return " ".join(alphabetic_tokens)

# Filter words by part of speech: drop e.g. conjunctions and prepositions,
# keep adjectives, modals, nouns, foreign words, adverbs and verbs.
def keep_pos(text):
    """Return the tokens of ``text`` whose NLTK POS tag is in the keep list."""
    # Narrower, noun-only alternative:
    #keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
    keep_tags = {
        'JJ', 'JJR', 'JJS',                        # adjectives
        'MD',                                      # modal
        'NN', 'NNS', 'NNP', 'NNPS',                # nouns
        'FW',                                      # foreign word
        'RB', 'RBR', 'RBS',                        # adverbs
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',   # verbs
    }
    kept = []
    for token, tag in nltk.pos_tag(regexp.tokenize(text)):
        if tag in keep_tags:
            kept.append(token)
    return " ".join(kept)

# Additional stopwords
# Word lists that are concatenated into `additional_stops` below. The combined
# list contains duplicates, because several words appear in more than one list.
# Single letters.
alphabets = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
prepositions = ["about", "above", "across", "after", "against", "among", "around", "at", "before", "behind", "below", "beside", "between", "by", "down", "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on", "out", "over", "through", "to", "toward", "under", "up", "with"]
prepositions_less_common = ["aboard", "along", "amid", "as", "beneath", "beyond", "but", "concerning", "considering", "despite", "except", "following", "like", "minus", "onto", "outside", "per", "plus", "regarding", "round", "since", "than", "till", "underneath", "unlike", "until", "upon", "versus", "via", "within", "without"]
coordinating_conjunctions = ["and", "but", "for", "nor", "or", "so", "and", "yet"]
correlative_conjunctions = ["both", "and", "either", "or", "neither", "nor", "not", "only", "but", "whether", "or"]
# NOTE(review): the multi-word entries (e.g. "as if") contain spaces, while
# remove_additional_stopwords compares single tokens from the `regexp`
# tokenizer, so those entries can never match there.
subordinating_conjunctions = ["after", "although", "as", "as if", "as long as", "as much as", "as soon as", "as though", "because", "before", "by the time", "even if", "even though", "if", "in order that", "in case", "in the event that", "lest", "now that", "once", "only", "only if", "provided that", "since", "so", "supposing", "that", "than", "though", "till", "unless", "until", "when", "whenever", "where", "whereas", "wherever", "whether or not", "while"]
# Presumably mis-decoded character fragments seen in the data — TODO confirm.
others = ["ã", "å", "ì", "û", "ûªm", "ûó", "ûò", "ìñ", "ûªre", "ûªve", "ûª", "ûªs", "ûówe"]
# Project-specific filler words.
custom_words = ["thank", "hi", "hello", "regard", "issue", "please", "cc"]
additional_stops = alphabets + prepositions + prepositions_less_common + coordinating_conjunctions + correlative_conjunctions + subordinating_conjunctions + others + custom_words

def remove_additional_stopwords(text):
    """Drop every ``regexp`` token that appears in ``additional_stops``.

    The comparison is case-sensitive, and the surviving tokens are joined
    with single spaces.
    """
    kept = []
    for token in regexp.tokenize(text):
        if token in additional_stops:
            continue
        kept.append(token)
    return " ".join(kept)

def clean_text(text):
    """Drop lines that begin with '#' or '*' and join the rest with spaces.

    Lines are split on '\\n'. Only the very first character of a line is
    checked, so indented markers are kept. The joined result is stripped of
    leading and trailing whitespace.
    """
    kept_lines = []
    for line in text.split('\n'):
        if line.startswith('#') or line.startswith('*'):
            continue
        kept_lines.append(line)
    return ' '.join(kept_lines).strip()

def clean_log(text):
    """Drop lines that start with a ``YYYY-MM-DD HH:MM:SS`` timestamp.

    Lines are split on '\\n'. The remaining lines are joined with single
    spaces and the result is stripped of leading and trailing whitespace.
    """
    timestamp_prefix = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    kept_lines = []
    for line in text.split('\n'):
        if timestamp_prefix.match(line) is None:
            kept_lines.append(line)
    return ' '.join(kept_lines).strip()

def remove_comment_header(text):
    """Replace ``<timestamp>;<id>`` comment headers with a single space.

    The timestamp has the form ``dd/Mon/yy h:mm AM|PM``. The id after the
    semicolon is a run of hex digits, ':', '.' and '-'.
    """
    header = r'(\d{2}/[A-Za-z]{3}/\d{2} \d{1,2}:\d{2} [APM]+);([0-9a-fA-F:.-]+)'
    return re.sub(header, ' ', text)


In [13]:
# Normalize a summary (title)
def summary_normalizer(text):
    """Clean an issue summary.

    Steps, in order: strip, remove stop words, remove purely numeric tokens,
    strip again, then collapse runs of spaces to one space.

    Currently disabled steps: convert_to_lowercase, remove_punctuation,
    discard_non_alpha, keep_pos, remove_additional_stopwords.
    """
    active_steps = (
        strip_text,
        remove_stopwords,
        remove_number_token,
        strip_text,
    )
    for step in active_steps:
        text = step(text)
    return re.sub(' +', ' ', text)

# Normalize a description (body)
def description_normalizer(text):
    """Clean an issue description.

    Steps, in order: strip; turn newlines into spaces (one-line text); remove
    URLs, ``!...!`` image tags, ``{...}`` spans, HTML tags and emoji; remove
    stop words and purely numeric tokens; collapse runs of spaces; strip.

    Currently disabled steps: convert_to_lowercase, square-bracket removal,
    remove_table, remove_star, remove_punctuation, convert_acronyms,
    convert_contractions, text_lemmatizer, discard_non_alpha, keep_pos,
    remove_additional_stopwords.
    """
    text = strip_text(text).replace('\n', ' ')
    active_steps = (
        remove_http,
        remove_image_tags,
        remove_bracket,
        remove_html,
        remove_emoji,
        remove_stopwords,
        remove_number_token,
    )
    for step in active_steps:
        text = step(text)
    single_spaced = re.sub(' +', ' ', text)
    return strip_text(single_spaced)

In [14]:
# Clean the title and body of every issue, then concatenate them (separated
# by one space) into the text that is embedded later.
normalizers = {
    'normalized_summary': ('Summary', summary_normalizer),
    'normalized_description': ('Description', description_normalizer),
}
for target_column, (source_column, normalizer) in normalizers.items():
    data[target_column] = data[source_column].apply(normalizer)

data['normalized_text'] = data['normalized_summary'] + ' ' + data['normalized_description']

In [15]:
# Minimum number of issues an assignee needs in order to be kept.
MIN_ISSUES_PER_ASSIGNEE = 50

assignee_counts = data['Assignee Id'].value_counts()
# Assignee Ids with at least MIN_ISSUES_PER_ASSIGNEE issues.
filtered_assignees = assignee_counts[assignee_counts >= MIN_ISSUES_PER_ASSIGNEE].index
# Keep only the rows of those assignees. Take an explicit copy, because new
# columns are assigned to filtered_data later on.
filtered_data = data[data['Assignee Id'].isin(filtered_assignees)].copy()

In [16]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

In [None]:
# Documents to embed, in the row order of filtered_data.
texts = filtered_data['normalized_text'].to_list()
# Preview a few documents only, so the full corpus is not written to the cell output.
texts[:5]

In [None]:
# Encode every document with a pretrained sentence-transformer model.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = sentence_model.encode(
    texts,
    show_progress_bar=True,
)

In [26]:
# Train BERTopic
# UMAP is stochastic, so pass an explicitly seeded UMAP model; without a fixed
# random_state the topics change from run to run. The other UMAP arguments
# repeat BERTopic's documented defaults.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
topic_model = BERTopic(
    n_gram_range=(1, 3),
    min_topic_size=5,
    nr_topics=30,
    umap_model=umap_model,
)
# Fit on the documents, reusing the precomputed sentence embeddings.
topics, probs = topic_model.fit_transform(texts, embeddings)
#topic_model.reduce_topics(texts, nr_topics=15)

In [None]:
# Inspect the topics produced by the fitted model.
topic_model.get_topic_info()

In [28]:
def _top_probability(prob):
    """Return the largest entry of a list/array, or ``prob`` itself otherwise."""
    if isinstance(prob, (list, np.ndarray)):
        return max(prob)
    return prob


# Attach the assigned topic and its probability to the DataFrame.
filtered_data['Topic'] = topics
filtered_data['Prob'] = [_top_probability(prob) for prob in probs]

In [29]:
# Export the raw and normalized text with the assigned topic for inspection.
# The file is written under the project's ./data directory, not to a
# user-specific absolute path, so the cell runs on any machine.
export_columns = ['Assignee Id', 'Summary', 'normalized_summary', 'Description', 'normalized_description', 'Topic']
export_path = os.path.join("./data", "data.csv")
filtered_data[export_columns].to_csv(export_path, index=False)

In [None]:
# Run the visualization with the original embeddings
# (the same `embeddings` array that was passed to fit_transform).
topic_model.visualize_documents(texts, embeddings=embeddings)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count the issues of each assignee in each topic
# (rows = Assignee Id, columns = Topic, missing combinations = 0).
assignee_topic_counts = filtered_data.groupby(['Assignee Id', 'Topic']).size()
topic_distribution = assignee_topic_counts.unstack(fill_value=0)

# Draw the counts as an annotated heatmap with seaborn.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(topic_distribution, cmap="YlGnBu", annot=True, fmt='d', ax=ax)
ax.set_title("Topic Distribution for Each Assignee")
ax.set_xlabel("Topic")
ax.set_ylabel("Assignee Id")
plt.show()

In [None]:
# (This cell used to evaluate the bare name `x`, which is never defined in the
# notebook and raised NameError on Restart & Run All; nothing to execute here.)

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [25]:
# 2. Predict the assignee with an SVM
# Features are the sentence embeddings; the label is the assignee of each row.
X = embeddings
y = filtered_data['Assignee Id']

# Encode the assignee ids as integer class labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit(y).transform(y)

# Stratified 80/20 train/test split, so every assignee keeps the same share
# in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

In [None]:
# Train the SVM model (linear kernel, with probability estimates enabled).
svm_options = {'kernel': 'linear', 'probability': True}
svm_model = SVC(**svm_options)
svm_model.fit(X_train, y_train)

In [None]:
# Evaluate the model: mean accuracy on the training and on the test split.
train_score, test_score = (
    svm_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"Train Accuracy: {train_score}")
print(f"Test Accuracy: {test_score}")

In [29]:
#filtered_data[['Summary','Description','Issue id', 'Assignee Id', 'normalized_summary','normalized_description', 'Topic']].to_csv('/home/ryan/Downloads/data.csv', index=False)

In [None]:
from scipy.stats import chi2_contingency

# Cross-tabulate Assignee against Reporter. Both columns come from
# filtered_data, so the two series cover exactly the same rows.
contingency_table = pd.crosstab(filtered_data['Assignee Id'], filtered_data['Reporter Id'])

# Chi-squared test of independence.
chi2_stat, p, dof, expected = chi2_contingency(contingency_table)

# Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
n = contingency_table.to_numpy().sum()  # total number of samples
min_dim = min(contingency_table.shape) - 1
# With a single row or a single column the statistic is undefined; report NaN
# and avoid dividing by zero.
cramers_v = np.sqrt(chi2_stat / (n * min_dim)) if min_dim > 0 else np.nan

print(f"Cramér's V: {cramers_v}")