# urgent satd characterization process

## 1.按照紧急与非紧急的survival time分别随机取出50条数据

In [1]:
import pandas as pd

# Load the full dataset (adjust the path to point at your CSV file).
df = pd.read_csv('output_file_gai.csv')

# Split the rows by survive_time: exactly 0 versus 1 or more.
df_0 = df.loc[df['survive_time'] == 0]
df_1 = df.loc[df['survive_time'] >= 1]

# Draw 50 random rows from each group; the fixed random_state makes the
# draw reproducible across runs.
sample_0 = df_0.sample(n=50, random_state=42)
sample_1 = df_1.sample(n=50, random_state=42)

# Write both samples to their own CSV files, leaving out the index column.
for _sample, _path in ((sample_0, 'sample_0.csv'), (sample_1, 'sample_1.csv')):
    _sample.to_csv(_path, index=False)

## 2.提取特征
- Churn Workload：改动的代码量。若函数修改前后的改动比例超过50%，则认为不太容易修改；未超过50%，则认为容易修改
- Commit Goal: SATD注释是否清晰，目标明确
- Working on release: 离上一次发布的版本的时间
- Project Startup: 离项目刚开始的时间
- Commit workload: 提交次数，代表开发人员做出的不同贡献的数量
- Tenure：作者关于这个项目的经验

#### Churn Workload

使用基于token的代码相似度分析

- 先将function_before 和 function_after 的代码转换成token序列

In [2]:
import pandas as pd
import io
import tokenize

# Load the two sample files (replace the file names with your own if needed).
df_0, df_1 = (pd.read_csv(path) for path in ('sample_0.csv', 'sample_1.csv'))


def tokenize_code(code):
    """
    Convert a source-code string into a list of token strings.

    The text is scanned with Python's ``tokenize`` module. Input that the
    tokenizer rejects never raises: the tokens collected before the failure
    are kept and an ``"<ERROR>"`` marker is appended after them.

    Args:
      code: Source code as a string. ``None`` is treated as empty text. Any
        other non-string value (for example a float NaN) is reported as a
        tokenization error.

    Returns:
      List of token strings, ending with ``"<ERROR>"`` if tokenization failed.
    """
    # io.StringIO only accepts str or None; report anything else as an error
    # instead of letting a TypeError abort the caller's whole .apply().
    if code is not None and not isinstance(code, str):
        print(f"Tokenization error for code: {code}")
        return ["<ERROR>"]

    tokens = []
    try:
        # generate_tokens needs a readline callable, so wrap the string in a
        # file-like object.
        code_io = io.StringIO(code)
        for token in tokenize.generate_tokens(code_io.readline):
            tokens.append(token.string)
    except (tokenize.TokenError, SyntaxError):
        # TokenError covers unterminated constructs; SyntaxError also covers
        # IndentationError, which the tokenizer raises on inconsistent dedent.
        print(f"Tokenization error for code: {code}")
        tokens.append("<ERROR>")  # marker so failed rows can be identified later
    return tokens

In [3]:
# Tokenize the before/after function bodies of both samples and store the
# resulting token lists in new "*_tokenized" columns.
_column_pairs = (
    ('function_before', 'function_before_tokenized'),
    ('function_after', 'function_after_tokenized'),
)
for _frame in (df_0, df_1):
    for _source, _target in _column_pairs:
        _frame[_target] = _frame[_source].apply(tokenize_code)

# Show the first rows of each sample, raw code next to its tokens.
_preview_columns = ['function_before', 'function_before_tokenized', 'function_after', 'function_after_tokenized']
print(df_0[_preview_columns].head())
print(df_1[_preview_columns].head())

# Write the enriched frames back over the sample files (change the file
# names here to keep the originals untouched).
df_0.to_csv('sample_0.csv', index=False)
df_1.to_csv('sample_1.csv', index=False)

Tokenization error for code: public final void operationComplete(ChannelFuture f) throws Exception {
if (f.isSuccess()) {
Channel channel = f.getChannel();
channel.getPipeline().getContext(NettyAsyncHttpProvider.class).setAttachment(future);
SslHandler sslHandler = (SslHandler) channel.getPipeline().get(NettyAsyncHttpProvider.SSL_HANDLER);
if (!handshakeDone.getAndSet(true) && (sslHandler != null)) {
((SslHandler) channel.getPipeline().get(NettyAsyncHttpProvider.SSL_HANDLER)).handshake().addListener(this);
return;
}
HostnameVerifier v = config.getHostnameVerifier();
if (sslHandler != null && !AllowAllHostnameVerifier.class.isAssignableFrom(v.getClass())) {
// TODO: channel.getRemoteAddress()).getHostName() is very expensive. Should cache the result.
if (!v.verify(InetSocketAddress.class.cast(channel.getRemoteAddress()).getHostName(),
                        sslHandler.getEngine().getSession())) {
throw new ConnectException("HostnameVerifier exception.");
}
}
future.provider().writeRequ

使用 **余弦相似度** 来衡量两个token序列之间的相似度。

若相似度超过50%，说明 churn workload 不大、容易修改，则认为是 urgent_satd；

反之，则认为是 not_urgent_satd。

- 读取 sample_0.csv 和 sample_1.csv 文件。
- 使用 TfidfVectorizer 将token序列转换为数值向量。
- 计算 function_before_tokenized 和 function_after_tokenized 列对应的向量之间的余弦相似度。
- 将相似度分数添加到 DataFrame 中。
- 将结果写回 sample_0.csv 和 sample_1.csv 文件。

In [4]:
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def _tokens_to_text(cell):
    """Turn the CSV text form of a token list into one space-joined string."""
    # literal_eval only accepts Python literals, so unlike eval() it cannot
    # execute code that happens to be embedded in the CSV.
    return ' '.join(ast.literal_eval(cell))


# Reload the samples (replace the file names with your own if needed). The
# tokenized columns come back from CSV as the text form of Python lists.
df_0 = pd.read_csv('sample_0.csv')
df_1 = pd.read_csv('sample_1.csv')

# Initialise the TF-IDF vectorizer.
# NOTE(review): with default settings TfidfVectorizer appears to re-tokenize
# the joined text itself (lowercasing, dropping punctuation and one-character
# tokens) - confirm that is intended for a token-based similarity.
vectorizer = TfidfVectorizer()

# TfidfVectorizer expects strings, so replace each token list with its
# space-joined text.
# NOTE: this overwrites the token-list columns, and the overwritten columns
# are saved below. Re-running this cell on the saved files therefore fails
# to parse; re-run the tokenization step first.
for _frame in (df_0, df_1):
    for _column in ('function_before_tokenized', 'function_after_tokenized'):
        _frame[_column] = _frame[_column].apply(_tokens_to_text)

# One corpus per sample: all "before" documents followed by all "after" ones.
code_list_0 = df_0['function_before_tokenized'].tolist() + df_0['function_after_tokenized'].tolist()
code_list_1 = df_1['function_before_tokenized'].tolist() + df_1['function_after_tokenized'].tolist()

# Compute the TF-IDF matrices. The vectorizer is refit for each sample, so
# each sample gets its own vocabulary and IDF weights.
tfidf_matrix_0 = vectorizer.fit_transform(code_list_0)
tfidf_matrix_1 = vectorizer.fit_transform(code_list_1)

# Cosine similarity of every "before" row against every "after" row; the
# first len(df) rows of each matrix are the "before" documents.
similarity_scores_0 = cosine_similarity(tfidf_matrix_0[:len(df_0)], tfidf_matrix_0[len(df_0):])
similarity_scores_1 = cosine_similarity(tfidf_matrix_1[:len(df_1)], tfidf_matrix_1[len(df_1):])

# The diagonal pairs row i's "before" with row i's "after".
df_0['similarity'] = similarity_scores_0.diagonal()
df_1['similarity'] = similarity_scores_1.diagonal()

# Show the joined token text next to its similarity score.
print(df_0[['function_before_tokenized', 'function_after_tokenized', 'similarity']])
print(df_1[['function_before_tokenized', 'function_after_tokenized', 'similarity']])

# Write the frames, now including the similarity column, back over the
# sample files.
df_0.to_csv('sample_0.csv', index=False)
df_1.to_csv('sample_1.csv', index=False)

                            function_before_tokenized  \
0   @ Test public void bodySuccess404 ( ) { \n ser...   
1   @ Override \n public void onRayoEvent ( JID fr...   
2   @ EventHandler \n public void onBlockRedstoneC...   
3   @ Test \n public void testUpdateRequest ( ) th...   
4   @ TargetApi ( Build . VERSION_CODES . O ) \n @...   
5   @ Test \n @ Ignore \n // TODO fix me \n public...   
6   private Container createTopPanel ( ) { \n Box ...   
7   private void readGlobalMetaData ( File inFile ...   
8   @ Test \n public void testLifecycle ( ) throws...   
9   public void configure ( Configuration configur...   
10  public void testCopy ( ) { \n DefaultExchange ...   
11  public void writeBinaryTransportIndex ( Binary...   
12  private void buildSequence ( ) { \n String seq...   
13  @ Override \n protected final void run ( Servi...   
14  public void launchNewConfigDialoge ( ) { \n wi...   
15  @ Override \n public void onApplicationEvent (...   
16  protected void onBeginReque

Mann-Whitney U 显著性检验

In [5]:
from scipy.stats import mannwhitneyu

# Reload both samples; they must already contain the 'similarity' column.
df_0 = pd.read_csv('sample_0.csv')
df_1 = pd.read_csv('sample_1.csv')

# Extract the similarity scores of each group.
similarity_0 = df_0['similarity']
similarity_1 = df_1['similarity']

# Two-sided Mann-Whitney U test. The alternative is stated explicitly because
# the conclusion printed below is about "any difference", and older SciPy
# releases used a different default when it was omitted.
statistic, p_value = mannwhitneyu(similarity_0, similarity_1, alternative='two-sided')

# Report the test statistic and p-value.
print('Mann-Whitney U检验结果:')
print(f'统计量: {statistic}')
print(f'p值: {p_value}')

# Compare the p-value against the significance level.
alpha = 0.05
if p_value < alpha:
    print('拒绝原假设，两组数据的分布存在显著差异。')
else:
    print('无法拒绝原假设，两组数据的分布不存在显著差异。')

Mann-Whitney U检验结果:
统计量: 1346.0
p值: 0.5103069616303375
无法拒绝原假设，两组数据的分布不存在显著差异。
