In [1]:
import numpy as np
import pandas as pd
import csv
import os
import glob

In [2]:
def analysis(version_i_path, version_i_1_path, output_path, write_head,
             version_i, version_i_1, analysis_type):
    # read the version i and version i + 1
    df1 = pd.read_csv(version_i_path, dtype=str)
    df2 = pd.read_csv(version_i_1_path, dtype=str)
    
    primary_key1 = np.array(df1.iloc[:, 0])
    primary_key2 = np.array(df2.iloc[:, 0])
    
    # store in dictionary
    dic = {}
    dic2 = {}
    version_1_number = 0
    version_2_number = 0
    
    for i in primary_key1:  # create a dictionary for primary key. (Variable names might be repetitive)
        if i in dic:
            dic[i] += 1
        else:
            dic[i] = 1
        version_1_number += 1
        
    for i in primary_key2:     
        if i in dic2:
            dic2[i] += 1
        else:
            dic2[i] = 1
        version_2_number += 1
        
    data = []  # count how many other attributes are different
    
    col_data = []  # record the columnn names
    col_data.append("Comparision between Version i and Version j")
    
    # fields
    if analysis_type == "fields":
        col_data.append("Version i Total Variables")
        col_data.append("Version j Total Variables")
        col_data.append("Same Variables Number")
        col_data.append("Added Variables Number")
        col_data.append("Deleted Variables Number")
        for i in range(1, len(df1.columns)):
            col_data.append("Same Variable But '" + str(df1.columns[i]) + "' Is Different")
    # signatures
    elif analysis_type == "signatures":
        col_data.append("Version i Total signatures")
        col_data.append("Version j Total signatures")
        col_data.append("Same Signatures Number")
        col_data.append("Added Signatures Number")
        col_data.append("Deleted Signatures Number")
        for i in range(1, len(df1.columns)):
            col_data.append("Same Signature But Different " + str(df1.columns[i]))
    
    # counting for annotations
    col_data.append("Added Annotations Number")
    col_data.append("Changed Annotations Number Both Annotations Are Not None")
    col_data.append("Added Comments Number")
    col_data.append("Changed Comments Number Both Comments Are Not None")
    
    added_annotations = 0
    changed_annotations = 0
    added_comments = 0
    changed_comments = 0
    
    count_added = 0  # count added number
    count_deleted = 0  # count deleted number
    same_count = 0  # count the same number
    same_keys = []  # we get the same primary key
    for i in dic2:
        if i not in dic:
            count_added += dic2[i]  # calculate the added signatures
        else:
            same_count += min(dic[i], dic2[i])
            same_keys.append(i)
            
            if dic[i] > dic2[i]:
                count_deleted += dic[i] - dic2[i]
            if dic[i] < dic2[i]:
                count_added += dic2[i] - dic[i]
            
    for i in dic:
        if i not in dic2:
            count_deleted += dic[i]  # calculate the deleted signatures
    
    # necessary corresponding information
    data.append((version_i, version_i_1))
    data.append(version_1_number)
    data.append(version_2_number)
    data.append(same_count)
    data.append(count_added)
    data.append(count_deleted)
    
    col = len(df1.columns)  # 6 or 4
    
    for i in range(1, col):
        col_name = df1.columns[i]
        cnt = 0
        for key in same_keys:
            row1 = 0
            row2 = 0

            row1 = df1[df1[df1.columns[0]] == key].index[0]  # get the corresponding row number for the same key
            row2 = df2[df2[df2.columns[0]] == key].index[0]
            
            str1 = str(df1.loc[row1, col_name])
            str2 = str(df2.loc[row2, col_name])
            
            # Both are empty, no influence.
            if str1 == "nan" and str2 == "nan":
                continue;
                
            # print(str1, str2)

            # count the difference
            if str1 != str2:
                cnt += 1
                
                # count the added and changed annotations number
                if str(col_name).endswith("Annotation"):
                    if str1 == "nan" and str2 != "nan":
                        added_annotations += 1
                    elif str1 != "nan" and str2 != "nan":
                        changed_annotations += 1
                
                # count the added and changed comments number
                if str(col_name).endswith("Comment"):
                    if str1 == "nan" and str2 != "nan":
                        added_comments += 1
                    elif str1 != "nan" and str2 != "nan":
                        changed_comments += 1

        data.append(cnt)
        
    # add the annotations statistics
    data.append(added_annotations)
    data.append(changed_annotations)
    data.append(added_comments)
    data.append(changed_comments)
    
    with open(output_path, mode="a", newline='') as file:
        writer = csv.writer(file)
        if write_head == True:
            writer.writerow(col_data)
            
        writer.writerow(data)
    
    

In [4]:
def compare(directory_path, output_path1, output_path2):
    csvfiles = glob.glob(os.path.join(directory_path, "*.csv"))

    for i in range(len(csvfiles) - 2):
        file1 = csvfiles[i]
        file2 = csvfiles[i + 2]
        version_i = int(csvfiles[i].split("\\")[-1][:2])  # get the version number
        print(version_i)

        # fields
        if i % 2 == 0:
            if i == 0:
                analysis(file1, file2, output_path1, True, version_i, version_i + 1, "fields")
            else:
                analysis(file1, file2, output_path1, False, version_i, version_i + 1, "fields")
        # methods
        else:
            if i == 1:
                analysis(file1, file2, output_path2, True, version_i, version_i + 1, "signatures")
            else:
                analysis(file1, file2, output_path2, False, version_i, version_i + 1, "signatures")

files_path = "C:\\Users\\gtc\\Desktop\\AOSPRetrievedCSVFilesComparison\\AOSPRetrievedCSVFilesComparison"
output_path1 = "C:\\Users\\gtc\\Desktop\\AOSPRetrievedCSVFilesComparison\\AOSPRetrievedCSVFilesComparison\\statistics_field.csv"
output_path2 = "C:\\Users\\gtc\\Desktop\\AOSPRetrievedCSVFilesComparison\\AOSPRetrievedCSVFilesComparison\\statistics_signature.csv"
compare(files_path, output_path1, output_path2)

4
4
5
5
6
6
7
7
8
8
9
9
10
10
13
13
14
14
15
15
16
16
17
17
18
18
19
19
20
20
21
21
22
22
23
23
24
24
25
25
26
26
27
27
28
28
29
29
30
30
31
31
32
32
33


In [None]:
a = [1, 2, 3, 4, 5, 5, "a", "a", "nb"]
print(set(a))

In [9]:
# run this, change the path, get the 2 csv files.

file1 = "C:\\Users\\gtc\\Desktop\\AOSPRetrievedCSVFilesComparison\\AOSPRetrievedCSVFilesComparison\\28outputMethods.csv"
file2 = "C:\\Users\\gtc\\Desktop\\AOSPRetrievedCSVFilesComparison\\AOSPRetrievedCSVFilesComparison\\29outputMethods.csv"
output_path1 = "C:\\Users\\gtc\\Desktop\\res1.csv"
output_path2 = "C:\\Users\\gtc\\Desktop\\res2.csv"
# read the version i and version i + 1
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

primary_key1 = np.array(df1.iloc[:, 0])
primary_key2 = np.array(df2.iloc[:, 0])

# store in dictionary
dic = {}
dic2 = {}
for i in primary_key1:  # create a dictionary for primary key. (Variable names might be repetitive)
    if i in dic:
        dic[i] += 1
    else:
        dic[i] = 1
for i in primary_key2:
    if i in dic2:
        dic2[i] += 1
    else:
        dic2[i] = 1

same_keys = []  # we get the same primary key
for i in set(primary_key2):
    if i not in dic:
        continue
    else:
        same_keys.append(i)
        
for i in range(1, 6):
    col_name = df1.columns[i]
    cnt = 0
    for key in same_keys:
        row1 = 0
        row2 = 0

        row1 = df1[df1[df1.columns[0]] == key].index[0]  # get the corresponding row number for the same key
        row2 = df2[df2[df2.columns[0]] == key].index[0]

        str1 = str(df1.loc[row1, col_name])
        str2 = str(df2.loc[row2, col_name])

        # count the difference
        if str1 != str2 and str1 != 'nan' and  str2 != 'nan':
            data1 = []
            for i in range(6):
                data1.append(df1.loc[row1].iloc[i])
            with open(output_path1, mode="a", newline='') as file:
                writer = csv.writer(file)
                writer.writerow(data1)
                
            data2 = []
            for i in range(6):
                data2.append(df2.loc[row2].iloc[i])
            with open(output_path2, mode="a", newline='') as file:
                writer = csv.writer(file)
                writer.writerow(data2)
            break


KeyboardInterrupt: 

In [51]:
target_function = "setErrorCallback"
df1['Method Body'] = df1['Method Body'].fillna('')
result = df1.loc[df1['Method Body'].str.contains(target_function)]
if not result.empty:
    print(result['Method Signature'].iloc[2])
else:
    print("No matching rows found.")

No matching rows found.


Here is the comparision but the primary key is method instead of signature.

In [2]:
def analysis(version_i_path, version_i_1_path, output_path, write_head,
             version_i, version_i_1, analysis_type):
    # read the version i and version i + 1
    df1 = pd.read_csv(version_i_path)
    df2 = pd.read_csv(version_i_1_path)
    
    # swap the primary key to the first column
    if analysis_type == "signatures":
        df1[['Method Signature', 'Method Full Name']] = df1[['Method Full Name', 'Method Signature']]
        df2[['Method Signature', 'Method Full Name']] = df2[['Method Full Name', 'Method Signature']]
    
    primary_key1 = np.array(df1.iloc[:, 0])
    primary_key2 = np.array(df2.iloc[:, 0])
    
    # store in dictionary
    dic = {}
    dic2 = {}
    version_1_number = 0
    version_2_number = 0
    for i in primary_key1:  # create a dictionary for primary key. (Variable names might be repetitive)
        if i in dic:
            dic[i] += 1
        else:
            dic[i] = 1
        version_1_number += 1
    for i in primary_key2:
        if i in dic2:
            dic2[i] += 1
        else:
            dic2[i] = 1
        version_2_number += 1
        
    data = []  # count how many other attributes are different
    
    col_data = []  # record the columnn names
    col_data.append("Comparision between Version i and Version j")
    
    # fields
    if analysis_type == "fields":
        col_data.append("Version i Total Variables")
        col_data.append("Version j Total Variables")
        col_data.append("Same Variables Number")
        col_data.append("Added Variables Number")
        col_data.append("Deleted Variables Number")
        for i in range(1, len(df1.columns)):
            col_data.append("Same Variable But Different " + str(df1.columns[i]))
    # signatures
    elif analysis_type == "signatures":
        col_data.append("Version i Total signatures")
        col_data.append("Version j Total signatures")
        col_data.append("Same Signatures Number")
        col_data.append("Added Signatures Number")
        col_data.append("Deleted Signatures Number")
        for i in range(1, len(df1.columns)):
            col_data.append("Same Signature But Different " + str(df1.columns[i]))
    
    # counting for annotations
    col_data.append("Added Annotations Number")
    col_data.append("Changed Annotations Number Both Annotations Are Not None")
    col_data.append("Added Comments Number")
    col_data.append("Changed Comments Number Both Comments Are Not None")
    
    added_annotations = 0
    changed_annotations = 0
    added_comments = 0
    changed_comments = 0
    
    count_added = 0  # cound added number
    count_deleted = 0  # count deleted number
    same_count = 0  # count the same number
    same_keys = []  # we get the same primary key
    for i in set(primary_key2):
        if i not in dic:
            count_added += dic2[i]  # calculate the added signatures
        else:
            same_count += min(dic[i], dic2[i])
            same_keys.append(i)
            if dic[i] > dic2[i]:
                count_deleted += dic[i] - dic2[i]
            if dic[i] < dic2[i]:
                count_added += dic2[i] - dic[i]
            
    for i in set(primary_key1):
        if i not in dic2:
            count_deleted += dic[i]  # calculate the deleted signatures
    
    # necessary corresponding information
    data.append((version_i, version_i_1))
    data.append(version_1_number)
    data.append(version_2_number)
    data.append(same_count)
    data.append(count_added)
    data.append(count_deleted)
    
    col = len(df1.columns)  # 6 or 4
    
    for i in range(1, col):
        col_name = df1.columns[i]
        cnt = 0
        for key in same_keys:
            row1 = 0
            row2 = 0

            row1 = df1[df1[df1.columns[0]] == key].index[0]  # get the corresponding row number for the same key
            row2 = df2[df2[df2.columns[0]] == key].index[0]
            
            str1 = str(df1.loc[row1, col_name])
            str2 = str(df2.loc[row2, col_name])

            # count the difference
            if str1 != str2:
                cnt += 1
                
                # count the added and changed annotations number
                if str(col_name).endswith("Annotation"):
                    if str1 == "nan" and str2 != "nan":
                        added_annotations += 1
                    elif str1 != "nan" and str2 != "nan":
                        changed_annotations += 1
                
                # count the added and changed comments number
                if str(col_name).endswith("Comment"):
                    if str1 == "nan" and str2 != "nan":
                        added_comments += 1
                    elif str1 != "nan" and str2 != "nan":
                        changed_comments += 1

        data.append(cnt)
        
    # add the annotations statistics
    data.append(added_annotations)
    data.append(changed_annotations)
    data.append(added_comments)
    data.append(changed_comments)
    
    with open(output_path, mode="a", newline='') as file:
        writer = csv.writer(file)
        if write_head == True:
            writer.writerow(col_data)
            
        writer.writerow(data)

        
def compare_methodkey(directory_path, output_path1, output_path2):
    csvfiles = glob.glob(os.path.join(directory_path, "*.csv"))

    for i in range(len(csvfiles) - 2):
        file1 = csvfiles[i]
        file2 = csvfiles[i + 2]
        version_i = int(csvfiles[i].split("\\")[-1][:2])  # get the version number
        print(version_i)

        # fields
        if i % 2 == 0:
            if i == 0:
                analysis(file1, file2, output_path1, True, version_i, version_i + 1, "fields")
            else:
                analysis(file1, file2, output_path1, False, version_i, version_i + 1, "fields")
        # methods
        else:
            if i == 1:
                analysis(file1, file2, output_path2, True, version_i, version_i + 1, "signatures")
            else:
                analysis(file1, file2, output_path2, False, version_i, version_i + 1, "signatures")

In [3]:
# main 
directory_path = "C:\\Users\\gtc\\Desktop\\AOSPRetrievedCSVFilesComparison\\AOSPRetrievedCSVFilesComparison"  # a path to all csv files you want to compare
output_path1 = "C:\\Users\\gtc\\Desktop\\statistics_fields_methodkey.csv"  # field results path
output_path2 = "C:\\Users\\gtc\\Desktop\\statistics_signature_methodkey.csv"  # signature results path
# compare(directory_path, output_path1, output_path2)
compare_methodkey(directory_path, output_path1, output_path2)

4
4
5
5
6
6
7
7
8
8
9
9
10
10
13
13
14
14
15
15
16
16
17
17
18
18
19
19
20
20
21
21
22
22
23
23
24
24
25
25
26
26
27
27
28
28
29
29
30
30
31
31
32
32
33
33


KeyError: "None of [Index(['Method Full Name', 'Method Signature'], dtype='object')] are in the [columns]"

In [46]:
df_tmp = pd.DataFrame({'A': ['hello', 'int add(int a, int b) {return a + b;}',
                             'int add(int a, int b, int c) {return a + b + c;}',
                             'hello', 'int add(int a, int b) {return a + b;}']})
print(df_tmp['A'].count())
print(df_tmp['A'].nunique())

5
3


In [49]:
# int cound_same_method_name_number(file_path):
#     df = pd.read_csv(file_path)

#     print(df['Method Full Name'].count())
#     print(df['Method Full Name'].nunique())

#     for i in range(df['Method Full Name'].count()):
#         for j in range(i + 1, df['Method Full Name'].count()):
#             if df['Method Full Name'][i] == df['Method Full Name'][j]:
#                 print(i, j)
#                 print(df['Method Full Name'][i], df['Method Full Name'][j])
    

# An example for version i, how many same method name we have
file_path = "C:\\Users\\gtc\\Desktop\\AOSPRetrievedCSVFilesComparison\\AOSPRetrievedCSVFilesComparison"

for files in os.listdir(file_path):
    if files.endswith("outputMethods.csv"):
        # print(files)
        full_path = os.path.join(file_path, files)
        df = pd.read_csv(full_path)
        tot = df["Method Full Name"].count()
        uni = df["Method Full Name"].nunique()
        # print(f'version: {files[:2]}, total: {tot}, unique: {uni}, minues: {tot - uni}')

In [59]:
# search for empty method body
file_path = "C:\\Users\\gtc\\Desktop\\AOSPRetrievedCSVFilesComparison\\AOSPRetrievedCSVFilesComparison\\13outputMethods.csv"

# for files in os.listdir(file_path):
#     if files.endswith("outputMethods.csv"):
        # print(files)
# full_path = os.path.join(file_path, files)
df = pd.read_csv(file_path)
cnt_init = 0
cnt_other = 0
cnt = 0
for i in range(df['Method Body'].count()):
    if pd.isna(df['Method Body'][i]):
        cnt += 1
        if "<init>" in df['Method Signature'][i]:
            cnt_init += 1
        else:
            cnt_other += 1
        # print(df['Method Signature'][i], df['Method Class'][i])
        
print(cnt_init, cnt_other, cnt, df['Method Body'].count())

805 621 1426 11506
