In [None]:
import pandas as pd
import os

# Read the CSV with pandas' python parsing engine.
df = pd.read_csv("/content/LLM_message.csv", engine="python")

# Strip any hidden spaces from column names so the lookups below match.
df.columns = df.columns.str.strip()

# Total commits & files (nunique counts distinct non-missing values).
print("Total number of commits: ", df["Hash"].nunique())
print("Total number of files: ", df["Filename"].nunique())

# Average modified files per commit: distinct filenames per hash, then the mean.
fpc = df.groupby("Hash")["Filename"].nunique()
mean = fpc.mean()
print("Average modified files per commit: ", mean)

# Distribution of fix types (skipped when the column is absent).
if "LLM Inference" in df.columns:
    print("Distribution of fix types: ")
    print(df["LLM Inference"].value_counts())
    print()

# Most frequently modified files
print("Most frequently modified files: ")
print(df["Filename"].value_counts().head(1))
print()

# Extract the extension of every filename; missing filenames map to "".
df["Extension"] = df["Filename"].apply(
    lambda x: os.path.splitext(str(x))[1] if pd.notnull(x) else ""
)

# Rows without a filename (and names without a suffix) all carry the ""
# extension. Rank only the non-empty ones so the answer is a real extension
# instead of the blank placeholder.
known_extensions = df.loc[df["Extension"] != "", "Extension"]

print("Most frequently modified extension: ")
print(known_extensions.value_counts().head(1))


Total number of commits:  10631
Total number of files:  499
Average modified files per commit:  0.2073182202991252
Distribution of fix types: 
LLM Inference
add missing docstrings                          51
add missing plist entries                       50
add missing docstring                           39
add missing import                              37
add missing comment                             34
                                                ..
add parse_call_block and parse_filter blocks     1
improve code generation in frame.py              1
add docstrings for staticloopcontext             1
add docs for tag_rules                           1
add more comments and comments to lexers         1
Name: count, Length: 694, dtype: int64

Most frequently modified files: 
Filename
self.filename)    424
Name: count, dtype: int64

Most frequently modified extension: 
Extension
    75155
Name: count, dtype: int64


In [None]:
!pip install radon



In [None]:
import pandas as pd
from radon.metrics import mi_visit
from radon.complexity import cc_visit
from radon.raw import analyze

# Load the raw commit/file table. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk, which is the
# remedy pandas suggests for its mixed-type DtypeWarning.
df = pd.read_csv(r"/content/LLM_message.csv", low_memory=False)

def metrics(code):
    """Compute radon metrics for one piece of source text.

    Args:
        code: Source text. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as "no source".

    Returns:
        A tuple ``(mi, cc, loc)``:
          * ``mi``  - result of ``mi_visit(code, True)``.
          * ``cc``  - mean ``complexity`` over the blocks returned by
            ``cc_visit``; ``0`` when it returns no blocks.
          * ``loc`` - ``analyze(code).loc``.
        An element is ``None`` when its calculation raises an ``Exception``.
        All three are ``None`` when ``code`` is missing.
    """
    # A missing cell arrives as None/NaN. Passing it through str() would turn
    # it into the text "None"/"nan", which is syntactically valid code and
    # would produce metrics for source that does not exist.
    if code is None or (isinstance(code, float) and code != code):
        return None, None, None

    code = str(code)

    # Each metric is best-effort and independent of the others. Catch
    # Exception (not a bare except) so Ctrl-C / kernel interrupts still stop a
    # long run.
    try:
        mi = mi_visit(code, True)
    except Exception:
        mi = None

    try:
        blocks = cc_visit(code)
        if blocks:
            cc = sum(block.complexity for block in blocks) / len(blocks)
        else:
            cc = 0
    except Exception:
        cc = None

    try:
        loc = analyze(code).loc
    except Exception:
        loc = None

    return mi, cc, loc

# Column labels for the per-version metrics. They are deliberately not stored
# in a variable called `list`: rebinding that name at module level would hide
# the builtin `list` for every later cell in the same kernel session.
before_cols = ["MI_Before", "CC_Before", "LOC_Before"]
after_cols = ["MI_After", "CC_After", "LOC_After"]

# metrics() returns one (mi, cc, loc) tuple per row; expand the tuples into
# three columns aligned on the frame's index.
df[before_cols] = pd.DataFrame(df["Source Code (before)"].apply(metrics).tolist(), index=df.index)
df[after_cols] = pd.DataFrame(df["Source Code (current)"].apply(metrics).tolist(), index=df.index)

# Change = value after the commit minus value before it.
df["MI_Change"] = df["MI_After"] - df["MI_Before"]
df["CC_Change"] = df["CC_After"] - df["CC_Before"]
df["LOC_Change"] = df["LOC_After"] - df["LOC_Before"]

# Only the deltas are kept in the saved file.
df = df.drop(columns=before_cols + after_cols)

df.to_csv(r"Lab3_c.csv", index=False)

  df=pd.read_csv(r"/content/LLM_message.csv")


In [None]:
# List the column labels of the frame currently bound to `df`.
column_names = df.columns
print(column_names)

Index(['Hash', 'Message', 'Filename', 'Source Code (before)',
       'Source Code (current)', 'Diff', 'LLM Inference', 'Unnamed: 7',
       'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
       'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16',
       'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20',
       'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24',
       'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28',
       'Unnamed: 29', 'MI_Change', 'CC_Change', 'LOC_Change'],
      dtype='object')


In [None]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


## Part (d): Semantic and token similarity between the before/after source

In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
import sacrebleu

# low_memory=False makes pandas infer each column's dtype from the whole file,
# which is the remedy pandas suggests for its mixed-type DtypeWarning.
df = pd.read_csv("/content/Lab3_c.csv", encoding='utf-8', low_memory=False)

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")

semantic = []  # cosine similarity of the two embeddings, or None
token = []     # sentence-level BLEU rescaled to 0..1, or None

# Report progress occasionally instead of printing every row index, which
# floods the cell output with one line per row.
PROGRESS_EVERY = 1000
n_rows = len(df)

for i in range(n_rows):
    if i % PROGRESS_EVERY == 0:
        print(f"{i}/{n_rows}")

    raw_before = df.loc[i, "Source Code (before)"]
    raw_after = df.loc[i, "Source Code (current)"]

    # A missing source would otherwise be stringified to "nan" and compared as
    # if it were real code; record "no score" for such rows instead.
    if pd.isna(raw_before) or pd.isna(raw_after):
        semantic.append(None)
        token.append(None)
        continue

    before = str(raw_before)
    after = str(raw_after)

    # Each score is best-effort. Catch Exception (not a bare except) so a
    # kernel interrupt can still stop this long-running loop.
    try:
        # Inputs longer than 512 tokens are truncated, so only the start of a
        # long file contributes to the embedding.
        i1 = tokenizer(before, return_tensors="pt", truncation=True, max_length=512)
        i2 = tokenizer(after, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            out1 = model(**i1)
            out2 = model(**i2)
        # Hidden state at sequence position 0 of the single batch item.
        emb1 = out1.last_hidden_state[0, 0, :]
        emb2 = out2.last_hidden_state[0, 0, :]
        cos_sim = torch.nn.functional.cosine_similarity(emb1, emb2, dim=0).item()
        semantic.append(cos_sim)
    except Exception:
        semantic.append(None)

    try:
        # The current source is the hypothesis and the previous source is the
        # single reference; the score is divided by 100.
        bleu = sacrebleu.sentence_bleu(after, [before])
        token.append(bleu.score / 100)
    except Exception:
        token.append(None)

df["Semantic_Similarity"] = semantic
df["Token_Similarity"] = token

df.to_csv(r"Lab3_d.csv", index=False)

  df=pd.read_csv("/content/Lab3_c.csv", encoding='utf-8')
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
221999
222000
222001
222002
222003
222004
222005
222006
222007
222008
222009
222010
222011
222012
222013
222014
222015
222016
222017
222018
222019
222020
222021
222022
222023
222024
222025
222026
222027
222028
222029
222030
222031
222032
222033
222034
222035
222036
222037
222038
222039
222040
222041
222042
222043
222044
222045
222046
222047
222048
222049
222050
222051
222052
222053
222054
222055
222056
222057
222058
222059
222060
222061
222062
222063
222064
222065
222066
222067
222068
222069
222070
222071
222072
222073
222074
222075
222076
222077
222078
222079
222080
222081
222082
222083
222084
222085
222086
222087
222088
222089
222090
222091
222092
222093
222094
222095
222096
222097
222098
222099
222100
222101
222102
222103
222104
222105
222106
222107
222108
222109
222110
222111
222112
222113
222114
222115
222116
222117
222118
222119
222120
222121
222122
222123
222124
222125
222126
222127
222128
222129
222130
222131
2221

## Part (e): Classify each change as Minor/Major and check whether the two measures agree


In [None]:
import pandas as pd

# low_memory=False makes pandas infer each column's dtype from the whole file,
# which is the remedy pandas suggests for its mixed-type DtypeWarning.
df = pd.read_csv(r"/content/Lab3_d.csv", encoding='utf-8', low_memory=False)

# A similarity strictly above its threshold is labelled "Minor"; anything else
# is "Major".
SEMANTIC_MINOR_THRESHOLD = 0.995
TOKEN_MINOR_THRESHOLD = 0.9


def _classify(score, threshold):
    """Return "Minor" if score > threshold, "Major" otherwise, None if missing."""
    if pd.isna(score):
        return None
    return "Minor" if score > threshold else "Major"


def _agreement(first, second):
    """Return "YES"/"NO" for matching/differing labels, None if either is missing."""
    if first is None or second is None:
        return None
    return "YES" if first == second else "NO"


# Built in one pass per column; no per-row progress printing, which would emit
# one output line for every row of the frame.
sem = [_classify(score, SEMANTIC_MINOR_THRESHOLD) for score in df["Semantic_Similarity"]]
tok = [_classify(score, TOKEN_MINOR_THRESHOLD) for score in df["Token_Similarity"]]
agree = [_agreement(s_label, t_label) for s_label, t_label in zip(sem, tok)]

df["Semantic_Class"] = sem
df["Token_Class"] = tok
df["Classes_Agree"] = agree

# NOTE(review): this is a Windows-style absolute path. It is left unchanged
# because a later cell reads the file back under exactly this name; prefer a
# relative path if both places can be changed together.
df.to_csv(r"C:\Users\HP\STT\Lab3\bugs_diffs_with_classes.csv", index=False)

  df=pd.read_csv(r"/content/Lab3_d.csv", encoding='utf-8')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
221999
222000
222001
222002
222003
222004
222005
222006
222007
222008
222009
222010
222011
222012
222013
222014
222015
222016
222017
222018
222019
222020
222021
222022
222023
222024
222025
222026
222027
222028
222029
222030
222031
222032
222033
222034
222035
222036
222037
222038
222039
222040
222041
222042
222043
222044
222045
222046
222047
222048
222049
222050
222051
222052
222053
222054
222055
222056
222057
222058
222059
222060
222061
222062
222063
222064
222065
222066
222067
222068
222069
222070
222071
222072
222073
222074
222075
222076
222077
222078
222079
222080
222081
222082
222083
222084
222085
222086
222087
222088
222089
222090
222091
222092
222093
222094
222095
222096
222097
222098
222099
222100
222101
222102
222103
222104
222105
222106
222107
222108
222109
222110
222111
222112
222113
222114
222115
222116
222117
222118
222119
222120
222121
222122
222123
222124
222125
222126
222127
222128
222129
222130
222131
2221

In [None]:
# NOTE(review): the file was saved under a Windows-style name. Reading it from
# /content with the backslashes kept works on this runtime, presumably because
# backslashes are ordinary filename characters there. Confirm before running
# elsewhere.
# low_memory=False makes pandas infer each column's dtype from the whole file,
# which is the remedy pandas suggests for its mixed-type DtypeWarning.
df = pd.read_csv(r"/content/C:\Users\HP\STT\Lab3\bugs_diffs_with_classes.csv", low_memory=False)
column_names = df.columns
print(column_names)

  df=pd.read_csv(r"/content/C:\Users\HP\STT\Lab3\bugs_diffs_with_classes.csv")


Index(['Hash', 'Message', 'Filename', 'Source Code (before)',
       'Source Code (current)', 'Diff', 'LLM Inference', 'Unnamed: 7',
       'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
       'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16',
       'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20',
       'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24',
       'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28',
       'Unnamed: 29', 'MI_Change', 'CC_Change', 'LOC_Change',
       'Semantic_Similarity', 'Token_Similarity', 'Semantic_Class',
       'Token_Class', 'Classes_Agree'],
      dtype='object')


In [None]:
# Flag every column whose label starts with "Unnamed", keep the rest, and
# write the trimmed frame out as the final deliverable.
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]
df.to_csv(r"STT-Lab3_Final.csv", index=False)

In [None]:
# Reload the exported file and list its columns to check what was saved.
df = pd.read_csv("/content/STT-Lab3_Final.csv")
name = df.columns
print(name)

Index(['Hash', 'Message', 'Filename', 'Source Code (before)',
       'Source Code (current)', 'Diff', 'LLM Inference', 'MI_Change',
       'CC_Change', 'LOC_Change', 'Semantic_Similarity', 'Token_Similarity',
       'Semantic_Class', 'Token_Class', 'Classes_Agree'],
      dtype='object')
