In [235]:
import os
import ast
from difflib import SequenceMatcher
import tokenize
from io import BytesIO
from sklearn.cluster import DBSCAN
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import graphviz # Often used with pydot for rendering, though not directly used for plotting here.
import seaborn as sns
from pathlib import Path
import os
import tempfile
import re

# Use a non-interactive backend for Matplotlib, which is good for generating images without a display.
# This is crucial in environments where a GUI might not be available (e.g., servers, automated scripts).
import matplotlib
matplotlib.use("Agg")

In [236]:
def load_codes_from_folder(folder_path):
    """Read every ``.c`` file that sits directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        dict mapping each file name (not the full path) to the file's text.
        Keys are inserted in sorted file-name order so repeated runs yield
        the same ordering regardless of how the OS lists the directory.

    Raises:
        OSError: propagated from ``os.listdir``/``open`` when the folder or a
            file cannot be read.
    """
    codes = {}
    for name in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, name)
        # Skip non-C entries and directories that merely happen to end in ".c".
        if not name.endswith(".c") or not os.path.isfile(path):
            continue
        # Decode explicitly so results do not depend on the platform's locale
        # encoding; undecodable bytes are replaced instead of aborting the load.
        with open(path, 'r', encoding="utf-8", errors="replace") as f:
            codes[name] = f.read()
    return codes

In [237]:
# Folder holding the .c files to compare; a relative path, so it is resolved
# against the notebook's current working directory.
folder_path_str = "kadane algo in c"

In [238]:
# Map each .c file name found in the folder to that file's source text.
code=load_codes_from_folder(folder_path_str)

In [239]:
code

{'s1.c': '#include <stdio.h>\n\nint max(int a, int b) {\n    return (a > b) ? a : b;\n}\n\nint maxSubArray(int* nums, int numsSize) {\n    int maxSum = nums[0];\n    int currentSum = nums[0];\n\n    for (int i = 1; i < numsSize; ++i) {\n        currentSum = max(nums[i], currentSum + nums[i]);\n        maxSum = max(maxSum, currentSum);\n    }\n\n    return maxSum;\n}\n\nint main() {\n    int nums[] = {-2, 1, -3, 4, -1, 2, 1, -5, 4};\n    int size = sizeof(nums) / sizeof(nums[0]);\n\n    int result = maxSubArray(nums, size);\n    printf("Max Subarray Sum: %d\\n", result);\n\n    return 0;\n}\n',
 's2.c': '#include <stdio.h>\n#include <limits.h>  // For INT_MIN\n\nint maxSubArray(int* nums, int numsSize) {\n    int max_sum = INT_MIN;  // Equivalent to float(\'-inf\')\n    int current_sum = 0;\n\n    for (int i = 0; i < numsSize; ++i) {\n        current_sum += nums[i];\n\n        if (current_sum > max_sum) {\n            max_sum = current_sum;\n        }\n\n        if (current_sum < 0) {\n

In [240]:
# NOTE(review): token_similarity is defined in a later cell of this notebook.
# Unless it is also defined somewhere above this view, running the cells top to
# bottom on a fresh kernel fails here with NameError.
a=token_similarity(code['s1.c'], code['s2.c'])

In [241]:
import tokenize
from io import BytesIO


In [242]:
import os
import matplotlib.pyplot as plt
from sctokenizer import CTokenizer, TokenType
# c_tokenizer is defined in this cell so the helpers below are self-contained.

# --- c_tokenizer: C tokenizer built on sctokenizer ---
def c_tokenizer(c_code_string: str) -> list:
    """
    Reduce C source text to a flat list of token-category labels.

    Each token produced by ``CTokenizer`` is replaced by the name of its
    category ("IDENTIFIER", "OPERATOR", "NUMBER", "STRING", "KEYWORD",
    "SPECIAL_SYMBOL"). Comment tokens are dropped. Tokens of type ``OTHER``
    become "PREPROCESSOR" when their text starts with '#', else "OTHER".
    Token types not listed here contribute nothing to the result.
    """
    # Categories that map one-to-one onto an output label.
    direct_labels = {
        TokenType.IDENTIFIER: "IDENTIFIER",
        TokenType.OPERATOR: "OPERATOR",
        TokenType.CONSTANT: 'NUMBER',
        TokenType.STRING: 'STRING',
        TokenType.KEYWORD: "KEYWORD",
        TokenType.SPECIAL_SYMBOL: "SPECIAL_SYMBOL",
    }

    labels = []
    for tok in CTokenizer().tokenize(c_code_string):
        kind = tok.token_type
        if kind == TokenType.COMMENT_SYMBOL:
            # Comments carry no structural information for the comparison.
            continue
        if kind in direct_labels:
            labels.append(direct_labels[kind])
        elif kind == TokenType.OTHER:
            is_directive = tok.token_value.startswith('#')
            labels.append("PREPROCESSOR" if is_directive else "OTHER")
    return labels

# --- tokenize_code: thin wrapper around c_tokenizer ---
def tokenize_code(code_string):
    """Return the token-category labels that c_tokenizer yields for a C source string."""
    category_labels = c_tokenizer(code_string)
    return category_labels

# --- token_similarity: set-based Jaccard over token categories ---
def token_similarity(code1, code2):
    """Jaccard similarity of the token-category sets of two code snippets.

    Both snippets are tokenized with tokenize_code and the resulting labels
    are compared as sets, so how often (or in what order) a category occurs
    does not affect the score. Returns 0.0 when neither snippet yields any
    token; otherwise |A & B| / |A | B|.
    """
    categories_a = set(tokenize_code(code1))
    categories_b = set(tokenize_code(code2))

    combined = categories_a | categories_b
    if not combined:
        return 0.0

    shared = categories_a & categories_b
    return len(shared) / len(combined)

# --- Updated visualize_tokens function ---
def visualize_tokens(code: str, filename: str, output_folder: str = "output") -> str:
    """
    Tokenizes code, calculates token frequencies, and saves a bar chart
    to a specified output folder.

    Bars are drawn in alphabetical label order so that charts produced for
    different files (or on different runs) are directly comparable.

    Args:
        code: The source code string to visualize.
        filename: The desired name for the output image file (e.g., "token_freq.png").
        output_folder: The directory where the image will be saved.
                       Defaults to "output". Created if it does not exist.

    Returns:
        The full path to the saved image file.
    """
    # Count every category in a single pass over the token list.
    token_counts = {}
    for token in tokenize_code(code):
        token_counts[token] = token_counts.get(token, 0) + 1

    # Fixed ordering: iterating a set of strings can differ between runs.
    labels = sorted(token_counts)
    counts = [token_counts[label] for label in labels]

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    filepath = os.path.join(output_folder, filename)

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.bar(labels, counts, color='skyblue')
        ax.set_title(f"Token Frequency for '{filename}'")
        ax.set_xlabel("Token Type")
        ax.set_ylabel("Count")
        # Rotate for long token names
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

        fig.tight_layout()  # Adjust layout to prevent labels from overlapping
        fig.savefig(filepath)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

    print(f"Token frequency chart saved to: {filepath}")
    return filepath

# --- Example Usage ---

# C Code Snippets for demonstration
# The two demo snippets below are identical except for their leading comment
# (block-style in the first, line-style in the second).
c_code_example1 = """
/* My first C program */
#include <stdlib.h>

void calculate_sum(int a, int b) {
    int sum = a + b;
    // Done calculation
}
"""

c_code_example2 = """
// Another C program
#include <stdlib.h>

void calculate_sum(int a, int b) {
    int sum = a + b;
    // Done calculation
}
"""

# 1. Visualize tokens for c_code_example1
output_dir = "output_plots" # Folder that receives both charts (created on demand by visualize_tokens)
visualize_tokens(c_code_example1, "code1_token_frequency.png", output_dir)

# 2. Visualize tokens for c_code_example2
visualize_tokens(c_code_example2, "code2_token_frequency.png", output_dir)

# 3. Calculate similarity between them
# token_similarity compares the *sets* of token categories, not counts or order.
similarity = token_similarity(c_code_example1, c_code_example2)
print(f"\nSimilarity between code1 and code2: {similarity:.4f}")

Token frequency chart saved to: output_plots\code1_token_frequency.png
Token frequency chart saved to: output_plots\code2_token_frequency.png

Similarity between code1 and code2: 1.0000


In [291]:
from pycparser import c_parser
from pycparser.c_ast import Node, ID, Constant, FuncCall, Decl, For, While
from itertools import zip_longest
from pycparser import c_parser
import re
# --- clean_c_code: strip #include lines and comments before parsing with pycparser ---
def clean_c_code(code, add_stdio_stubs=True):
    """Prepare raw C source for parsing by removing includes and comments.

    Steps, in order:
      1. Drop every ``#include`` line.
      2. Remove ``//`` line comments and ``/* ... */`` block comments in a
         single left-to-right scan that also recognises string and character
         literals, so comment markers inside a literal (``"http://x"``) or a
         ``//`` inside a block comment are left alone.
      3. Optionally prepend prototypes for ``printf``/``scanf`` when the
         remaining code mentions them, since their header was removed.

    Args:
        code: C source text.
        add_stdio_stubs: When True (default), prepend
            ``void printf(char*, ...);`` / ``void scanf(char*, ...);`` for
            each of those names found in the cleaned code.

    Returns:
        The cleaned source text, preceded by any stub prototypes.
    """
    def _drop_comment_keep_literal(match):
        # Literals start with a quote character; everything else matched by
        # the pattern below is a comment and is removed.
        text = match.group(0)
        return text if text[0] in "\"'" else ''

    code = re.sub(r'^\s*#include.*$', '', code, flags=re.MULTILINE)
    code = re.sub(
        r'//[^\n]*'                       # line comment
        r'|/\*[\s\S]*?\*/'                # block comment (may span lines)
        r'|"(?:\\[\s\S]|[^"\\\n])*"'      # string literal, kept verbatim
        r"|'(?:\\[\s\S]|[^'\\\n])*'",     # character literal, kept verbatim
        _drop_comment_keep_literal,
        code,
    )

    stub_funcs = ""
    if add_stdio_stubs:
        if re.search(r'\bprintf\b', code):
            stub_funcs += "void printf(char*, ...);\n"
        if re.search(r'\bscanf\b', code):
            stub_funcs += "void scanf(char*, ...);\n"
    return stub_funcs + code

# --- The pycparser parser object ---
# Single parser instance shared by the cells that call parser.parse(...).
parser = c_parser.CParser()

# Scoring weights consumed by compare_ast_nodes.
weights = {
    'match': 1.0,            # two nodes of the same type (default)
    'insert_delete': -0.7,   # a node present on one side only
    'update': -0.5,          # nodes of different types (default)
    'soft_update': -0.2,     # same type but a differing name/value/op attribute
    'match_For': 1.5,        # type-specific override of 'match' for For nodes
    'update_For': -1.0,      # type-specific override of 'update' when the first node is a For
}

def compare_ast_nodes(node1, node2, weights):
    """Recursively score how closely two AST subtrees line up.

    Nodes are expected to expose ``attr_names`` and ``children()`` (the
    latter yielding ``(name, child)`` pairs). Children are paired by
    position; a missing partner on either side is treated as ``None``.

    Args:
        node1, node2: Subtree roots to compare; either may be ``None``.
        weights: Mapping with at least 'match', 'update', 'insert_delete'
            and 'soft_update'. Optional 'match_<Type>' / 'update_<Type>'
            entries override the defaults for the type of ``node1``.

    Returns:
        ``(score, max_score)``: the accumulated weight and the number of
        node pairs that were compared (0 when both roots are ``None``).
    """
    absent = (node1 is None) + (node2 is None)
    if absent == 2:
        return (0, 0)
    if absent == 1:
        return (weights["insert_delete"], 1)

    type_name = type(node1).__name__
    if type(node1) != type(node2):
        # Different node kinds: penalise once and do not descend further.
        return (weights.get("update_" + type_name, weights["update"]), 1)

    total = weights.get("match_" + type_name, weights["match"])
    compared = 1

    # Same node kind, but a differing identifier/literal/operator costs a little.
    soft_attrs = ("name", "value", "op")
    for attr in node1.attr_names:
        if attr not in soft_attrs:
            continue
        if getattr(node1, attr, None) != getattr(node2, attr, None):
            total += weights["soft_update"]

    left_children = [entry[1] for entry in node1.children()]
    right_children = [entry[1] for entry in node2.children()]
    for left, right in zip_longest(left_children, right_children):
        sub_total, sub_compared = compare_ast_nodes(left, right, weights)
        total += sub_total
        compared += sub_compared

    return total, compared


def semantic_similarity(ast1, ast2):
    """Return a similarity in [0.0, 1.0] for two ASTs.

    The raw score from compare_ast_nodes (using the module-level ``weights``)
    is divided by the number of compared node pairs and clamped, because
    penalties can push the ratio below 0 and bonus weights above 1.
    Returns 0.0 when nothing was compared.
    """
    score, max_score = compare_ast_nodes(ast1, ast2, weights)
    if not max_score > 0:
        return 0.0

    ratio = score / max_score
    capped = min(ratio, 1.0)
    return max(capped, 0.0)  # Clamp safely


In [290]:
# Demo: the same 1..10 summation written two ways, to see how the structural
# comparison scores a for loop against an equivalent while loop.

# Variant 1: for loop with the counter declared in the loop header.
ast1 = parser.parse(clean_c_code("""int main() {
    int sum = 0;
    for (int i = 1; i <= 10; i++) {
        sum = sum + i;
    }
    return sum;
}
"""))
# Variant 2: while loop with a separately declared counter and different names.
ast2 = parser.parse(clean_c_code("""int main() {
    int total = 0;
    int number = 1;
    while (number <= 10) {
        total += number;
        number++;
    }
    return total;
}
"""))

sim = semantic_similarity(ast1, ast2)
print(f"✓ Semantic similarity = {sim:.4f}")  # Should be in [0.0, 1.0]


✓ Semantic similarity = 0.6143
