In [2]:
import os
import ast
from difflib import SequenceMatcher
import tokenize
from io import BytesIO
from sklearn.cluster import DBSCAN
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import graphviz # Often used with pydot for rendering, though not directly used for plotting here.
import seaborn as sns
from pathlib import Path
import os
import tempfile
import re

# Use a non-interactive backend for Matplotlib, which is good for generating images without a display.
# This is crucial in environments where a GUI might not be available (e.g., servers, automated scripts).
import matplotlib
matplotlib.use("Agg")

In [3]:
def load_codes_from_folder(folder_path):
    code_files = [f for f in os.listdir(folder_path) if f.endswith(".c")]
    codes = {}
    for file in code_files:
        with open(os.path.join(folder_path, file), 'r') as f:
            codes[file] = f.read()
    return codes

In [4]:
folder_path_str = "kadane algo in c"

In [5]:
code=load_codes_from_folder(folder_path_str)

In [6]:
code

{'s1.c': '#include <stdio.h>\n\nint max(int a, int b) {\n    return (a > b) ? a : b;\n}\n\nint maxSubArray(int* nums, int numsSize) {\n    int maxSum = nums[0];\n    int currentSum = nums[0];\n\n    for (int i = 1; i < numsSize; ++i) {\n        currentSum = max(nums[i], currentSum + nums[i]);\n        maxSum = max(maxSum, currentSum);\n    }\n\n    return maxSum;\n}\n\nint main() {\n    int nums[] = {-2, 1, -3, 4, -1, 2, 1, -5, 4};\n    int size = sizeof(nums) / sizeof(nums[0]);\n\n    int result = maxSubArray(nums, size);\n    printf("Max Subarray Sum: %d\\n", result);\n\n    return 0;\n}\n',
 's2.c': '#include <stdio.h>\n#include <limits.h>  // For INT_MIN\n\nint maxSubArray(int* nums, int numsSize) {\n    int max_sum = INT_MIN;  // Equivalent to float(\'-inf\')\n    int current_sum = 0;\n\n    for (int i = 0; i < numsSize; ++i) {\n        current_sum += nums[i];\n\n        if (current_sum > max_sum) {\n            max_sum = current_sum;\n        }\n\n        if (current_sum < 0) {\n

NameError: name 'token_similarity' is not defined

In [8]:
import tokenize
from io import BytesIO


In [9]:
import os
import matplotlib.pyplot as plt
from sctokenizer import CTokenizer, TokenType
# Assuming you have the c_tokenizer function defined as in our last conversation.
# If not, make sure to include it.

# --- Re-define c_tokenizer (from our previous successful attempt) ---
def c_tokenizer(c_code_string: str) -> list:
    """
    Tokenizes a C code string and categorizes tokens into a simplified list.
    """
    tokenizer = CTokenizer()
    tokens = tokenizer.tokenize(c_code_string)
    processed_tokens = []
    for token in tokens:
        if token.token_type == TokenType.COMMENT_SYMBOL:
            continue
        if token.token_type == TokenType.IDENTIFIER:
            processed_tokens.append("IDENTIFIER")
        elif token.token_type == TokenType.OPERATOR:
            processed_tokens.append("OPERATOR")
        elif token.token_type == TokenType.CONSTANT:
            processed_tokens.append('NUMBER')
        elif token.token_type == TokenType.STRING:
            processed_tokens.append('STRING')
        elif token.token_type == TokenType.KEYWORD:
            processed_tokens.append("KEYWORD")
        elif token.token_type == TokenType.SPECIAL_SYMBOL:
            processed_tokens.append("SPECIAL_SYMBOL")
        elif token.token_type == TokenType.OTHER:
            if token.token_value.startswith('#'):
                processed_tokens.append("PREPROCESSOR")
            else:
                processed_tokens.append("OTHER")
    return processed_tokens

# --- Define tokenize_code to use c_tokenizer (as discussed previously) ---
def tokenize_code(code_string):
    """Tokenizes C code string using c_tokenizer."""
    return c_tokenizer(code_string)

# --- Define token_similarity (as discussed previously) ---
def token_similarity(code1, code2):
    """Calculates Jaccard similarity between two code snippets based on their tokens."""
    tokens1, tokens2 = tokenize_code(code1), tokenize_code(code2)
    set1 = set(tokens1)
    set2 = set(tokens2)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    if union == 0:
        return 0.0
    else:
        return intersection / union

# --- Updated visualize_tokens function ---
def visualize_tokens(code: str, filename: str, output_folder: str = "output") -> str:
    """
    Tokenizes code, calculates token frequencies, and saves a bar chart
    to a specified output folder.

    Args:
        code: The source code string to visualize.
        filename: The desired name for the output image file (e.g., "token_freq.png").
        output_folder: The directory where the image will be saved.
                       Defaults to "output".

    Returns:
        The full path to the saved image file.
    """
    tokens = tokenize_code(code)
    token_counts = {token: tokens.count(token) for token in set(tokens)}

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    plt.figure(figsize=(10, 5))
    plt.bar(token_counts.keys(), token_counts.values(), color='skyblue')
    plt.title(f"Token Frequency for '{filename}'") # Add filename to title for clarity
    plt.xlabel("Token Type")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha='right') # Rotate for long token names

    plt.tight_layout() # Adjust layout to prevent labels from overlapping
    filepath = os.path.join(output_folder, filename) # Join folder and filename
    plt.savefig(filepath)
    plt.close() # Close the plot to free up memory

    print(f"Token frequency chart saved to: {filepath}")
    return filepath

# --- Example Usage ---

# C Code Snippets for demonstration
c_code_example1 = """
/* My first C program */
#include <stdlib.h>

void calculate_sum(int a, int b) {
    int sum = a + b;
    // Done calculation
}
"""

c_code_example2 = """
// Another C program
#include <stdlib.h>

void calculate_sum(int a, int b) {
    int sum = a + b;
    // Done calculation
}
"""

# 1. Visualize tokens for c_code_example1
output_dir = "output_plots" # Define your output folder name
visualize_tokens(c_code_example1, "code1_token_frequency.png", output_dir)

# 2. Visualize tokens for c_code_example2
visualize_tokens(c_code_example2, "code2_token_frequency.png", output_dir)

# 3. Calculate similarity between them
similarity = token_similarity(c_code_example1, c_code_example2)
print(f"\nSimilarity between code1 and code2: {similarity:.4f}")

Token frequency chart saved to: output_plots\code1_token_frequency.png
Token frequency chart saved to: output_plots\code2_token_frequency.png

Similarity between code1 and code2: 1.0000


In [None]:
from pycparser import c_parser
from pycparser.c_ast import Node, ID, Constant, FuncCall, Decl, For, While
from itertools import zip_longest
from pycparser import c_parser
import re
# --- Your earlier code for cleaning C code ---
def clean_c_code(code, add_stdio_stubs=True):
    code = re.sub(r'^\s*#include.*$', '', code, flags=re.MULTILINE)
    code = re.sub(r'//.*', '', code)
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    stub_funcs = ""
    if add_stdio_stubs:
        if re.search(r'\bprintf\b', code):
            stub_funcs += "void printf(char*, ...);\n"
        if re.search(r'\bscanf\b', code):
            stub_funcs += "void scanf(char*, ...);\n"
    return stub_funcs + code

# --- The pycparser parser object ---
parser = c_parser.CParser()

parser = c_parser.CParser()

weights = {
    'match': 1.0,
    'insert_delete': -0.7,
    'update': -0.5,
    'soft_update': -0.2,
    'match_For': 1.5,
    'update_For': -1.0,
}

def compare_ast_nodes(node1, node2, weights):
    if node1 is None and node2 is None:
        return (0, 0)
    if node1 is None or node2 is None:
        return (weights["insert_delete"], 1)
    if type(node1) != type(node2):
        type_name = "update_" + type(node1).__name__
        return (weights.get(type_name, weights["update"]), 1)

    score = weights.get("match_" + type(node1).__name__, weights["match"])
    max_score = 1

    for attr in node1.attr_names:
        if attr in ("name", "value", "op"):
            if getattr(node1, attr, None) != getattr(node2, attr, None):
                score += weights["soft_update"]

    children1 = list(node1.children())
    children2 = list(node2.children())

    from itertools import zip_longest
    for pair1, pair2 in zip_longest(children1, children2):
        c1 = pair1[1] if pair1 is not None else None
        c2 = pair2[1] if pair2 is not None else None
        s, m = compare_ast_nodes(c1, c2, weights)
        score += s
        max_score += m

    return score, max_score


def semantic_similarity(ast1, ast2):
    score, max_score = compare_ast_nodes(ast1, ast2, weights)
    return max(min(score / max_score, 1.0), 0.0) if max_score > 0 else 0.0  # Clamp safely


In [40]:
ast1 = parser.parse(clean_c_code("""int main() {
    int sum = 0;
    for (int i = 1; i <= 10; i++) {
        sum = sum + i;
    }
    return sum;
}
"""))
ast2 = parser.parse(clean_c_code("""int main() {
    int a=0;
    int s=1;
    for (int i = 1; i <= 10; i++) {
        a = a + s;
    }
    scanf("%d", &a);
    unsigned long long answer = a;
    return 0;
}"""))


sim = semantic_similarity(ast1, ast2)
print(f"✓ Semantic similarity = {sim:.4f}")  # Should be in [0.0, 1.0]


✓ Semantic similarity = 0.0000


In [41]:
# Necessary Imports
import networkx as nx
import matplotlib.pyplot as plt
from pycparser import CParser, c_ast
import os

# --- Configuration for pycparser (Crucial for parsing C with standard library calls) ---
# A simplified stdio.h for pycparser to understand printf, etc.
fake_libc_headers = """
typedef unsigned long size_t;
typedef unsigned long long __gnuc_va_list;
typedef unsigned long long __builtin_va_list;

// Simplified printf declaration
int printf(const char *format, ...);
// Add other common functions if needed, e.g., scanf, malloc, free, etc.
// int scanf(const char *format, ...);
// void *malloc(size_t size);
// void free(void *ptr);
"""

# Initialize the C parser
parser = CParser()

# --- C Code Snippets to Visualize ---

# C Code 1: Summation loop
c_code_1 = """
int main() {
    int sum = 0;
    for (int i = 1; i <= 10; i++) {
        sum = sum + i;
    }
    return sum;
}
"""
output_filename_1 = "c_ast_sum_loop.png"

# C Code 2: Factorial calculation (the one that caused ParseError before)
c_code_2 = """
int main() {
    int input;
    printf("Input any integer: ");
    input=5;
    unsigned long long answer = input;
    printf("The factorial is: %llu\\n", answer);
    return 0;
}
"""
output_filename_2 = "c_ast_factorial.png"

# C Code 3: If-else statement
c_code_3 = """
int max(int a, int b) {
    if (a > b) {
        return a;
    } else {
        return b;
    }
}
"""
output_filename_3 = "c_ast_ifelse.png"

# Invalid C code (to test error handling)
invalid_c_code = """
int main { // Missing parentheses for main
    printf("Hello");
"""
output_filename_invalid = "c_ast_invalid.png"

# --- List of (C Code, Output Filename) pairs to process ---
code_snippets = [
    (c_code_1, output_filename_1),
    (c_code_2, output_filename_2),
    (c_code_3, output_filename_3),
    (invalid_c_code, output_filename_invalid)
]

# --- Main Logic to Parse and Visualize Each C Code Snippet ---
for c_code, filename in code_snippets:
    print(f"\nProcessing: {filename}")
    print("----------------------------------------")
    print(c_code)
    print("----------------------------------------")

    try:
        # Parse the C code using pycparser, prepending fake headers
        c_ast_tree = parser.parse(clean_c_code(fake_libc_headers + c_code))

        graph = nx.DiGraph()

        # Recursive function to add nodes and edges to the NetworkX graph
        def add_nodes_edges_c(node, parent_id=None):
            node_label = type(node).__name__
            node_attrs = []

            # Add relevant attributes to the node label for better insight
            if hasattr(node, 'name') and node.name is not None:
                node_attrs.append(f"name: {node.name}")
            if hasattr(node, 'op') and node.op is not None:
                node_attrs.append(f"op: {node.op}")
            if hasattr(node, 'value') and node.value is not None:
                node_attrs.append(f"value: {repr(node.value)}") # Use repr for string values

            if node_attrs:
                node_label += "\n(" + ", ".join(node_attrs) + ")"

            current_node_id = str(id(node))
            graph.add_node(current_node_id, label=node_label)

            if parent_id:
                graph.add_edge(parent_id, current_node_id)

            # Iterate through child nodes using pycparser's `children()` method
            for name, child in node.children():
                if isinstance(child, c_ast.Node): # Ensure it's an AST node
                    add_nodes_edges_c(child, current_node_id)
                elif isinstance(child, list): # Handle lists of nodes (e.g., statements in a block)
                    for item in child:
                        if isinstance(item, c_ast.Node):
                            add_nodes_edges_c(item, current_node_id)

        # Start building the graph from the root of the AST
        add_nodes_edges_c(c_ast_tree)

        # Plotting the graph
        labels = nx.get_node_attributes(graph, 'label')

        # A fixed seed ensures reproducible layouts
        pos = nx.spring_layout(graph, seed=42)

        plt.figure(figsize=(15, 10)) # Adjust figure size for better readability
        nx.draw(graph, pos, labels=labels, with_labels=True,
                node_color="lightblue", node_shape="o", # Circle nodes
                edge_color="gray", width=0.8, arrowsize=10,
                node_size=4000, font_size=8, font_weight="bold",
                font_color="black", alpha=0.9
               )

        plt.title(f"C AST Visualization: {filename}", size=15)
        plt.tight_layout() # Adjust layout to prevent labels overlapping
        plt.savefig(filename, format='png', dpi=300)
        plt.close() # Close the plot to free memory

        print(f"✓ C AST visualization saved to {filename}")

    except Exception as e: # Catch any exception during parsing or drawing
        print(f"✗ Error visualizing C AST for {filename}: {e}")

print("\n--- All visualizations attempted ---")


Processing: c_ast_sum_loop.png
----------------------------------------

int main() {
    int sum = 0;
    for (int i = 1; i <= 10; i++) {
        sum = sum + i;
    }
    return sum;
}

----------------------------------------


  plt.tight_layout() # Adjust layout to prevent labels overlapping


✓ C AST visualization saved to c_ast_sum_loop.png

Processing: c_ast_factorial.png
----------------------------------------

int main() {
    int input;
    printf("Input any integer: ");
    input=5;
    unsigned long long answer = input;
    printf("The factorial is: %llu\n", answer);
    return 0;
}

----------------------------------------
✓ C AST visualization saved to c_ast_factorial.png

Processing: c_ast_ifelse.png
----------------------------------------

int max(int a, int b) {
    if (a > b) {
        return a;
    } else {
        return b;
    }
}

----------------------------------------
✓ C AST visualization saved to c_ast_ifelse.png

Processing: c_ast_invalid.png
----------------------------------------

int main { // Missing parentheses for main
    printf("Hello");

----------------------------------------
✗ Error visualizing C AST for c_ast_invalid.png: : At end of input

--- All visualizations attempted ---
