In [1]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load JSON Data
# JSON text is UTF-8 by specification; decode it explicitly instead of relying
# on the platform's locale encoding, which may differ and would garble or
# reject non-ASCII characters inside the stored code snippets.
with open('/home/dheena/Downloads/Intiliee/output/output_code_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Step 2: Normalize and Extract Features
# Flatten each file entry into one row per function and one row per class.
# A name is paired with the snippet stored at the same position; when the
# snippet list is shorter than the name list, the row gets None instead.
records = []
for entry in data:
    source_file = entry['file_name']
    function_names = entry['functions']
    class_names = entry['classes']
    function_snippets = entry['code_snippets']['functions']
    class_snippets = entry['code_snippets']['classes']

    # Functions are emitted before classes so the row order is stable.
    for kind, names, bodies in (
        ('function', function_names, function_snippets),
        ('class', class_names, class_snippets),
    ):
        records.extend(
            {
                'file_name': source_file,
                'name': name,
                'type': kind,
                'code_snippet': bodies[pos] if pos < len(bodies) else None,
            }
            for pos, name in enumerate(names)
        )


# Convert to DataFrame
# One row per function/class with columns: file_name, name, type, code_snippet.
df = pd.DataFrame(records)

# Step 3: Text Processing
# TF-IDF over the code snippets. Rows without a snippet (None) are vectorized
# as empty strings, which produce all-zero feature rows.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['code_snippet'].fillna(''))

# TF-IDF over the function/class names, using a dedicated vectorizer.
# Calling fit_transform on `vectorizer` a second time would replace the
# vocabulary learned from the snippets, so `vectorizer` could no longer be
# used to interpret the columns of X or to transform new snippets.
name_vectorizer = TfidfVectorizer()
X_names = name_vectorizer.fit_transform(df['name'])

# Step 4: Combine Features
# X and X_names are both computed from the rows of `df`, so their row counts
# always agree and no explicit shape check is needed; hstack itself raises
# ValueError if the row counts ever differ. Combining unconditionally also
# guarantees that X_combined is always defined after this step.
from scipy.sparse import hstack

# Snippet features come first, followed by the name features.
X_combined = hstack([X, X_names])

# Step 5: Save Preprocessed Data
# Only the flattened records in `df` are written (without the index column);
# the TF-IDF matrices are not persisted here.
df.to_csv(
    path_or_buf='/home/dheena/Downloads/Intiliee/output/preprocessed_data.csv',
    index=False,
)

print("Preprocessing complete. Preprocessed data saved.")


Preprocessing complete. Preprocessed data saved.
