In [15]:
import json
import math
from collections import defaultdict
import numpy as np  

# Frequency tables; a missing key starts at 0 so callers can `+= 1` directly.
type_distribution = defaultdict(int)      # type name -> occurrences
string_subtypes = defaultdict(int)        # string category -> occurrences
collection_subtypes = defaultdict(int)    # "empty X" / "non-empty X" -> occurrences

# Running aggregates per numeric kind. min/max are seeded with +inf/-inf so
# the first observed value always replaces them.
numeric_stats = {
    kind: {'count': 0, 'sum': 0, 'min': math.inf, 'max': -math.inf}
    for kind in ('int', 'float')
}

# Raw samples retained for the percentile calculations: every numeric value
# seen, and the length of every non-empty collection, keyed by type name.
numeric_values = {kind: [] for kind in ('int', 'float')}
length_values = {kind: [] for kind in ('list', 'tuple', 'dict', 'set')}


# Scan the JSONL dataset one record per line, classify the recorded value of
# each record by the Python type it evaluates to, and update the module-level
# tallies (type_distribution, string_subtypes, collection_subtypes,
# numeric_stats, numeric_values, length_values).
with open('/home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file), which
        # json.loads would otherwise reject.
        if not line.strip():
            continue

        data = json.loads(line)
        value = data["Value After Statement Execution"]

        # NOTE(review): eval() executes arbitrary expressions taken from the
        # dataset and resolves names against this script's own globals (a
        # value such as "data" or "line" evaluates to the loop variable).
        # Only run this on trusted data; consider ast.literal_eval if the
        # values are plain literals.
        # Anything that does not evaluate is kept as the raw value. Only
        # Exception is caught so Ctrl-C / SystemExit still stop the scan.
        try:
            evaluated_value = eval(value)
        except Exception:
            evaluated_value = value

        value_type = type(evaluated_value).__name__
        if evaluated_value is None:
            print(f"NoneType value found in line: {data['Selected Statement']},{value}")
        type_distribution[value_type] += 1

        # bool is a subclass of int, so it must be ruled out before the
        # numeric branch; otherwise True/False would be folded into the INT
        # statistics as 1/0. It has already been counted under 'bool' in
        # type_distribution above, so there is nothing more to record.
        if isinstance(evaluated_value, bool):
            continue

        if isinstance(evaluated_value, (int, float)):
            num_type = 'int' if isinstance(evaluated_value, int) else 'float'
            stats = numeric_stats[num_type]

            stats['count'] += 1
            stats['sum'] += evaluated_value
            stats['min'] = min(stats['min'], evaluated_value)
            stats['max'] = max(stats['max'], evaluated_value)

            # Keep the raw sample for the quartile report.
            numeric_values[num_type].append(evaluated_value)

        elif isinstance(evaluated_value, str):
            # Categories are mutually exclusive and tested in this order.
            if not evaluated_value:
                string_subtypes["empty"] += 1
            elif evaluated_value.isalpha():
                string_subtypes["alphabetic"] += 1
            elif evaluated_value.isdigit():
                string_subtypes["numeric"] += 1
            else:
                string_subtypes["mixed"] += 1

        elif isinstance(evaluated_value, (list, tuple, dict, set)):
            collection_type = type(evaluated_value).__name__
            if len(evaluated_value) == 0:
                collection_subtypes[f"empty {collection_type}"] += 1
            else:
                collection_subtypes[f"non-empty {collection_type}"] += 1
                # Only non-empty collections contribute to the length quartiles.
                length_values[collection_type].append(len(evaluated_value))


# ---- Report 1: plain "key: count" tables, one per tally ----
for heading, tally in (
    ("Type Distribution:", type_distribution),
    ("\nString Subtypes:", string_subtypes),
    ("\nCollection Subtypes:", collection_subtypes),
):
    print(heading)
    for label, total in tally.items():
        print(f"{label}: {total}")


# ---- Report 2: count / min / max / mean for each numeric kind ----
print("\nNumeric Statistics:")
for kind, stats in numeric_stats.items():
    if stats['count'] <= 0:
        print(f"\n{kind.upper()}: No values found")
        continue

    # The mean is computed before anything is printed for this kind.
    mean = stats['sum'] / stats['count']
    print(f"\n{kind.upper()}:")
    print(f"  Count: {stats['count']}")
    print(f"  Min: {stats['min']}")
    print(f"  Max: {stats['max']}")
    print(f"  Average: {mean:.2f}")


# ---- Report 3: quartiles of the numeric values, then of the collection
# lengths. Both sections share the same layout, so one loop drives both. ----
for heading, samples_by_kind in (
    ("\nQuartile Statistics:", numeric_values),
    ("\nCollection Length Quartile Statistics:", length_values),
):
    print(heading)
    for kind, samples in samples_by_kind.items():
        if not samples:
            print(f"\n{kind.upper()}: No values for quartile calculation")
            continue

        ordered = sorted(samples)
        # All three percentiles are evaluated before the section is printed.
        lower_q, median, upper_q = [
            np.percentile(ordered, pct) for pct in (25, 50, 75)
        ]
        print(f"\n{kind.upper()}:")
        print(f"  Q1 (25th percentile): {lower_q}")
        print(f"  Q2 (Median): {median}")
        print(f"  Q3 (75th percentile): {upper_q}")



NoneType value found in line: supernet = self.__parent_supernet[ip_object],None
NoneType value found in line: dtype = htype_overwrite["dtype"],None
NoneType value found in line: assert all(esub == psub for esub, psub in zip(expected, parser.subs_)),None
NoneType value found in line: thread = events.get(key, None),None
NoneType value found in line: assert key.startswith(key_value),None
NoneType value found in line: assert split_options(param) == expected,None
NoneType value found in line: assert len(txt_da) == inputs[1][0],None
NoneType value found in line: line_with_git_path = next((line for line in current_set if line.endswith(git_path)), None),None
NoneType value found in line: predefined_if_name_dict = context.predefined_names.get(if_stmt),None
NoneType value found in line: assert grad(u, axis=i) == pytest.approx(2*xc[i][slices]),None
NoneType value found in line: assert target == pytest.approx(target_calc, abs=abs),None
NoneType value found in line: assert len(max_volume) == nchann