In [10]:
import json
from collections import Counter


### trial 1 -- failed

In [8]:
def filter_skills(input_file, output_file):
    """
    Trial 1: prune skill connections and drop skills left without 'required' ones.

    Reads a JSON document from ``input_file`` that has a top-level 'skills'
    list whose entries each carry a 'connections' dict. For every skill:

    * the 'preferred' connections are removed;
    * the 'required' list is reduced to the entries that occur at least twice
      within that same list (every repeated occurrence is kept, in order);
    * the skill is kept only if it still has 'required' connections.

    The document, with 'skills' replaced by the surviving entries, is written
    to ``output_file`` as JSON indented by 4 spaces.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write.

    Raises:
        KeyError: If the document has no 'skills' key or a skill has no
            'connections' key.
    """
    # JSON interchange is UTF-8; do not depend on the platform's locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    filtered_skills = []
    for skill in data['skills']:
        connections = skill['connections']

        # Remove 'preferred' connections
        connections.pop('preferred', None)

        # Filter 'required' connections: keep entries repeated within this list.
        # list.count (rather than a hash-based counter) also works for entries
        # that are not hashable, such as nested JSON objects.
        required_connections = connections.get('required', [])
        filtered_required = [
            req for req in required_connections
            if required_connections.count(req) >= 2
        ]

        if not filtered_required:
            # Nothing required is left: drop the key and skip the skill, even
            # if other kinds of connections are still present.
            connections.pop('required', None)
            continue

        connections['required'] = filtered_required
        filtered_skills.append(skill)

    # Update the data with filtered skills
    data['skills'] = filtered_skills

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)


### trial 2 -- count skill occurrences with `Counter`

In [22]:
def extract_jobs_from_subcategories(input_file):
    """
    Collect every job listed under the document's subcategories.

    The JSON document in ``input_file`` is read as an object; each entry of its
    'subcategories' list contributes the items of its 'jobs' list, in document
    order. A missing 'subcategories' or 'jobs' key is treated as empty.

    Args:
        input_file: Path of the JSON file to read.

    Returns:
        list: All jobs from all subcategories, flattened into a single list.
    """
    # JSON interchange is UTF-8; an explicit encoding avoids depending on the
    # platform's locale default when the file holds non-ASCII text.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return [
        job
        for subcategory in data.get('subcategories', [])
        for job in subcategory.get('jobs', [])
    ]

def count_skill_occurrences(jobs):
    """
    Tally how often each skill is listed under the jobs' required_skills.

    Each job's "required_skills" mapping (empty when the key is absent) is
    walked category by category, and every category's skills are added to a
    single running tally. The category names themselves are not used.

    Args:
        jobs: Iterable of job dicts.

    Returns:
        Counter: Number of listings per skill across all jobs and categories.
    """
    tally = Counter()
    for job in jobs:
        for listed_skills in job.get("required_skills", {}).values():
            tally.update(listed_skills)
    return tally

def filter_skills(input_file, output_file, min_occurrences=100):
    """
    Trial 2: keep only the skills that jobs list often enough.

    Counts how many times each skill name is listed in the 'required_skills'
    of the jobs found under the document's subcategories, then keeps the
    entries of the top-level 'skills' list whose 'name' reaches
    ``min_occurrences``. The count is per listing, not per job: a skill listed
    under two categories of the same job is counted twice.

    The 'preferred' connections are removed from every skill. The document,
    with 'skills' replaced by the surviving entries, is written to
    ``output_file`` as JSON indented by 4 spaces, and a confirmation line is
    printed.

    Args:
        input_file: Path of the JSON file holding both the subcategories/jobs
            and the skills.
        output_file: Path of the JSON file to write.
        min_occurrences: Minimum number of listings a skill needs in order to
            be kept. Defaults to 100, the value previously hard-coded.

    Raises:
        KeyError: If the document has no 'skills' key, or a skill has no
            'name' or 'connections' key.
    """
    # Extract jobs data from subcategories
    jobs = extract_jobs_from_subcategories(input_file)

    # Count skill occurrences
    skill_counts = count_skill_occurrences(jobs)

    # Load skills data from the input file.
    # JSON interchange is UTF-8; do not depend on the platform's locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Filter skills based on occurrences
    filtered_skills = []
    for skill in data['skills']:
        skill_name = skill['name']
        occurrence_count = skill_counts.get(skill_name, 0)

        # Remove 'preferred' connections
        skill['connections'].pop('preferred', None)

        # Keep the skill only if it is listed at least `min_occurrences` times
        if occurrence_count >= min_occurrences:
            filtered_skills.append(skill)

    # Update the skills data with the filtered skills
    data['skills'] = filtered_skills

    # Save the filtered data to the output file
    with open(output_file, 'w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4)

    print(f"Filtered skills data saved to {output_file}")

In [23]:
# Source document and destination for the filtered copy; both paths are
# relative to the notebook's working directory.
input_file, output_file = (
    "processed/combined_skillsDash_1114.json",
    "processed/filtered_nonpop_1129_100.json",
)
filter_skills(input_file, output_file)



Filtered skills data saved to processed/filtered_nonpop_1129_100.json
