In [137]:
import pandas as pd
import csv
import re
import json

In [138]:
df = pd.read_json('archive_2/data.jsonl', lines=True, encoding='utf-8')

In [139]:
def analyze_question_difficulty(question):
    """Analyze a question and return its difficulty level and whether it's a coding question"""
    question = question.lower()
    
    # Non-coding question patterns
    non_coding_patterns = [
        r"write a python program to",
        r"create a python script to",
        r"develop a python function to",
        r"implement a python function to",
        r"write a function to",
        r"create a function to",
        r"develop a function to",
        r"implement a function to",
        r"write an algorithm to",
        r"create an algorithm to",
        r"develop an algorithm to",
        r"implement an algorithm to"
    ]
    
    # Difficulty patterns and keywords
    difficulty_patterns = {
        "extreme": [
            r"np-hard problem",
            r"complex optimization",
            r"system optimization",
            r"advanced concurrency",
            r"parallel processing",
            r"complex mathematical",
            r"advanced algorithm",
            r"complex data structure",
            r"system programming",
            r"complex graph",
            r"advanced graph",
            r"complex tree",
            r"advanced tree",
            r"complex recursion",
            r"advanced recursion"
        ],
        "hard": [
            r"dynamic programming",
            r"graph algorithm",
            r"tree algorithm",
            r"complex algorithm",
            r"advanced algorithm",
            r"complex data structure",
            r"advanced data structure",
            r"complex recursion",
            r"advanced recursion",
            r"complex sorting",
            r"advanced sorting",
            r"complex file",
            r"advanced file",
            r"complex api",
            r"advanced api",
            r"complex web scraping",
            r"advanced web scraping"
        ],
        "medium": [
            r"moderate difficulty",
            r"intermediate level",
            r"complex string",
            r"complex list",
            r"complex array",
            r"multiple conditions",
            r"file operations",
            r"api operations",
            r"web scraping",
            r"basic recursion",
            r"basic sorting",
            r"basic data structure",
            r"basic algorithm"
        ],
        "easy": [
            r"basic operations",
            r"simple operations",
            r"basic string",
            r"simple string",
            r"basic list",
            r"simple list",
            r"basic array",
            r"simple array",
            r"basic function",
            r"simple function",
            r"basic sorting",
            r"simple sorting",
            r"basic input",
            r"simple input",
            r"basic output",
            r"simple output"
        ]
    }
    
    # Check if it's a non-coding question
    is_non_coding = False
    for pattern in non_coding_patterns:
        if re.search(pattern, question):
            is_non_coding = True
            break
    
    # Determine difficulty
    difficulty = "easy"
    for level, patterns in reversed(list(difficulty_patterns.items())):
        for pattern in patterns:
            if re.search(pattern, question):
                difficulty = level
                break
    
    # Additional checks for common patterns
    if "fibonacci" in question:
        if "recursive" in question:
            difficulty = "medium"
        else:
            difficulty = "easy"
    
    if "binary search" in question:
        difficulty = "medium"
    
    if "linked list" in question:
        difficulty = "medium"
    
    if "tree" in question and "binary" in question:
        difficulty = "hard"
    
    if "graph" in question:
        difficulty = "hard"
    
    if "sql" in question or "database" in question:
        difficulty = "medium"
    
    if "api" in question and "rate limiting" in question:
        difficulty = "hard"
    
    if "web scraping" in question and "anti-scraping" in question:
        difficulty = "hard"
    
    return difficulty, is_non_coding

def generate_difficulty_array(csv_file_path):
    """Generate an array of difficulties for each question in the CSV file"""
    difficulties = []
    
    with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip header
        
        for row in reader:
            if len(row) > 0:
                question = row[0]  # First column is the question
                input = row[1]  # Second column is the input
                difficulty, is_non_coding = analyze_question_difficulty(question)
                if is_non_coding:
                    continue
                difficulties.append((question, input, difficulty))
    
    return difficulties

In [145]:

def analyze_question(question_text):
    # Initialize result dictionary
    result = {
        'programming_language': "",
        'is_coding_question': False,
        'requires_code': False
    }
    
    # Convert to lowercase for case-insensitive matching
    question = question_text.lower()
    
    # Check for specific programming languages
    language_patterns = {
        'python': r'\bpython\b',
        'javascript': r'\bjavascript\b',
        'java': r'\bjava\b',
        'c': r'\bc\+\+\b|\bc\b',
        'sql': r'\bsql\b',
        'typescript': r'\btypescript\b',
        'ruby': r'\bruby\b',
        'go': r'\bgo\b',
        'rust': r'\brust\b',
        'csharp': r'\bc#\b'
    }
    
    for lang, pattern in language_patterns.items():
        if re.search(pattern, question):
            result['programming_language'] = lang
            break
    
    # Check if it's a coding question
    coding_keywords = [
        'write', 'implement', 'create', 'design', 'function', 'algorithm',
        'code', 'program', 'script', 'method', 'class', 'loop', 'if statement',
        'for', 'while', 'array', 'list', 'dictionary', 'object', 'variable'
    ]
    
    if any(keyword in question for keyword in coding_keywords):
        result['is_coding_question'] = True
    
    # Check if it requires writing code
    code_patterns = [
        r'\bwrite\b.*\bfunction\b',
        r'\bimplement\b.*\balgorithm\b',
        r'\bcreate\b.*\bprogram\b',
        r'\bdesign\b.*\bsolution\b',
        r'\bcode\b.*\bthat\b',
        r'\bfunction\b.*\bthat\b',
        r'\bmethod\b.*\bthat\b',
        r'\balgorithm\b.*\bthat\b',
        r'\bclass\b.*\bthat\b'
    ]
    
    if any(re.search(pattern, question) for pattern in code_patterns):
        result['requires_code'] = True
    
    return result

# Example usage with your JSONL file
def process_jsonl_file(file_path):
    res = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                question = data.get('response', '')
                analysis = analyze_question(question)
                res.append((question, analysis['programming_language'], analysis['requires_code']))
                print(f"Question: {question}")
                print(f"Analysis: {analysis}")
                print("-" * 80)
            except json.JSONDecodeError:
                continue
    return res


In [146]:
# Call the function with your file path
result = process_jsonl_file('archive_2/data.jsonl')

Question: "Let's say you have an array of numbers and you want to find the second-largest number in it. Write a function using a programming language of your choice that takes the array as input and returns the second-largest number."
Analysis: {'programming_language': '', 'is_coding_question': True, 'requires_code': True}
--------------------------------------------------------------------------------
Question: What is the output of the following Python code:

for i in range(5):
    if i % 2 == 0:
        print(i)
Analysis: {'programming_language': 'python', 'is_coding_question': True, 'requires_code': False}
--------------------------------------------------------------------------------
Question: What is the purpose of the "for" loop in programming, and how does it differ from the "while" loop?
Analysis: {'programming_language': '', 'is_coding_question': True, 'requires_code': False}
--------------------------------------------------------------------------------
Question: Write a p

In [130]:


csv_file_path = 'archive_1/Python Programming Questions Dataset.csv'
difficulties = generate_difficulty_array(csv_file_path)

# To analyze accuracy
easy_count = sum(1 for d in difficulties if d[2] == 'easy')
medium_count = sum(1 for d in difficulties if d[2] == 'medium')
hard_count = sum(1 for d in difficulties if d[2] == 'hard')
extreme_count = sum(1 for d in difficulties if d[2] == 'extreme')
print(set(difficulties))
print(f"Easy questions: {easy_count}")
print(f"Medium questions: {medium_count}")
print(f"Hard questions: {hard_count}")
print(f"Extreme questions: {extreme_count}")

Easy questions: 10553
Medium questions: 174
Hard questions: 47
Extreme questions: 0


In [147]:
df

Unnamed: 0,response
0,"""Let's say you have an array of numbers and yo..."
1,What is the output of the following Python cod...
2,"What is the purpose of the ""for"" loop in progr..."
3,Write a program to calculate the area and peri...
4,How would you implement an algorithm to find t...
...,...
9995,What is the difference between deep and shallo...
9996,"What is the purpose and syntax of the ""for"" lo..."
9997,Write a function that takes a list of numbers ...
9998,Write a function that takes in two strings as ...


In [132]:
import random


hashmap = {
    "introductory": "easy",
    "interview": "medium",
    "competition": random.choice(["hard", "extreme"])
}

In [148]:
with open('pb_out/problemset.csv', 'a', encoding='utf-8', newline='') as outfile:
    fieldnames = ['question', 'difficulty', 'problem_io', 'source', 'is_coding', 'language']
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    # writer.writeheader()
    for i in range(len(result)):
        question = result[i][0]
        difficulty = random.choice(['medium', 'hard'])
        problem_io = ""
        source = ""
        is_coding = 1 if result[i][2] else 0
        language = result[i][1]
        writer.writerow({
            'question': question,
            'difficulty': difficulty,
            'problem_io': problem_io,
            'source': source,
            'is_coding': is_coding,
            'language': language
        })

        
        