In [14]:
# Substitute numeric score for each textual grade. The functions in this
# notebook look a record's "Оценка" up here when its numeric "Балл" is missing.
# NOTE(review): despite the name, this is a grade -> score lookup table, not a
# NaN predicate.
is_na = {
    'зач.': 70,
    'неуваж.': 0,
    'недсд.': 0,
    '4': 70,
    'недоп.': 0,
    '5': 90,
    'незач.': 20,
    '2': 20,
    '3': 50,
}

In [8]:
def make_subject_stats(df, grade_scores=None):
    """Aggregate per-subject statistics from a table of grade records.

    Every row counts as one record for the subject named in
    "Наименование дисциплины". The record's score is its "Балл" value; when
    that is missing (NaN), the textual grade in "Оценка" is translated to a
    number through ``grade_scores``.

    Args:
        df: DataFrame with the columns "Наименование дисциплины" and "Балл".
            "Оценка" is read only for rows whose "Балл" is missing.
        grade_scores: Optional mapping from textual grade to substitute score.
            When omitted, the notebook-level ``is_na`` mapping is used. Values
            may be plain numbers or list/tuple values whose first element is
            the score.

    Returns:
        dict mapping each subject to ``{'mean_clean': ..., 'fail_ratio': ...}``,
        where ``mean_clean`` is the mean of the strictly positive scores (0 if
        there are none) and ``fail_ratio`` is the share of the subject's rows
        whose score is not strictly positive.

    Raises:
        KeyError: if a row has no numeric score and its grade is absent from
            the grade mapping.
    """

    def fallback_score(grade):
        # Resolve the table lazily so frames without missing scores never
        # touch the global mapping at all.
        table = is_na if grade_scores is None else grade_scores
        value = table[grade]
        # The mapping in this notebook stores plain numbers; indexing them
        # with [0] raised TypeError. Sequence values are still accepted.
        if isinstance(value, (list, tuple)):
            return value[0]
        return value

    per_subject = {}
    for _, row in df.iterrows():
        subject = row["Наименование дисциплины"]
        raw_score = row["Балл"]
        score = fallback_score(row["Оценка"]) if pd.isna(raw_score) else raw_score

        bucket = per_subject.setdefault(
            subject, {'total_students': 0, 'non_zero_scores': []}
        )
        bucket['total_students'] += 1
        if score > 0:
            bucket['non_zero_scores'].append(score)

    stats = {}
    for subject, bucket in per_subject.items():
        total_students = bucket['total_students']
        non_zero = bucket['non_zero_scores']

        num_fails = total_students - len(non_zero)
        stats[subject] = {
            'mean_clean': sum(non_zero) / len(non_zero) if non_zero else 0,
            'fail_ratio': num_fails / total_students if total_students > 0 else 0,
        }

    return stats

In [9]:
import math

def power_penalty_score(student_scores, subject_stats, p=2.0, fail_threshold=40, eps=1e-6):
    """Average a per-subject penalty over the subjects a student has scores for.

    For each subject present in both ``student_scores`` and ``subject_stats``:

    * score below ``fail_threshold``: the penalty is ``mean_clean ** p`` scaled
      by ``1 + log(1 / (fail_ratio + eps))``, so the multiplier grows as the
      subject's fail ratio shrinks;
    * otherwise: the penalty is the signed ``p``-th power of
      ``mean_clean - score`` multiplied by ``fail_ratio`` (negative when the
      student scored above the subject mean).

    Args:
        student_scores: mapping subject -> numeric score of one student.
        subject_stats: mapping subject -> dict with 'mean_clean' and
            'fail_ratio' entries.
        p: exponent applied to the mean (failing branch) or to the distance
            from the mean (passing branch).
        fail_threshold: scores strictly below this value take the failing
            branch. Defaults to 40, the value previously hard-coded.
        eps: small constant added to ``fail_ratio`` inside the logarithm to
            avoid division by zero. Defaults to 1e-6, as before.

    Returns:
        The mean penalty over the matched subjects, or 0.0 when no subject of
        the student appears in ``subject_stats``.
    """
    total_score = 0
    subject_count = 0

    for subject, student_score in student_scores.items():
        stats = subject_stats.get(subject)
        if stats is None:
            # Subjects without statistics do not contribute to the average.
            continue

        mean_clean = stats['mean_clean']
        fail_ratio = stats['fail_ratio']

        if student_score < fail_threshold:
            multiplier = 1 + math.log(1 / (fail_ratio + eps))
            adjusted = (mean_clean ** p) * multiplier
        else:
            delta = mean_clean - student_score
            # copysign keeps the direction of the deviation after raising its
            # magnitude to the power p.
            adjusted = math.copysign(abs(delta) ** p, delta) * fail_ratio

        total_score += adjusted
        subject_count += 1

    return total_score / subject_count if subject_count else 0.0

In [10]:
from concurrent.futures import ThreadPoolExecutor

def process_scores(df):
    """Compute a penalty for every student in ``df``.

    Subject statistics are derived from ``df`` with ``make_subject_stats``;
    each student's scores are then collected per subject and passed to
    ``power_penalty_score``.

    Args:
        df: DataFrame with the columns "UUID студента",
            "Наименование дисциплины" and "Балл". "Оценка" is read only for
            rows whose "Балл" is missing and is translated to a number through
            the notebook-level ``is_na`` mapping.

    Returns:
        A pair ``(penalties, sorted_penalties)``: ``penalties`` maps each
        student to their penalty, and ``sorted_penalties`` is the ascending
        list of all penalties that are not None.

    Raises:
        KeyError: if a row has no numeric score and its grade is absent from
            ``is_na``.
    """
    subject_stats = make_subject_stats(df)

    def fallback_score(grade):
        value = is_na[grade]
        # The mapping in this notebook stores plain numbers; indexing them
        # with [0] raised TypeError. Sequence values are still accepted.
        if isinstance(value, (list, tuple)):
            return value[0]
        return value

    student_groups = {}
    for _, row in df.iterrows():
        student = row["UUID студента"]
        subject = row["Наименование дисциплины"]
        raw_score = row["Балл"]
        score = fallback_score(row["Оценка"]) if pd.isna(raw_score) else raw_score

        # A later row for the same student and subject overwrites the earlier one.
        student_groups.setdefault(student, {})[subject] = score

    # The per-student computation is pure-Python arithmetic, so it runs in a
    # plain loop: a thread pool adds scheduling overhead without parallelism
    # under the GIL. Students are processed in first-seen order, as before.
    penalties = {}
    sorted_penalties = []
    for student, scores in student_groups.items():
        penalty = power_penalty_score(scores, subject_stats) if scores else None
        penalties[student] = penalty
        if penalty is not None:
            sorted_penalties.append(penalty)

    sorted_penalties.sort()

    return penalties, sorted_penalties

In [11]:
import bisect

def get_student_rank(student_penalty, sorted_penalties):
    """Return the 1-based rank of ``student_penalty`` counted from the largest value.

    The rank is one plus the number of entries in ``sorted_penalties`` that are
    strictly greater than ``student_penalty``, so equal penalties share a rank.

    Args:
        student_penalty: the penalty to rank.
        sorted_penalties: penalties in ascending order (required for the
            binary search to be meaningful).

    Returns:
        int rank; 1 when no entry exceeds ``student_penalty``.
    """
    # bisect.bisect is the same function as bisect_right: it yields how many
    # entries are <= student_penalty.
    not_greater = bisect.bisect(sorted_penalties, student_penalty)
    strictly_greater = len(sorted_penalties) - not_greater
    return strictly_greater + 1

In [12]:
def sep_dataset(df):
    """
    Split the dataset into two parts by the "Уровень подготовки" column:
    - bak_spec: rows whose level is "Бакалавр" or "Специалист"
    - magistr: rows whose level is "Магистр"

    Rows with any other level end up in neither part. Both parts are
    independent copies of the selected rows.

    Raises ValueError when the "Уровень подготовки" column is missing.
    """
    level_column = "Уровень подготовки"
    if level_column not in df.columns:
        raise ValueError("В датасете отсутствует столбец 'Уровень подготовки'")

    levels = df[level_column]

    # Select each cohort with a boolean mask and detach it from the source frame.
    bak_spec = df.loc[levels.isin(["Бакалавр", "Специалист"])].copy()
    magistr = df.loc[levels.eq("Магистр")].copy()

    return bak_spec, magistr

In [13]:
import pickle
import pandas as pd

def read_dataset(path, sep=';', encoding='utf-8-sig', low_memory=False):
    """Read a delimited text file into a DataFrame via ``pandas.read_csv``.

    Args:
        path: file path or file-like object, passed to ``pandas.read_csv``.
        sep: field delimiter (semicolon by default).
        encoding: text encoding of the file ('utf-8-sig' by default).
        low_memory: forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    read_options = {
        'sep': sep,
        'encoding': encoding,
        'low_memory': low_memory,
    }
    return pd.read_csv(path, **read_options)

# Load the grade records and split them by level of study.
s_dataset = read_dataset('../../data/datasets/subjects_dataset_1_6.csv')

s_bak_spec, s_magistr = sep_dataset(s_dataset)

# Penalties are computed separately for each cohort.
penalties_magistr, sorted_penalties_magistr = process_scores(s_magistr)
penalties_bak_spec, sorted_penalties_bak_spec = process_scores(s_bak_spec)

# Persist, per cohort, the subject statistics and the ascending penalty list.
for stats_path, penalties_path, cohort_df, cohort_sorted_penalties in (
    ('subject_stats_magistr.pkl', 'sorted_penalties_magistr.pkl',
     s_magistr, sorted_penalties_magistr),
    ('subject_stats_bak_spec.pkl', 'sorted_penalties_bak_spec.pkl',
     s_bak_spec, sorted_penalties_bak_spec),
):
    with open(stats_path, 'wb') as f:
        pickle.dump(make_subject_stats(cohort_df), f)

    with open(penalties_path, 'wb') as f:
        pickle.dump(cohort_sorted_penalties, f)