### data filtering

#### imports

In [72]:
import pandas as pd
import re
import os
import pandasql as ps

#### functions

In [73]:
def get_next_filename(base_filename, folder):
    """
    Return the next unused numbered CSV path ``<folder>/<base_filename>_<N>.csv``.

    The folder is created when it does not exist yet. ``N`` is one more than
    the highest number among existing entries named exactly
    ``<base_filename>_<digits>.csv``; when there are none, numbering starts
    at 1. Entries that merely share the prefix (for example
    ``<base_filename>_notes.csv`` or ``<base_filename>_1_0.csv``) are ignored.

    :param base_filename: File name stem placed before the ``_<N>.csv`` suffix.
    :param folder: Directory to scan and, if necessary, create.
    :return: Path of the next file name; the file itself is not created.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)

    # Match the whole name so that only names in the format produced below are
    # counted. Feeding a loosely stripped remainder to int() would, for
    # instance, read "1_0" as 10 because int() accepts digit-group underscores.
    numbered = re.compile(rf"{re.escape(base_filename)}_([0-9]+)\.csv")
    matches = (numbered.fullmatch(entry) for entry in os.listdir(folder))
    highest = max((int(m.group(1)) for m in matches if m), default=0)

    return os.path.join(folder, f"{base_filename}_{highest + 1}.csv")
def process_dataset(df):
    """
    Build the filtered, frequency-expanded Data Science salary table.

    Steps:
    1. Keep rows whose 'position' contains 'Data Scientist' or 'Data Science'
       (case-sensitive; rows with a missing position are dropped).
    2. Sort the rows by 'period'.
    3. Add 'final_seniority': the first non-missing value among 'seniority',
       'local experience seniority' and 'general experience seniority',
       or None when all three are missing.
    4. Add 'final_salary': 'salary', 'bonuses flag' and 'bonuses amount'
       formatted into one space-separated string.
    5. Repeat every row according to its 'frequency' value via
       ``apply_frequency``.
    6. Keep only the columns 'period', 'position', 'final_seniority' and
       'final_salary'.

    :param df: Input DataFrame containing the columns named above plus
        'frequency'.
    :return: Processed DataFrame with the four columns listed in step 6.
    """
    # Step 1: filter on the position text; na=False treats missing positions
    # as non-matching instead of propagating NaN into the mask.
    is_data_science = df['position'].str.contains('Data Scientist|Data Science', na=False)

    # Step 2: sort_values returns a new frame, so the columns added below do
    # not end up on the caller's DataFrame.
    sorted_df = df[is_data_science].sort_values(by='period')

    # Step 3: iterate over just the three candidate columns instead of
    # building a full-width row Series for every record.
    seniority_columns = ['seniority', 'local experience seniority', 'general experience seniority']
    sorted_df['final_seniority'] = [
        next((value for value in candidates if pd.notna(value)), None)
        for candidates in sorted_df[seniority_columns].itertuples(index=False, name=None)
    ]

    # Step 4: missing parts are rendered by the f-string as-is (e.g. "nan").
    salary_columns = ['salary', 'bonuses flag', 'bonuses amount']
    sorted_df['final_salary'] = [
        f"{salary} {bonuses_flag} {bonuses_amount}"
        for salary, bonuses_flag, bonuses_amount
        in sorted_df[salary_columns].itertuples(index=False, name=None)
    ]

    # Step 5: expand rows by frequency; Step 6: project the output columns.
    processed_df = apply_frequency(sorted_df)
    final_columns = ['period', 'position', "final_seniority", "final_salary"]
    return processed_df[final_columns]


def apply_frequency(df):
    """
    Repeat each row of ``df`` as many times as its 'frequency' value says.

    Missing frequencies count as 1, negative ones as 0 (the row is dropped),
    and fractional ones are truncated by the integer cast. The input frame is
    left unmodified.

    :param df: DataFrame with a numeric 'frequency' column.
    :return: New DataFrame with a fresh 0..n-1 index in which every row
        appears 'frequency' times and 'frequency' holds the normalised
        integer counts.
    """
    # reset_index returns a new frame, which keeps the caller's DataFrame
    # untouched and guarantees unique labels, so the label-based repetition
    # below cannot multiply rows when the incoming index has duplicates.
    expanded = df.reset_index(drop=True)

    # Fill gaps with 1, floor negatives at 0, then cast to whole repeat counts.
    expanded['frequency'] = (
        expanded['frequency']
        .fillna(1.0)
        .clip(lower=0)
        .astype(int)
    )

    # Repeat every row label by its count and renumber the result.
    repeated_labels = expanded.index.repeat(expanded['frequency'])
    return expanded.loc[repeated_labels].reset_index(drop=True)

#### variables

In [74]:
# Input CSV that the main cell loads with pd.read_csv.
input_path = "raw_salary_compile/combined_salary_9.csv"
# Folder passed to get_next_filename; it receives the numbered
# filtered_dataset_<N>.csv output files.
output_folder = "filtered_dataset"

#### main functions

In [75]:
# Run the filtering pipeline: load the raw CSV, process it, and save the
# result under the next free numbered file name in `output_folder`.

# Resolve the output path first; get_next_filename also creates the folder
# when it does not exist yet.
output_path = get_next_filename("filtered_dataset", output_folder)
df = pd.read_csv(input_path, low_memory=False)  # Load the raw dataset from input_path

processed_df = process_dataset(df)  # Filter, add the final_* columns, expand rows by frequency
processed_df.to_csv(output_path, index=False)  # index=False: do not write the row index as a column