In [None]:
import pandas as pd
import re
import os
import sys

# Project root = the directory two levels above the current working directory.
# NOTE(review): this depends on where the kernel was started — presumably the
# notebook lives two folders below the project root; confirm before moving it.
path_base = os.path.dirname(os.path.dirname(os.getcwd()))

# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if path_base not in sys.path:
    sys.path.append(path_base)
print("Base path added to sys.path:", path_base)

# Directory the CSV inputs are read from: <path_base>/data/raw.
data_raw_dir = os.path.join(path_base, "data", "raw")
print("Data/raw directory:", data_raw_dir)

In [None]:
# Build the path of each input file inside data_raw_dir.
clients_csv = os.path.join(data_raw_dir, "clients.csv")
data_csv = os.path.join(data_raw_dir, "data.csv")
schedules_csv = os.path.join(data_raw_dir, "schedules.csv")
translators_csv = os.path.join(data_raw_dir, "translatorsCostPairs.csv")

# Read the four CSV files into DataFrames.
df_clients = pd.read_csv(clients_csv)
df_data = pd.read_csv(data_csv)
df_schedules = pd.read_csv(schedules_csv)
df_translators = pd.read_csv(translators_csv)

# Report the (rows, columns) shape of every frame that was just loaded.
for shape_label, loaded_frame in (
    ("Clients shape:", df_clients),
    ("Data shape:", df_data),
    ("Schedules shape:", df_schedules),
    ("Translators shape:", df_translators),
):
    print(shape_label, loaded_frame.shape)

### Clients

In [None]:
# Remove exact duplicate rows, then every row that still has a missing value.
df_clients = df_clients.drop_duplicates().dropna()
print("Clients shape after cleaning:", df_clients.shape)

# Persist the cleaned frame without the index column.
# Note: this is the same path clients.csv was loaded from, so the input file
# is overwritten in place.
clients_csv_path = os.path.join(data_raw_dir, "clients.csv")
df_clients.to_csv(clients_csv_path, index=False)
print("Saved cleaned clients to:", clients_csv_path)

### Data

In [None]:
# Only exact duplicate rows are removed here; rows with missing values are kept.
df_data = df_data.drop_duplicates()
print("Data shape after cleaning:", df_data.shape)

# Persist the de-duplicated frame without the index column.
# Note: this is the same path data.csv was loaded from, so the input file is
# overwritten in place.
data_csv_path = os.path.join(data_raw_dir, "data.csv")
df_data.to_csv(data_csv_path, index=False)
print("Saved cleaned data to:", data_csv_path)

### Schedules

In [None]:
# START / END are expected to be plain HH:MM:SS once the date prefix is removed.
time_pattern = re.compile(r"^\d{2}:\d{2}:\d{2}$")


def _invalid_time_rows(schedules):
    """Return the rows whose START or END does not match ``time_pattern``.

    Missing values are reported as invalid: ``na=False`` makes ``str.match``
    return False for them instead of NaN, so the boolean negation below works
    on a clean boolean mask rather than failing on an object-dtype one.
    """
    start_ok = schedules["START"].str.match(time_pattern, na=False)
    end_ok = schedules["END"].str.match(time_pattern, na=False)
    # "START invalid OR END invalid" is the negation of "both valid".
    return schedules[~(start_ok & end_ok)]


# Count the offending rows before cleaning (only the count is printed, to
# avoid dumping a potentially large frame).
invalid_rows_before = _invalid_time_rows(df_schedules)
print("Invalid rows before cleaning:", len(invalid_rows_before))

# Keep only the last space-separated token, i.e. drop a leading "YYYY-MM-DD ".
# Values without a space are left unchanged, so re-running this is harmless.
df_schedules["START"] = df_schedules["START"].str.split(" ").str[-1]
df_schedules["END"] = df_schedules["END"].str.split(" ").str[-1]

# Anything still listed here could not be reduced to HH:MM:SS.
invalid_rows_after = _invalid_time_rows(df_schedules)

print("\nInvalid rows after cleaning:")
print(invalid_rows_after)

# Report the schedules frame itself (previously this printed df_data's shape).
print("\nSchedules shape after cleaning:", df_schedules.shape)

# Persist the cleaned frame without the index column.
# Note: this is the same path schedules.csv was loaded from, so the input file
# is overwritten in place.
schedules_csv_path = os.path.join(data_raw_dir, "schedules.csv")
df_schedules.to_csv(schedules_csv_path, index=False)
print("Saved cleaned schedules to:", schedules_csv_path)

### Translators Cost Pairs

In [None]:
# Remove exact duplicate rows, then every row that still has a missing value.
df_translators = df_translators.drop_duplicates().dropna()
print("Translators shape after cleaning:", df_translators.shape)

# Persist the cleaned frame without the index column.
# Note: this is the same path translatorsCostPairs.csv was loaded from, so the
# input file is overwritten in place.
translators_csv_path = os.path.join(data_raw_dir, "translatorsCostPairs.csv")
df_translators.to_csv(translators_csv_path, index=False)
print("Saved cleaned translators to:", translators_csv_path)