# Data Analysis

### Setup

In [None]:
# Libraries
import os
import sys
import re

import time
import datetime

import difflib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Configure how pandas renders DataFrames: show every column, cap the number
# of printed rows, and allow wide output before wrapping
for option_name, option_value in {
    "display.max_columns": None,
    "display.max_rows": 50,
    "display.width": 1000,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# Relative path to the directory that holds the raw CSV files
data_path = "../data/raw/"

In [None]:
# Load each raw CSV file into its own pandas DataFrame
try:
    clients_df = pd.read_csv(os.path.join(data_path, "clients.csv"))
    sample_df = pd.read_csv(os.path.join(data_path, "sample.csv"))
    schedules_df = pd.read_csv(os.path.join(data_path, "schedules.csv"))
    translators_df = pd.read_csv(os.path.join(data_path, "translatorsCostPairs.csv"))
except FileNotFoundError as e:
    # Report which file is missing, then re-raise so execution stops here
    # instead of failing later with a NameError on an undefined DataFrame
    print(f"File not found: {e}")
    raise

### Overview

In [None]:
# Show the first rows of every dataset under its own heading
for heading, frame in (
    ("CSV Client Data", clients_df),
    ("CSV Sample Data", sample_df),
    ("CSV Schedules Data", schedules_df),
    ("CSV Translators Data", translators_df),
):
    print(heading)
    display(frame.head())
    print()

### Inspection

In [None]:
# Map a display name to each loaded DataFrame so the inspection cells can
# iterate over all datasets uniformly
tuples_df = dict(
    Clients=clients_df,
    Sample=sample_df,
    Schedules=schedules_df,
    Translators=translators_df,
)

In [None]:
print("MISSING VALUES", "\n")

# Total number of null cells in each dataset (summed over all columns)
for name, df in tuples_df.items():
    total_missing = df.isnull().sum().sum()
    print(f"{name} Dataset")
    print(f"\t{total_missing} missing values")
    print()

In [None]:
print("DUPLICATED VALUES", "\n")

# Number of rows in each dataset that repeat an earlier row exactly
for name, df in tuples_df.items():
    total_duplicated = df.duplicated().sum()
    print(f"{name} Dataset")
    print(f"\t{total_duplicated} duplicated values")
    print()

In [None]:
print("UNIQUE VALUES", "\n")

# Count of distinct (non-null) values per column, for every dataset
for name, df in tuples_df.items():
    print(f"{name} Dataset")
    print("\tUnique Values:")
    for col, n_unique in df.nunique().items():
        print(f"\t\t{col}: {n_unique}")
    print()

In [None]:
print("DATASETS", "\n")

# Shape and per-column dtype of every dataset
for name, df in tuples_df.items():
    n_rows, n_cols = df.shape
    print(f"{name} Dataset")
    print(f"\t{n_rows} rows and {n_cols} columns")
    print("\tData Types:")
    for col, dtype in df.dtypes.items():
        print(f"\t\t{col}: {dtype}")
    print()

### CSV

In [None]:
def get_unique_values(df):
    """
    Collect the distinct values found in every column of a DataFrame

    Parameters:
    df (pd.DataFrame): DataFrame whose columns are inspected

    Returns:
    dict: Column name mapped to the array of unique values in that column,
        in column order
    """
    return {column: df[column].unique() for column in df.columns}


In [None]:
# Columns of each dataset whose unique values are listed below.
# The keys match the dataset names used in tuples_df.
cols_useful = {
    "Clients": ["CLIENT_NAME", "WILDCARD"],
    "Sample": [
        "TASK_TYPE",
        "SOURCE_LANG",
        "TARGET_LANG",
        "TRANSLATOR",
        "MANUFACTURER",
        "MANUFACTURER_SECTOR",
        "MANUFACTURER_INDUSTRY_GROUP",
        "MANUFACTURER_INDUSTRY",
        "MANUFACTURER_SUBINDUSTRY",
    ],
    "Schedules": ["NAME"],
    "Translators": ["TRANSLATOR", "SOURCE_LANG", "TARGET_LANG", "HOURLY_RATE"],
}

In [None]:
print("UNIQUE VALUES", end="\n\n")

# Unique values of the selected columns, keyed by dataset name
unique_values = {
    name: get_unique_values(df[cols_useful[name]])
    for name, df in tuples_df.items()
}

# Render one single-column table per inspected column
for name, values in unique_values.items():
    print(f"{name} Dataset")
    for col, unique_vals in values.items():
        display(pd.DataFrame({col: unique_vals}))
    print()

#### CSV Clients

In [None]:
# Count the distinct client names
n_unique_clients = clients_df["CLIENT_NAME"].nunique()
print(f"Number of unique clients: {n_unique_clients}")

In [None]:
# Distribution - Selling prices & Minimum quality requirements
plt.figure(figsize=(12, 5))

# One histogram (with KDE overlay) per panel: (column, title, x-axis label)
client_panels = [
    ("SELLING_HOURLY_PRICE", "Distribution of Selling Hourly Prices", "Price"),
    ("MIN_QUALITY", "Distribution of Minimum Quality Requirements", "Minimum Quality"),
]
for position, (column, title, x_label) in enumerate(client_panels, start=1):
    plt.subplot(1, 2, position)
    sns.histplot(clients_df[column], kde=True)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Analyze relationship - Selling price & Minimum quality
# Each point is one row of clients_df
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=clients_df,
    x="MIN_QUALITY",
    y="SELLING_HOURLY_PRICE",
)
plt.title("Relationship between Minimum Quality and Selling Price")
plt.xlabel("Minimum Quality")
plt.ylabel("Selling Hourly Price")
plt.show()

#### CSV Schedules

In [None]:
# Parse dates
# Each column is checked on its own so END is still parsed when START has
# already been converted. Unparseable values become NaT (errors="coerce").
for date_col in ("START", "END"):
    column = schedules_df[date_col]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        schedules_df[date_col] = pd.to_datetime(column, errors="coerce")

In [None]:
# Print the column names, non-null counts and dtypes of the schedules data
print("Basic information:")
schedules_df.info()

In [None]:
# Descriptive statistics of the schedules data, rounded to two decimals
print("Statistical summary:")
schedules_summary = schedules_df.describe()
display(schedules_summary.round(2))

In [None]:
# Preview the first rows of the schedules data
print("CSV Schedules Data")
display(schedules_df.head())

In [None]:
# TODO - Fix this error in preprocessing step

# Duration of each schedule row, converted from seconds to hours
elapsed = schedules_df["END"] - schedules_df["START"]
schedules_df["HOURS"] = elapsed.dt.total_seconds() / 3600

# Total hours per NAME, with NAME restored as a regular column
total_hours_per_name = schedules_df.groupby("NAME")["HOURS"].sum().reset_index()

# Keep only the names whose total is negative (END earlier than START overall)
is_negative_total = total_hours_per_name["HOURS"] < 0
negative_hours = total_hours_per_name[is_negative_total]

print("Total Hours Per Name:")
display(total_hours_per_name)

if negative_hours.empty:
    print("No negative hours detected")
else:
    print("Names with Negative Hours (Incorrect Data):")
    display(negative_hours)


In [None]:
# TODO - Overworking hours...

# Per-row totals of the weekday columns and of the weekend columns
weekday_cols = ["MON", "TUES", "WED", "THURS", "FRI"]
weekend_cols = ["SAT", "SUN"]
weekday_hours = schedules_df[weekday_cols].sum(axis=1)
weekend_hours = schedules_df[weekend_cols].sum(axis=1)

# 2D histogram: 6 bins along weekday hours, 3 bins along weekend hours
heatmap_data, x_edges, y_edges = np.histogram2d(weekday_hours, weekend_hours, bins=(6, 3))

# Outer bin edges give both the image extent and the integer tick range
x_min, x_max = x_edges[0], x_edges[-1]
y_min, y_max = y_edges[0], y_edges[-1]

# Plot the heatmap; the counts are transposed so that weekday hours run
# along the horizontal axis
plt.figure(figsize=(10, 6))
plt.imshow(
    heatmap_data.T,
    origin="lower",
    cmap="YlGnBu",
    aspect="auto",
    extent=[x_min, x_max, y_min, y_max],
)
plt.colorbar(label="Frequency")
plt.title("Heatmap of Weekday vs Weekend Availability")
plt.xlabel("Weekday Hours")
plt.ylabel("Weekend Hours")
plt.xticks(range(int(x_min), int(x_max) + 1))
plt.yticks(range(int(y_min), int(y_max) + 1))
plt.show()


#### CSV Translators Cost Pairs

In [None]:
# Distinct-value counts for the translator and language columns
for label, column in (
    ("unique translators", "TRANSLATOR"),
    ("source languages", "SOURCE_LANG"),
    ("target languages", "TARGET_LANG"),
):
    print(f"Number of {label}: {translators_df[column].nunique()}")

# Distinct (source, target) combinations present in the data
language_pairs = translators_df.groupby(["SOURCE_LANG", "TARGET_LANG"])
print(f"Number of unique language pairs: {language_pairs.ngroups}")

#### CSV Sample

In [None]:
# Print the column names, non-null counts and dtypes of the sample data
print("Basic information:")
sample_df.info()

In [None]:
# Number of null cells in each column of the sample data
print("Missing values by column:")
missing_per_column = sample_df.isnull().sum()
print(missing_per_column)

In [None]:
# Count unique values for categorical columns
categorical_cols = [
    "PM",
    "TASK_TYPE",
    "SOURCE_LANG",
    "TARGET_LANG",
    "TRANSLATOR",
    "MANUFACTURER",
    "MANUFACTURER_SECTOR",
    "MANUFACTURER_INDUSTRY_GROUP",
    "MANUFACTURER_INDUSTRY",
    "MANUFACTURER_SUBINDUSTRY",
]

print("Unique values for Categorical columns:")
print()
# One line per column, in the order listed above
for col, n_unique in sample_df[categorical_cols].nunique().items():
    print(f"{col}: {n_unique} unique values")

In [None]:
# Columns with possible repetitive information
manufacturer_cols = [
    "MANUFACTURER",
    "MANUFACTURER_SECTOR",
    "MANUFACTURER_INDUSTRY_GROUP",
    "MANUFACTURER_INDUSTRY",
    "MANUFACTURER_SUBINDUSTRY",
]
# Left as the last expression so the notebook renders the preview
sample_df[manufacturer_cols].head()

In [None]:
# Distinct-value counts for the key identifier and category columns
for label, column in (
    ("unique projects", "PROJECT_ID"),
    ("unique tasks", "TASK_ID"),
    ("unique translators", "TRANSLATOR"),
    ("source languages", "SOURCE_LANG"),
    ("target languages", "TARGET_LANG"),
    ("unique task types", "TASK_TYPE"),
):
    print(f"Number of {label}: {sample_df[column].nunique()}")

In [None]:
# Check distribution of quality evaluations (quality control scores)
# Rows without a score are dropped before plotting
quality_scores = sample_df["QUALITY_EVALUATION"].dropna()

plt.figure(figsize=(10, 6))
sns.histplot(quality_scores, bins=10, kde=True, alpha=0.75)
plt.title("Distribution of Quality Evaluations")
plt.xlabel("Quality Score")
plt.ylabel("Count")
plt.show()


In [None]:
# Distribution of hourly rates and costs
plt.figure(figsize=(12, 5))

# One histogram per panel: (column, title, x-axis label, optional x-axis limits)
sample_panels = [
    ("HOURLY_RATE", "Distribution of Hourly Rates", "Hourly Rate", None),
    ("COST", "Distribution of Costs", "Cost", (0, 500)),
]
for position, (column, title, x_label, x_limits) in enumerate(sample_panels, start=1):
    plt.subplot(1, 2, position)
    sns.histplot(sample_df[column].dropna())
    plt.title(title)
    # Only the cost panel restricts the visible x range
    if x_limits is not None:
        plt.xlim(*x_limits)
    plt.xlabel(x_label)
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

## Data Cleaning and Preprocessing

In [None]:
def standardize_language(lang, database, cutoff=0.0):
    """
    Standardize a language name to its closest match in a reference list

    Matching is case-insensitive and uses difflib's similarity ratio
    (difflib.get_close_matches), not a true Levenshtein distance.

    Parameters:
    lang (str): Language to standardize
    database (list): List of reference language names; non-string entries
        (e.g. NaN) are ignored
    cutoff (float): Minimum similarity in [0, 1] required to accept a match.
        The default of 0 always accepts the closest reference name

    Returns:
    str: Standardized language name, spelled as it appears in `database`.
        `lang` is returned unchanged when it is not a string or when no
        reference name qualifies (including an empty `database`)
    """
    # Missing values (None/NaN) cannot be matched; pass them through untouched
    if not isinstance(lang, str):
        return lang

    # Map each lowercased name to its original spelling. Keeping the pair
    # together avoids indexing `database` with a position taken from a
    # filtered list, which is wrong as soon as a non-string entry is skipped.
    # setdefault keeps the first spelling when names differ only by case.
    lowered_to_original = {}
    for name in database:
        if isinstance(name, str):
            lowered_to_original.setdefault(name.lower(), name)

    # Closest match by similarity ratio
    matches = difflib.get_close_matches(
        lang.lower(), list(lowered_to_original), n=1, cutoff=cutoff
    )
    if not matches:
        return lang

    return lowered_to_original[matches[0]]

# Database: sorted array of the distinct, non-null language names found in
# the source/target columns of the sample and translator data
language_columns = [
    sample_df["SOURCE_LANG"],
    sample_df["TARGET_LANG"],
    translators_df["SOURCE_LANG"],
    translators_df["TARGET_LANG"],
]
distinct_languages = pd.concat(language_columns).dropna().unique()
all_languages = np.sort(distinct_languages)

In [None]:
# Display the reference list of language names
all_languages

```python
# Example: standardize a single free-text language name
new_input_lang = "Spanish (Iberiano)"
standardized_lang = standardize_language(new_input_lang, all_languages)
print(standardized_lang)


# Example using Dataframe: one record with misspelled language names
new_record = pd.DataFrame([{
    "PROJECT_ID": 213495,
    "PM": "JSM",
    "TASK_ID": 10048286,
    "START": "2014-11-27 09:00:00",
    "END": "2014-11-27 13:00:00",
    "TASK_TYPE": "Translation",
    "SOURCE_LANG": "Spanishh (Iberian)",
    "TARGET_LANG": "Englisasdh",
    "TRANSLATOR": "Carlos",
    "ASSIGNED": "2014-11-27 08:30:00",
    "CLOSE": "2014-11-27 14:00:00",
    "FORECAST": 0.5,
    "HOURLY_RATE": 30,
    "COST": 15.0,
    "QUALITY_EVALUATION": 8,
    "MANUFACTURER": "Global Tech",
    "MANUFACTURER_SECTOR": "Technology",
    "MANUFACTURER_INDUSTRY_GROUP": "Software & Services",
    "MANUFACTURER_INDUSTRY": "IT Services",
    "MANUFACTURER_SUBINDUSTRY": "Data Processing & Outsourced Services"
}])

# Same object under a second name (no copy is made)
new_record_df = new_record

# Record before standardization
display(new_record_df)

# Replace both language columns with their closest reference names
for lang_col in ("SOURCE_LANG", "TARGET_LANG"):
    new_record_df[lang_col] = new_record_df[lang_col].apply(standardize_language, database=all_languages)

# Record after standardization
display(new_record_df)


````