# Data Analysis

### Setup

In [132]:
# Libraries
import os
import sys
import re

import time
import datetime

import difflib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Directory holding the raw CSV files (relative path, resolved against the
# notebook's working directory)
data_path = "../data/raw/"

In [14]:
# Read each raw CSV under data_path into its own DataFrame (same order as the
# names on the left-hand side)
clients_df, sample_df, schedules_df, translators_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        "clients.csv",
        "sample.csv",
        "schedules.csv",
        "translatorsCostPairs.csv",
    )
)

### Overview

In [15]:
# Preview the first rows of every loaded table, each under its own label
for title, frame in (
    ("CSV Client Data", clients_df),
    ("CSV Sample Data", sample_df),
    ("CSV Schedules Data", schedules_df),
    ("CSV Translators Data", translators_df),
):
    print(title)
    display(frame.head())

CSV Client Data


Unnamed: 0,CLIENT_NAME,SELLING_HOURLY_PRICE,MIN_QUALITY,WILDCARD
0,Accesstra,25,7.0,Quality
1,AccuBank Industries,30,7.0,Deadline
2,AccuBuild Industries,35,7.5,Quality
3,AccuPulse,25,7.5,Deadline
4,Accura Systems,40,7.5,Price


CSV Sample Data


Unnamed: 0,PROJECT_ID,PM,TASK_ID,START,END,TASK_TYPE,SOURCE_LANG,TARGET_LANG,TRANSLATOR,ASSIGNED,...,CLOSE,FORECAST,HOURLY_RATE,COST,QUALITY_EVALUATION,MANUFACTURER,MANUFACTURER_SECTOR,MANUFACTURER_INDUSTRY_GROUP,MANUFACTURER_INDUSTRY,MANUFACTURER_SUBINDUSTRY
0,213494,KMT,10048285,2014-11-26 10:36:00,2014-11-26 15:30:00,Engineering,English,Portuguese (Brazil),Estela,2014-11-26 16:34:23,...,2014-11-26 17:51:48,0.25,24,6.0,7,Coastal Cottage,Consumer Discretionary,Consumer Services,"Hotels, Restaurants & Leisure","Hotels, Resorts & Cruise Lines"
1,214198,KMT,10048285,2015-09-09 17:29:00,2015-09-10 11:00:00,Engineering,English,Spanish (Iberian),Jeronimo,2015-09-09 17:31:52,...,2015-09-10 17:33:07,1.5,20,30.0,7,HealthyLife,Health Care,Health Care Providers,Health Care Facilities,Long-Term Care Facilities
2,213094,PMT,10048285,2014-05-01 19:27:00,2014-05-02 19:00:00,Engineering,Catalan,Catalan,Octavi,2014-05-02 13:29:33,...,2014-05-02 13:29:44,0.33,15,4.95,6,AeroSysTech,Information Technology,Software & Services,Application Software,Systems Software
3,211967,KMT,10048285,2013-07-17 17:40:00,2013-08-31 18:00:00,Management,English,Spanish (Iberian),Ramiro Josafat,2013-07-22 15:35:11,...,2013-08-31 16:28:18,0.5,20,10.0,6,MetaPro,Industrials,Electrical Equipment,Electrical Components & Equipment,Electrical Components & Equipment
4,212331,PMT,10048285,2013-11-01 13:13:00,2013-10-31 19:00:00,Miscellaneous,Catalan,Catalan,Victor,2013-10-31 13:14:35,...,2013-10-31 13:18:34,0.0,11,0.0,8,SunTech,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals"


CSV Schedules Data


Unnamed: 0,NAME,START,END,MON,TUES,WED,THURS,FRI,SAT,SUN
0,Abigail,8:00:00 AM,6:00:00 PM,1,1,1,1,1,0,0
1,Abelardo,6:00:00 AM,4:00:00 PM,1,1,1,1,1,1,1
2,Margarita,8:00:00 AM,6:00:00 PM,1,1,1,1,1,1,1
3,Davide,9:00:00 AM,7:00:00 PM,1,1,1,1,1,1,1
4,Paul,7:00:00 AM,5:00:00 PM,1,1,1,1,1,1,1


CSV Translators Data


Unnamed: 0,TRANSLATOR,SOURCE_LANG,TARGET_LANG,HOURLY_RATE
0,Aaron,English,Basque,27
1,Aaron,English,Catalan,17
2,Aaron,English,Galician,20
3,Aaron,English,German,32
4,Aaron,English,Portuguese (Brazil),21


### Inspection

In [89]:
# Label -> DataFrame mapping (a dict, despite the name) that the inspection
# cells iterate over
tuples_df = dict(
    Clients=clients_df,
    Sample=sample_df,
    Schedules=schedules_df,
    Translators=translators_df,
)

In [91]:
print("MISSING VALUES", "\n")

# One total per dataset: null cells summed over every column
for name, df in tuples_df.items():
    missing_total = df.isna().sum().sum()
    print(f"{name} Dataset", f"\t{missing_total} missing values", "", sep="\n")

MISSING VALUES 

Clients Dataset
	0 missing values

Sample Dataset
	0 missing values

Schedules Dataset
	0 missing values

Translators Dataset
	0 missing values



In [92]:
print("DUPLICATED VALUES", "\n")

# Count the rows that exactly repeat an earlier row, per dataset
for name, df in tuples_df.items():
    duplicate_rows = df.duplicated().sum()
    print(f"{name} Dataset")
    print(f"\t{duplicate_rows} duplicated values", end="\n\n")

DUPLICATED VALUES 

Clients Dataset
	0 duplicated values

Sample Dataset
	0 duplicated values

Schedules Dataset
	0 duplicated values

Translators Dataset
	0 duplicated values



In [93]:
print("UNIQUE VALUES", "\n")

for name, df in tuples_df.items():
    print(f"{name} Dataset")
    print("\tUnique Values:")
    # One line per column with its number of distinct (non-null) values
    count_lines = (f"\t\t{col}: {df[col].nunique()}\n" for col in df.columns)
    print("".join(count_lines), end="")
    print()

UNIQUE VALUES 

Clients Dataset
	Unique Values:
		CLIENT_NAME: 2646
		SELLING_HOURLY_PRICE: 14
		MIN_QUALITY: 4
		WILDCARD: 3

Sample Dataset
	Unique Values:
		PROJECT_ID: 1720
		PM: 4
		TASK_ID: 31009
		START: 18318
		END: 6440
		TASK_TYPE: 6
		SOURCE_LANG: 11
		TARGET_LANG: 18
		TRANSLATOR: 228
		ASSIGNED: 29949
		READY: 30984
		WORKING: 30977
		DELIVERED: 30985
		RECEIVED: 29801
		CLOSE: 29665
		FORECAST: 2472
		HOURLY_RATE: 35
		COST: 6824
		QUALITY_EVALUATION: 11
		MANUFACTURER: 250
		MANUFACTURER_SECTOR: 14
		MANUFACTURER_INDUSTRY_GROUP: 70
		MANUFACTURER_INDUSTRY: 85
		MANUFACTURER_SUBINDUSTRY: 92

Schedules Dataset
	Unique Values:
		NAME: 983
		START: 22
		END: 22
		MON: 2
		TUES: 2
		WED: 2
		THURS: 2
		FRI: 2
		SAT: 2
		SUN: 2

Translators Dataset
	Unique Values:
		TRANSLATOR: 983
		SOURCE_LANG: 40
		TARGET_LANG: 80
		HOURLY_RATE: 54



In [94]:
print("DATASETS", "\n")

for name, df in tuples_df.items():
    n_rows, n_cols = df.shape
    print(f"{name} Dataset")
    print(f"\t{n_rows} rows and {n_cols} columns")
    print("\tData Types:")
    # One line per column with the dtype pandas assigned to it
    dtype_lines = (f"\t\t{col}: {dtype}\n" for col, dtype in df.dtypes.items())
    print("".join(dtype_lines), end="")
    print()

DATASETS 

Clients Dataset
	2646 rows and 4 columns
	Data Types:
		CLIENT_NAME: object
		SELLING_HOURLY_PRICE: int64
		MIN_QUALITY: float64
		WILDCARD: object

Sample Dataset
	31017 rows and 24 columns
	Data Types:
		PROJECT_ID: object
		PM: object
		TASK_ID: int64
		START: object
		END: object
		TASK_TYPE: object
		SOURCE_LANG: object
		TARGET_LANG: object
		TRANSLATOR: object
		ASSIGNED: object
		READY: object
		WORKING: object
		DELIVERED: object
		RECEIVED: object
		CLOSE: object
		FORECAST: float64
		HOURLY_RATE: int64
		COST: float64
		QUALITY_EVALUATION: int64
		MANUFACTURER: object
		MANUFACTURER_SECTOR: object
		MANUFACTURER_INDUSTRY_GROUP: object
		MANUFACTURER_INDUSTRY: object
		MANUFACTURER_SUBINDUSTRY: object

Schedules Dataset
	983 rows and 10 columns
	Data Types:
		NAME: object
		START: datetime64[ns]
		END: datetime64[ns]
		MON: int64
		TUES: int64
		WED: int64
		THURS: int64
		FRI: int64
		SAT: int64
		SUN: int64

Translators Dataset
	4690 rows and 4 columns
	Data Type

### CSV

In [95]:
def get_unique_values(df):
    """
    Collect the distinct values of every column of a DataFrame

    Parameters:
    df (pd.DataFrame): DataFrame whose columns are inspected

    Returns:
    dict: Column name -> result of Series.unique() for that column,
    in the DataFrame's column order
    """
    return {column: df[column].unique() for column in df.columns}


In [101]:
# Columns, per dataset label (same keys as tuples_df), whose distinct values
# are listed in the unique-values cell below
cols_useful = {
    "Clients": ["CLIENT_NAME", "WILDCARD"],
    "Sample": [
        "TASK_TYPE",
        "SOURCE_LANG",
        "TARGET_LANG",
        "TRANSLATOR",
        "MANUFACTURER",
        "MANUFACTURER_SECTOR",
        "MANUFACTURER_INDUSTRY_GROUP",
        "MANUFACTURER_INDUSTRY",
        "MANUFACTURER_SUBINDUSTRY",
    ],
    "Schedules": ["NAME"],
    "Translators": ["TRANSLATOR", "SOURCE_LANG", "TARGET_LANG", "HOURLY_RATE"],
}

In [None]:
print("UNIQUE VALUES", "\n")

# Distinct values of the selected columns, keyed by dataset label then column
unique_values = {
    name: get_unique_values(df[cols_useful[name]])
    for name, df in tuples_df.items()
}

# Render every column's distinct values as its own one-column table
for name, values in unique_values.items():
    print(f"{name} Dataset")
    for col, unique_vals in values.items():
        column_table = pd.DataFrame({col: unique_vals})
        display(column_table)
    print()

UNIQUE VALUES 

Clients Dataset


Unnamed: 0,CLIENT_NAME
0,Accesstra
1,AccuBank Industries
2,AccuBuild Industries
3,AccuPulse
4,Accura Systems
...,...
2641,Zenith Zippers
2642,ZenithSoft
2643,ZenithWorks Industrial Systems
2644,Zentech Machines


Unnamed: 0,WILDCARD
0,Quality
1,Deadline
2,Price



Sample Dataset


Unnamed: 0,TASK_TYPE
0,Engineering
1,Management
2,Miscellaneous
3,Translation
4,ProofReading
5,DTP


Unnamed: 0,SOURCE_LANG
0,English
1,Catalan
2,Spanish (Iberian)
3,French
4,German
5,Spanish (Global)
6,Dutch
7,Italian
8,Spanish (LA)
9,Portuguese (Brazil)


Unnamed: 0,TARGET_LANG
0,Portuguese (Brazil)
1,Spanish (Iberian)
2,Catalan
3,Galician
4,Spanish (Global)
5,English
6,Basque
7,Spanish (LA)
8,Portuguese (Iberian)
9,French


Unnamed: 0,TRANSLATOR
0,Estela
1,Jeronimo
2,Octavi
3,Ramiro Josafat
4,Victor
...,...
223,Gregorio Luis
224,Greta
225,Donato
226,Fiamma


Unnamed: 0,MANUFACTURER
0,Coastal Cottage
1,HealthyLife
2,AeroSysTech
3,MetaPro
4,SunTech
...,...
245,VidaCore Biotech
246,NexaPharm
247,ProximaMed
248,Workhorse Industries


Unnamed: 0,MANUFACTURER_SECTOR
0,Consumer Discretionary
1,Health Care
2,Information Technology
3,Industrials
4,Technology Hardware
5,Consumer Staples
6,Communication Services
7,Utilities
8,Materials
9,Financials


Unnamed: 0,MANUFACTURER_INDUSTRY_GROUP
0,Consumer Services
1,Health Care Providers
2,Software & Services
3,Electrical Equipment
4,Technology Hardware & Equipment
...,...
65,Public Fundation
66,IT Services
67,Health Care Equipment & Svcs.
68,Internet & Direct Marketing Retail


Unnamed: 0,MANUFACTURER_INDUSTRY
0,"Hotels, Restaurants & Leisure"
1,Health Care Facilities
2,Application Software
3,Electrical Components & Equipment
4,"Technology Hardware, Storage & Peripherals"
...,...
80,Public Fundation
81,Broadcasting & Cable TV
82,IT Consulting & Other Services
83,Health Care Providers & Services


Unnamed: 0,MANUFACTURER_SUBINDUSTRY
0,"Hotels, Resorts & Cruise Lines"
1,Long-Term Care Facilities
2,Systems Software
3,Electrical Components & Equipment
4,"Technology Hardware, Storage & Peripherals"
...,...
87,Capital Markets
88,Health Care Providers & Services
89,Internet Retail
90,"Tissue, Paper & Forest Products"



Schedules Dataset


Unnamed: 0,NAME
0,Abigail
1,Abelardo
2,Margarita
3,Davide
4,Paul
...,...
978,Genoveva
979,Sergi
980,Acolmiztli
981,Ana Clara



Translators Dataset


Unnamed: 0,TRANSLATOR
0,Aaron
1,Abdon
2,Abdon Isaias
3,Abdon Luis
4,Abel Irene
...,...
978,Zacarias Casio
979,Zacarias Marcelino
980,Zachary
981,Zlatan


Unnamed: 0,SOURCE_LANG
0,English
1,French
2,German
3,Italian
4,Catalan
5,Spanish (Global)
6,Spanish (Iberian)
7,Spanish (LA)
8,Dutch
9,English (UK)


Unnamed: 0,TARGET_LANG
0,Basque
1,Catalan
2,Galician
3,German
4,Portuguese (Brazil)
...,...
75,Qeqchi
76,Fulah
77,Scottish
78,German (Switzerland)


Unnamed: 0,HOURLY_RATE
0,27
1,17
2,20
3,32
4,21
5,18
6,14
7,19
8,25
9,24





### CSV Clients

In [115]:
# Distinct client names in the clients table
n_unique_clients = clients_df["CLIENT_NAME"].nunique()
print(f"Number of unique clients: {n_unique_clients}")

Number of unique clients: 2646


### CSV Schedules

In [116]:
# Parse dates
if schedules_df["START"].dtype == "object":
    schedules_df["START"] = pd.to_datetime(schedules_df["START"], errors="coerce")
    schedules_df["END"] = pd.to_datetime(schedules_df["END"], errors="coerce")

### CSV Translators Cost Pairs

In [117]:
# Headline cardinalities of the translator cost-pair table
n_translators = translators_df["TRANSLATOR"].nunique()
n_source_langs = translators_df["SOURCE_LANG"].nunique()
n_target_langs = translators_df["TARGET_LANG"].nunique()
n_language_pairs = translators_df.groupby(["SOURCE_LANG", "TARGET_LANG"]).ngroups

print(f"Number of unique translators: {n_translators}")
print(f"Number of source languages: {n_source_langs}")
print(f"Number of target languages: {n_target_langs}")
print(f"Number of unique language pairs: {n_language_pairs}")

Number of unique translators: 983
Number of source languages: 40
Number of target languages: 80
Number of unique language pairs: 300


### CSV Sample

In [118]:
# Distinct-value counts for the key columns of the sample table, computed once
sample_counts = sample_df[
    ["PROJECT_ID", "TASK_ID", "TRANSLATOR", "SOURCE_LANG", "TARGET_LANG", "TASK_TYPE"]
].nunique()

print(f"Number of unique projects: {sample_counts['PROJECT_ID']}")
print(f"Number of unique tasks: {sample_counts['TASK_ID']}")
print(f"Number of unique translators: {sample_counts['TRANSLATOR']}")
print(f"Number of source languages: {sample_counts['SOURCE_LANG']}")
print(f"Number of target languages: {sample_counts['TARGET_LANG']}")
print(f"Number of unique task types: {sample_counts['TASK_TYPE']}")

Number of unique projects: 1720
Number of unique tasks: 31009
Number of unique translators: 228
Number of source languages: 11
Number of target languages: 18
Number of unique task types: 6


## Data Cleaning and Preprocessing

In [173]:
def standardize_language(lang, database):
    """
    Standardize a language name to its closest match in a reference list

    Matching is case-insensitive and uses difflib.get_close_matches, i.e. the
    SequenceMatcher similarity ratio (not Levenshtein distance). Because
    cutoff=0, the best-scoring reference entry is always returned, however
    poor the match is.

    Parameters:
    lang (str): Language to standardize
    database (list): Reference language names; non-string entries are ignored

    Returns:
    str: The reference entry (with its original casing) closest to lang.
    A non-string lang (e.g. None or NaN) is returned unchanged.

    Raises:
    IndexError: If database contains no string entries to match against
    """
    # Missing / non-text values cannot be matched; pass them through untouched
    if not isinstance(lang, str):
        return lang

    # Map each lowercased name to the first original entry that produced it.
    # Looking the match up here (instead of by position in the unfiltered
    # database) keeps the result correct when non-string entries are skipped.
    originals = {}
    for entry in database:
        if isinstance(entry, str):
            originals.setdefault(entry.lower(), entry)

    # cutoff=0 accepts any score, so the list is only empty when there are no candidates
    closest = difflib.get_close_matches(lang.lower(), list(originals), n=1, cutoff=0)[0]

    return originals[closest]

# Database: sorted array of every distinct, non-null language name that
# appears as a source or target in the sample or translators tables
language_columns = [
    sample_df["SOURCE_LANG"],
    sample_df["TARGET_LANG"],
    translators_df["SOURCE_LANG"],
    translators_df["TARGET_LANG"],
]
distinct_languages = pd.concat(language_columns).dropna().unique()
all_languages = np.sort(distinct_languages)

In [175]:
# Show the sorted reference array of language names
all_languages

array(['Afrikaans', 'Arabic', 'Asturian', 'Aymara', 'Azeri', 'Bahasa',
       'Basque', 'Belorussian', 'Bulgarian', 'Catalan',
       'Chinese (Simplified)', 'Chinese(Traditional)', 'Chuj', 'Croat',
       'Czech', 'Danish', 'Dutch', 'English', 'English (UK)',
       'English (US)', 'Estonian', 'Finnish', 'Flemish', 'French',
       'French (Canadian)', 'French (Morocco)', 'French (Switzerland)',
       'Fulah', 'Galician', 'German', 'German (Switzerland)', 'Greek',
       'Guaraní', 'Hebrew', 'Hindi', 'Hungarian', 'Indonesian', 'Irish',
       'Italian', 'Japanese', 'Kaqchikel', 'Kazakh', 'Korean', 'Latvian',
       'Lithuanian', 'Majorcan', 'Maltese', 'Mam', 'Mixteco', 'Nahuatl',
       'Norwegian', 'Persian', 'Polish', 'Portuguese (Brazil)',
       'Portuguese (Iberian)', 'Portuguese (SOURCE)', 'Qeqchi', 'Quechua',
       'Quiche', 'Romanian', 'Russian', 'Scottish', 'Serbian', 'Slovak',
       'Slovenian', 'Spanish (Argentina)', 'Spanish (Chile)',
       'Spanish (Global)', 'Spanish

```python
# Example: a misspelled name resolves to its closest reference entry
new_input_lang = "Spanish (Iberiano)"

# Standardize the language
standardized_lang = standardize_language(new_input_lang, all_languages)
print(standardized_lang)


# Example using a DataFrame: one record with misspelled language names
new_record = pd.DataFrame([{
    "PROJECT_ID": 213495,
    "PM": "JSM",
    "TASK_ID": 10048286,
    "START": "2014-11-27 09:00:00",
    "END": "2014-11-27 13:00:00",
    "TASK_TYPE": "Translation",
    "SOURCE_LANG": "Spanishh (Iberian)",
    "TARGET_LANG": "Englisasdh",
    "TRANSLATOR": "Carlos",
    "ASSIGNED": "2014-11-27 08:30:00",
    "CLOSE": "2014-11-27 14:00:00",
    "FORECAST": 0.5,
    "HOURLY_RATE": 30,
    "COST": 15.0,
    "QUALITY_EVALUATION": 8,
    "MANUFACTURER": "Global Tech",
    "MANUFACTURER_SECTOR": "Technology",
    "MANUFACTURER_INDUSTRY_GROUP": "Software & Services",
    "MANUFACTURER_INDUSTRY": "IT Services",
    "MANUFACTURER_SUBINDUSTRY": "Data Processing & Outsourced Services"
}])

# Work on a copy so the raw record above is not mutated and can be compared
new_record_df = new_record.copy()

# Display the record before standardization
display(new_record_df)

# Standardize both language columns against the reference list
for lang_col in ("SOURCE_LANG", "TARGET_LANG"):
    new_record_df[lang_col] = new_record_df[lang_col].apply(
        standardize_language, database=all_languages
    )

# Display the record after standardization
display(new_record_df)


```