In [12]:
"""
This script reads a CSV file containing original and corrected sentences in English.
It then cleans the dataset by:
1. Removing uncorrected rows and rows with missing data.
2. Ensuring both the "original" and "corrected" columns only contain string values.
3. Removing non-English characters from the sentences.
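4. Filtering out rows that are not genuine corrections, i.e. rows whose corrected sentence length falls outside a lower/upper threshold band relative to the original sentence length.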
"""
import pandas as pd
import re
# from scripts.cosine_lev import calculate_cosine_similarity
# from scripts.lev_distance import levenshtein_distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from multiprocessing import Pool, cpu_count


# # Get the number of available CPU cores
# num_cores = multiprocessing.cpu_count()

# # Calculate the number of processors to use as max processors minus 1
# num_processors = num_cores - 1

# pool = multiprocessing.Pool()

# Read the csv file.
# The path is relative to the notebook's working directory -- the same convention
# the final to_csv export in this notebook uses -- so the notebook is not tied to
# a single user's home directory.
file_path = "../2. Raw Data/Japanese_to_English.csv"
df = pd.read_csv(file_path)
print(df.shape)

(2408334, 2)


In [13]:
def drop_uncorrected_and_missing_rows(df):
    """
    Keep only rows that hold a real string pair in "original" and "corrected".

    A row is removed when any of the following holds:
      * its "corrected" value is the literal marker "Uncorrected";
      * either "original" or "corrected" is missing (NaN/None);
      * either "original" or "corrected" is not a ``str`` instance.

    The input frame is not modified; the surviving rows keep their original
    index labels.

    Args:
        df (pd.DataFrame): DataFrame containing the original and corrected sentences.

    Returns:
        pd.DataFrame: DataFrame with uncorrected, missing and non-string rows removed.

    Raises:
        KeyError: If the "original" or "corrected" column is absent.
    """
    text_columns = ["original", "corrected"]

    df = df[df["corrected"] != "Uncorrected"].dropna(subset=text_columns)

    # Per-column Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; Series.map behaves the same on old and new versions.
    is_text = pd.Series(True, index=df.index)
    for column in text_columns:
        is_text &= df[column].map(lambda value: isinstance(value, str)).astype(bool)

    return df[is_text]
# Apply the first cleaning stage to the raw frame and report the resulting (rows, columns).
df_cleaned = drop_uncorrected_and_missing_rows(df)
print(df_cleaned.shape)

(1011172, 2)


In [14]:
def remove_non_english_chars(df):
    """
    Strip disallowed characters from the "original" and "corrected" columns.

    Every character that is not an ASCII letter, a digit, whitespace (``\\s``)
    or one of ``. , ! ? ' -`` is deleted from each string cell. Non-string
    cells are passed through unchanged.

    The caller's frame is left untouched: a new DataFrame is returned, which
    also avoids pandas' SettingWithCopyWarning when ``df`` is itself a filtered
    slice of another frame.

    Args:
        df (pd.DataFrame): DataFrame containing the original and corrected sentences.

    Returns:
        pd.DataFrame: New DataFrame with the disallowed characters removed;
        row count, index and column order are the same as in ``df``.

    Raises:
        KeyError: If the "original" or "corrected" column is absent.
    """
    # Compiled once and shared by both columns instead of repeating the literal.
    disallowed = re.compile(r"[^a-zA-Z0-9\s.,!?'-]+")

    def strip_disallowed(value):
        # Non-string cells (e.g. NaN) are returned as-is.
        return disallowed.sub("", value) if isinstance(value, str) else value

    return df.assign(
        original=df["original"].apply(strip_disallowed),
        corrected=df["corrected"].apply(strip_disallowed),
    )
# Print the shape before and after character stripping. The function only edits
# cell text, so the two shapes are expected to match.
print(df_cleaned.shape)
df_cleaned = remove_non_english_chars(df_cleaned)
print(df_cleaned.shape)

(1011172, 2)
(1011172, 2)


In [15]:
def filter_genuine_corrections(df, lower_threshold=0.75, upper_threshold=1.25):
    """
    Keep rows whose corrected sentence length is close to the original's.

    A row is kept when

        lower_threshold * len(original) <= len(corrected) <= upper_threshold * len(original)

    where ``len`` is the character count of the cell. Both bounds are
    inclusive. The thresholds therefore set how much the corrected sentence
    may shrink or grow, relative to the original, and still be treated as a
    genuine correction.

    The comparison is vectorised with the pandas string accessor rather than a
    row-wise ``apply``, which matters for frames with many rows. Cells that are
    not strings yield a missing length and the row is dropped.

    Args:
        df (pd.DataFrame): DataFrame containing the original and corrected sentences.
        lower_threshold (float): Minimum allowed ratio of corrected length to original length.
        upper_threshold (float): Maximum allowed ratio of corrected length to original length.

    Returns:
        pd.DataFrame: DataFrame containing genuine corrections only; the
        surviving rows keep their original index labels.

    Raises:
        KeyError: If the "original" or "corrected" column is absent.
    """
    original_len = df["original"].str.len()
    corrected_len = df["corrected"].str.len()

    # Series.between is inclusive on both ends by default, matching `<= ... <=`.
    within_band = corrected_len.between(
        lower_threshold * original_len,
        upper_threshold * original_len,
    )
    return df[within_band]
# Print the shape before and after the length-ratio filter to show how many rows it removes.
print(df_cleaned.shape)
df_cleaned = filter_genuine_corrections(df_cleaned)
print(df_cleaned.shape)


(1011172, 2)
(789199, 2)


In [16]:
# Persist the cleaned sentence pairs; index=False keeps the DataFrame index out of the CSV.
# NOTE(review): the file name ends in "_cosine" and the target folder is "2. Raw Data",
# but no cosine similarity is computed in the visible cells and the data is cleaned,
# not raw -- confirm the intended name/location.
df_cleaned.to_csv("../2. Raw Data/Japanese_to_English_cosine.csv", index=False)