In [4]:
"""
This script reads a CSV file containing original and corrected sentences in English.
It then cleans the dataset by:
1. Removing uncorrected rows and rows with missing data.
2. Ensuring both the "original" and "corrected" columns only contain string values.
3. Removing non-English characters from the sentences.
4. Filtering out rows that are not genuine corrections based on a threshold.
"""
import pandas as pd
import re

# Read the csv file
file_path = "/Users/stefanhall/Documents/Studies/MDSI/ANLP/AT2/anlp-at2-gpt45/2. Raw Data/Japanese_to_English.csv"
df = pd.read_csv(file_path)

In [5]:

def drop_uncorrected_and_missing_rows(df):
    """
    Remove rows with uncorrected sentences and missing data, and ensure both "original"
    and "corrected" columns only contain string values.
    
    Args:
        df (pd.DataFrame): DataFrame containing the original and corrected sentences.
    
    Returns:
        pd.DataFrame: DataFrame with uncorrected, missing and non-string rows removed.
    """
    df = df[df["corrected"] != "Uncorrected"]
    df = df.dropna(subset=["original", "corrected"])
    df = df[df[["original", "corrected"]].applymap(lambda x: isinstance(x, str)).all(axis=1)]
    return df

df_cleaned = drop_uncorrected_and_missing_rows(df)

In [6]:

def remove_non_english_chars(df):
    """
    Remove non-English characters from the "original" and "corrected" columns.
    
    Args:
        df (pd.DataFrame): DataFrame containing the original and corrected sentences.
    
    Returns:
        pd.DataFrame: DataFrame with non-English characters removed.
    """
    df["original"] = df["original"].apply(lambda x: re.sub(r"[^a-zA-Z0-9\s.,!?'-]+", "", x) if isinstance(x, str) else x)
    df["corrected"] = df["corrected"].apply(lambda x: re.sub(r"[^a-zA-Z0-9\s.,!?'-]+", "", x) if isinstance(x, str) else x)
    return df

df_cleaned = remove_non_english_chars(df_cleaned)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["original"] = df["original"].apply(lambda x: re.sub(r"[^a-zA-Z0-9\s.,!?'-]+", "", x))


TypeError: expected string or bytes-like object

In [None]:




def filter_genuine_corrections(df, threshold=0.75):
    """
    Filter out rows that are not genuine corrections based on a threshold.
    
    Args:
        df (pd.DataFrame): DataFrame containing the original and corrected sentences.
        threshold (float): Threshold for considering a row as a genuine correction.
        
    Returns:
        pd.DataFrame: DataFrame containing genuine corrections only.
    """
    def is_genuine_correction(row):
        if isinstance(row["original"], str) and isinstance(row["corrected"], str):
            return len(row["corrected"]) <= threshold * len(row["original"])
        return False

    return df[df.apply(is_genuine_correction, axis=1)]

df_cleaned = filter_genuine_corrections(df_cleaned)
