In [1]:
from pathlib import Path
from typing import Any, Dict, List

import kagglehub
import pandas as pd

# Lift pandas' display truncation limits: passing None removes the cap on
# rows, columns and the width of each cell's text.
for _display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [2]:
def combine_text_columns(
    df: pd.DataFrame,
    columns_to_combine: List[str],
    output_column: str = "Combined_Text_Col"
) -> pd.DataFrame:
    """
    Combine several text columns into one space-separated column.

    For every row, missing values are replaced by empty strings, each value is
    converted to ``str``, the pieces are joined with a single space in the
    order given by ``columns_to_combine``, and leading/trailing whitespace is
    stripped from the joined result.

    Note:
        The new column is written onto ``df`` itself (the frame is modified in
        place) and that same object is returned. Pass ``df.copy()`` if the
        input frame must stay untouched.

    Args:
        df (pd.DataFrame): The input DataFrame containing the text columns.
        columns_to_combine (List[str]): Names of the columns to combine, in
            the order they should appear in the combined text.
        output_column (str): Name of the column that receives the combined
            text. Defaults to ``"Combined_Text_Col"``, the name this function
            wrote before the parameter was honoured, so calls that rely on the
            default keep producing the same column.

    Returns:
        pd.DataFrame: ``df`` with the combined text stored in ``output_column``.

    Raises:
        TypeError: If ``df`` is not a DataFrame, or ``columns_to_combine`` is
            not a list of strings.
        ValueError: If any requested column is absent from ``df``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")

    if not isinstance(columns_to_combine, list) or not all(isinstance(col, str) for col in columns_to_combine):
        raise TypeError("columns_to_combine must be a list of column name strings.")

    missing_cols = [col for col in columns_to_combine if col not in df.columns]
    if missing_cols:
        raise ValueError(f"The following columns are missing from the DataFrame: {missing_cols}")

    # Normalise every selected cell to a plain string (NaN/None -> "").
    text_values = df[columns_to_combine].fillna("").astype(str).to_numpy()

    # Join row by row in Python rather than via DataFrame.agg(axis=1): the agg
    # route returns a float Series for a zero-row frame, which has no .str
    # accessor and made the original chain raise on empty input.
    combined = [" ".join(row).strip() for row in text_values]

    # Reuse df's own index so the assignment lines up row-for-row.
    df[output_column] = pd.Series(combined, index=df.index, dtype=str)

    return df

In [None]:
# Load the dataset.
# Resolve the file through kagglehub rather than a machine-specific absolute
# path: dataset_download returns the local cache directory for the dataset
# (downloading it first if it is not cached yet), so the cell runs on any
# machine. The handle pins version 12, the version the original path pointed at.
DATASET_HANDLE = "tobiasbueck/multilingual-customer-support-tickets/versions/12"
DATASET_FILENAME = "aa_dataset-tickets-multi-lang-5-2-50-version.csv"
RAW_DATA_DIR = Path("../data/raw")

dataset_dir = Path(kagglehub.dataset_download(DATASET_HANDLE))
raw_multi_df = pd.read_csv(dataset_dir / DATASET_FILENAME)

# Save a copy to the project data folder, creating the folder on first run.
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
raw_multi_df.to_csv(RAW_DATA_DIR / DATASET_FILENAME, index=False)

In [None]:
# Display one randomly chosen row of the raw data as a quick sanity check.
raw_multi_df.sample(1)

Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8
27255,Assistance with Digital Strategies for Brand Visibility,"Inquiring about digital strategies to enhance brand visibility using the Hemingway Editor. Would appreciate detailed information and examples of case studies that demonstrate the effectiveness of these strategies. Additionally, interested in understanding how using the Hemingway Editor can benefit the brand's online presence. Looking forward to hearing back soon.","Thank you for your interest in digital strategies for enhancing brand visibility with the Hemingway Editor. Our team uses this tool to simplify and clarify content, making it more accessible to a wider audience and thereby improving the brand's online presence. We have seen significant success with this approach, as evidenced by various case studies where brands have experienced increased engagement and visibility after implementing these strategies. Let's schedule a call at your convenience to discuss the specific benefits and solutions in more detail.",Request,Technical Support,high,en,400,Feedback,Feature,Documentation,Sales,,,,


In [8]:
# Run the combine function.
# combine_text_columns writes the new column onto the frame it receives, so a
# copy is passed in: raw_multi_df keeps exactly the columns it was loaded with
# and df_combo_text is an independent frame rather than an alias of the raw data.
df_combo_text = combine_text_columns(
    raw_multi_df.copy(),
    ["subject", "body"],
    output_column="Combined_Text_Col",
)
df_combo_text.sample(n=1)

Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,Combined_Text_Col
3614,Security Breach in Data Security,A data security breach happened due to identified vulnerabilities. Efforts included scan attempts and updates.,We are investigating the reported data security breach. Kindly provide details on the vulnerabilities discovered during scans and any updates attempted. A call may be necessary to discuss the matter further; please let us know a convenient time at <tel_num> to review the breach details for account <acc_num>.,Problem,Product Support,medium,en,52,Security,Data Security,Vulnerability,Breach,Threat,Incident,IT,,Security Breach in Data Security A data security breach happened due to identified vulnerabilities. Efforts included scan attempts and updates.
