In [100]:
from ast import literal_eval
import pandas as pd
import emoji
from datasets.arrow_dataset import Dataset
from datasets import Features, Value, ClassLabel
from datasets.info import DatasetInfo

# Let pandas show up to 256 characters per cell when rendering frames.
pd.options.display.max_colwidth = 256

In [101]:
# Minimum number of words a cleaned title must contain; rows whose
# title has fewer words are dropped in the word-count filter cell.
MIN_TITLE_WORDS = 2
# Minimum length of a single title word; shorter words are replaced by a
# space during title cleaning.
MIN_TITLE_WORD_LENGTH = 2

In [None]:
# Columns that are parsed with `literal_eval` while the CSV is read
# (presumably serialized Python lists — confirm against the export step).
literal_columns = ("category_verticals", "types", "services", "image_urls")

df_listings = pd.read_csv(
    "../../data/listings.csv",
    converters={column: literal_eval for column in literal_columns},
)

In [103]:
# Only use parent_category_id 1098 (Services and Professionals)
PARENT_CATEGORY_ID = 1098

in_parent_category = df_listings["parent_category_id"].eq(PARENT_CATEGORY_ID)
df_listings = df_listings.loc[in_parent_category]
print(f"Remaining listings: {len(df_listings.index)}")

Remaining listings: 21741


In [104]:
# Remove listings with duplicate title+description pairs; of each group of
# duplicates only the last occurrence is kept.
duplicate_key_columns = ["title", "description"]

df_listings = df_listings.drop_duplicates(subset=duplicate_key_columns, keep="last")
print(f"Remaining listings: {len(df_listings.index)}")

Remaining listings: 21313


In [None]:
# Remove irrelevant categories
# NOTE(review): this list is empty, so the filter below currently removes
# nothing. The recorded output of this cell shows fewer rows than the previous
# cell, so the IDs were presumably cleared after the last run — confirm and
# restore them if the exclusion is still wanted.
REMOVE_CHILD_CATEGORY_IDS: list[int] = []

# `~mask` is the idiomatic negation of a boolean Series (instead of `== False`).
# `.copy()` makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
is_removed_category = df_listings["child_category_id"].isin(REMOVE_CHILD_CATEGORY_IDS)
df_listings = df_listings[~is_removed_category].copy()

print(f"Remaining listings: {len(df_listings.index)}")

Remaining listings: 21258


In [106]:
# Split category verticals: expand each row's `category_verticals` list into
# one column per element, keeping the listing index so rows stay aligned.
vertical_columns = ["category", "parent_category"]
verticals = pd.DataFrame(df_listings["category_verticals"].to_list(), index=df_listings.index)
df_listings[vertical_columns] = verticals

In [107]:
# This cell used to repeat the "Split category verticals" assignment from the
# previous cell verbatim. Recomputing it adds nothing, so the cell now only
# verifies that the split columns are present before they are used below.
missing_vertical_columns = {"category", "parent_category"} - set(df_listings.columns)
assert not missing_vertical_columns, f"Missing category columns: {missing_vertical_columns}"

In [108]:
# Remove listings with no `category` value.
# Reassigning instead of `inplace=True` keeps the cell chainable and avoids
# mutating a frame that was produced by earlier filtering steps.
df_listings = df_listings.dropna(subset=["category"])

print(f"Remaining listings: {len(df_listings.index)}")

Remaining listings: 21258


In [109]:
# Normalize the listing titles. Every pass works on `titles`; the cleaned
# result is written back to the frame once at the end.
titles = df_listings["title"]

# remove punctuation (anything that is neither a word character nor whitespace)
titles = titles.str.replace(r"[^\w\s]", " ", regex=True)

# Remove lone numbers (not part of an alphanumeric).
# `\b\d+\b` matches only digit runs that form a complete token. The previous
# pattern `\W+\d+\W*` required a separator in front of the digits, so it missed
# a number at the very start of a title, stripped the leading digits of tokens
# such as "3d", and skipped every second number in a run of numbers because it
# consumed the separator between them.
titles = titles.str.replace(r"\b\d+\b", " ", regex=True)

# remove emojis
# NOTE(review): most emoji symbols are probably already replaced by the
# punctuation pass above, since they are not word characters — confirm whether
# this pass is still needed or should run first.
titles = titles.apply(lambda s: emoji.replace_emoji(s, ""))

# Remove words with length less than MIN_TITLE_WORD_LENGTH.
# With a minimum of 1 there is nothing to remove, and the quantifier would
# become the invalid `{1,0}`, so the pass is skipped in that case.
if MIN_TITLE_WORD_LENGTH > 1:
    short_word_pattern = r"\b\w{1," + str(MIN_TITLE_WORD_LENGTH - 1) + r"}\b"
    titles = titles.str.replace(short_word_pattern, " ", regex=True)

# remove excess whitespace, then trim leading/trailing whitespace
titles = titles.str.replace(r"\s+", " ", regex=True).str.strip()

df_listings["title"] = titles

In [None]:
# Keep only rows whose title has at least MIN_TITLE_WORDS whitespace-separated
# words; the count is stored in `title_word_count`.
df_listings["title_word_count"] = df_listings["title"].str.split().str.len()
has_enough_words = df_listings["title_word_count"] >= MIN_TITLE_WORDS
df_listings = df_listings[has_enough_words]

print(f"Remaining listings: {len(df_listings.index)}")
df_listings.sample(5)

In [None]:
# Read the category table (columns `label`, `name`) from categories.csv.
CATEGORIES_CSV_PATH = "../../data/categories.csv"
df_categories = pd.read_csv(CATEGORIES_CSV_PATH, converters={"label": literal_eval})

if df_categories.empty:
    # construct categories dataset with numeric labels: one row per distinct
    # listing category, sorted by name, labelled 0..n-1 in that order
    df_categories = (
        df_listings["category"]
        .to_frame(name="name")
        .drop_duplicates(subset=["name"])
        .sort_values("name", ascending=True)
        .reset_index(drop=True)
    )
    df_categories["label"] = df_categories.index
    df_categories = df_categories[["label", "name"]]
    df_categories.to_csv(CATEGORIES_CSV_PATH, index=False)

# Every category that occurs in the listings must be present in categories.csv.
# Comparing the actual names (instead of only the number of unique names)
# also catches the case where the file has as many or more entries but lacks
# a category that the listings use, which would otherwise map to a NaN label.
exp_categories: set[str] = set(df_categories["name"].to_list())
categories: set[str] = set(df_listings["category"].to_list())
category_diff = categories.difference(exp_categories)
if category_diff:
    raise ValueError(
        f"Got difference in categories. Resolve manually in categories.csv: {category_diff}"
    )

# category name -> integer label
categories_dict: dict[str, int] = pd.Series(
    df_categories["label"].values, index=df_categories["name"]
).to_dict()
df_categories.sample(5)

Unnamed: 0,label,name
71,71,repair_and_maintenance_equipment
40,40,group_outings_staff_parties
65,65,rentals_boats
18,18,brokers_and_appraisers
75,75,repair_and_maintenance_trailers_and_campers


In [None]:
# Build the (title, label) frame used for category classification.
# `.copy()` detaches the column subset from `df_listings`, so adding the
# `label` column below is not a chained assignment (no SettingWithCopyWarning).
df_categories_dataset = df_listings[["title", "category"]].copy()
# Translate each category name to its integer label via `categories_dict`.
df_categories_dataset["label"] = df_categories_dataset["category"].map(categories_dict)
df_categories_dataset = df_categories_dataset[["title", "label"]]
df_categories_dataset.sample(5)

In [None]:
# Save the category classification dataset to a file

# ClassLabel interprets the integer i as names[i], so the names have to be
# listed in label order. Dict insertion order follows the row order of
# categories.csv, which is not guaranteed to be sorted by label.
category_labels = sorted(categories_dict.values())
if category_labels != list(range(len(category_labels))):
    raise ValueError(
        "Labels in categories.csv must be the contiguous integers 0..n-1 "
        "to be usable as ClassLabel ids"
    )
category_names = [
    name for name, _ in sorted(categories_dict.items(), key=lambda item: item[1])
]

categories_features = Features(
    {"title": Value(dtype="string"), "label": ClassLabel(names=category_names)}
)
categories_info = DatasetInfo(
    description="Marktplaats Listing Titles/Categories", features=categories_features
)
# preserve_index=False keeps the pandas index out of the stored dataset.
categories_dataset = Dataset.from_pandas(
    df=df_categories_dataset,
    info=categories_info,
    features=categories_features,
    preserve_index=False,
)
categories_dataset.save_to_disk("../../data/category_classification")