1. Imports + Setup

In [3]:
import pandas as pd
import json
from pathlib import Path

print("Environment ready.")


Environment ready.


In [4]:
from google.colab import drive
drive.mount('/content/drive')
BASE_DIR = Path("/content/drive/MyDrive/complaint_intel_project/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
RAW_DIR = BASE_DIR / "data/raw"
PROC_DIR = BASE_DIR / "data/processed"
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

In [8]:
NOTEBOOKS_DIR = BASE_DIR / "notebooks"
NOTEBOOKS_DIR.mkdir(parents=True, exist_ok=True)

2. Download dataset


In [15]:
!wget -O /content/consumer_complaints.zip \
  https://files.consumerfinance.gov/ccdb/complaints.csv.zip

!unzip /content/consumer_complaints.zip -d /content/


--2025-11-22 20:24:47--  https://files.consumerfinance.gov/ccdb/complaints.csv.zip
Resolving files.consumerfinance.gov (files.consumerfinance.gov)... 23.194.116.155, 23.194.116.159, 2600:1408:c400:13::17d4:f8c6, ...
Connecting to files.consumerfinance.gov (files.consumerfinance.gov)|23.194.116.155|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1619063867 (1.5G) [binary/octet-stream]
Saving to: ‘/content/consumer_complaints.zip’


2025-11-22 20:24:58 (136 MB/s) - ‘/content/consumer_complaints.zip’ saved [1619063867/1619063867]

Archive:  /content/consumer_complaints.zip
  inflating: /content/complaints.csv  


In [16]:
import shutil
shutil.move("/content/complaints.csv", RAW_DIR / "consumer_complaints.csv")

PosixPath('/content/drive/MyDrive/complaint_intel_project/data/raw/consumer_complaints.csv')

In [None]:
RAW_PATH = RAW_DIR / "consumer_complaints.csv"
df = pd.read_csv(RAW_PATH, low_memory=False)

# print("Rows:", len(df))
# df.head()
# df.columns.tolist()


In [19]:
import os

os.listdir(RAW_DIR)


['consumer_complaints.csv']

3. Load + Clean text

In [15]:
# RAW_PATH = RAW_DIR / "consumer_complaints.csv"
# df = pd.read_csv(RAW_PATH,
#                  low_memory=False,
#                  encoding='latin1')
# df.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 7, saw 2


3.1 Define the columns

In [10]:
# chose the columns
use_cols = [
    "Consumer complaint narrative",
    "Product",
    "Issue",
    "Company",
    "Company response to consumer",
    "Timely response?",
    "Consumer disputed?"
]


3.2 Read the file in chunks + sample

In [12]:
import pandas as pd
from pathlib import Path

RAW_PATH = RAW_DIR / "consumer_complaints.csv"

chunksize = 100_000 # read 100k rows at a time
sample_target = 120_000   # stop once we collect ~120k rows
collected = []

for chunk in pd.read_csv(
    RAW_PATH,
    chunksize=chunksize,
    low_memory=False,
    encoding="latin1",
    usecols=use_cols
):
    # keep only rows with complaint text
    chunk = chunk.dropna(subset=["Consumer complaint narrative"])
    chunk["text"] = chunk["Consumer complaint narrative"].astype(str).str.strip()
    chunk = chunk[chunk["text"].str.len() > 20]

    collected.append(chunk)

    total = sum(len(c) for c in collected)
    print("Collected rows:", total)

    if total >= sample_target:
        break

df = pd.concat(collected, ignore_index=True)
len(df)


Collected rows: 7785
Collected rows: 18254
Collected rows: 32207
Collected rows: 49371
Collected rows: 69909
Collected rows: 92264
Collected rows: 116804
Collected rows: 143264


143264

3.3 Filter categories (issues with enough samples)

In [23]:
issue_counts = df["Issue"].value_counts()
print(issue_counts)
valid_issues = issue_counts[issue_counts >= 2000].index.tolist()
df = df[df["Issue"].isin(valid_issues)]
len(df)


Issue
Incorrect information on your report                                                45988
Improper use of your report                                                         25858
Problem with a company's investigation into an existing problem                     13667
Problem with a credit reporting company's investigation into an existing problem     8932
Attempts to collect debt not owed                                                    6519
Managing an account                                                                  3634
Written notification about debt                                                      3264
Trouble during payment process                                                       2126
Problem with a purchase shown on your statement                                      1990
Other transaction problem                                                            1884
False statements or representation                                                   1727
Took

109988

4. Map issues → super categories

In [24]:
issue_to_group = {}
for issue in valid_issues:
    i = issue.lower()
    if "fee" in i or "billing" in i:
        issue_to_group[issue] = "billing_and_fees"
    elif "credit report" in i:
        issue_to_group[issue] = "credit_report_errors"
    elif "harassment" in i or "debt" in i:
        issue_to_group[issue] = "harassment_and_collection"
    elif "fraud" in i or "identity" in i:
        issue_to_group[issue] = "fraud_and_identity_theft"
    elif "foreclosure" in i:
        issue_to_group[issue] = "loan_modification_and_foreclosure"
    else:
        issue_to_group[issue] = "other"

df["label"] = df["Issue"].map(issue_to_group)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["Issue"].map(issue_to_group)


5. Final dataset (max 100k rows)

In [25]:
final_df = df[["text", "label"]]

if len(final_df) > 100_000:
    final_df = final_df.sample(n=100_000, random_state=42)

len(final_df)


100000

6. Save processed data

In [26]:
final_df.to_csv(PROC_DIR / "complaints_clean.csv", index=False)

with open(PROC_DIR / "issue_to_group_mapping.json", "w") as f:
    json.dump(issue_to_group, f, indent=2)
