In [7]:
from utils import get_processed_metadata
import pandas as pd
from datetime import datetime
import re
from typing import Iterable, List, Set
from datetime import datetime, timezone, timedelta

In [8]:
# Load the processed classification output from ./openai_processed_data_v2 via
# the project helper.
# NOTE(review): the recorded output of this cell shows a JSON parse failure for
# one batch file — confirm how get_processed_metadata treats such files.
processed_data_df = get_processed_metadata("./openai_processed_data_v2")

⚠️  JSON parse failed in batch_683e854d2d208190bc82d155615ddecd_output.jsonl: Expecting value: line 1 column 610 (char 609)
--- Content snippet ---
[{"tweet_id":"@Essentium3D_2021-02-18T17:03:20.000Z","row_num":2937677,"category":"N/A","subcategory":"N/A","sentiment":"Neutral","tweet_language":"English"},{"tweet_id":"@Stratasys_2014-02-16T17:25:0...



In [9]:
# 1️⃣  — show duplicates (based on tweet_id), then keep one row per tweet_id
dupes = processed_data_df[
    processed_data_df.duplicated(subset="tweet_id", keep=False)
].sort_values("tweet_id")

print(f"{len(dupes)} duplicated rows found")
# Only the LAST expression of a cell is rendered automatically; a bare
# `dupes.head()` in the middle of the cell is evaluated and thrown away.
# display() (provided by the IPython kernel) makes the preview actually show.
display(dupes.head())

# Keep the first occurrence of every tweet_id and rebuild a 0..n-1 index so
# that later label-based .loc lookups work on a clean index.
processed_data_df = (
    processed_data_df
        .drop_duplicates(subset="tweet_id", keep="first")
        .reset_index(drop=True)    # tidy re-index
)

20 duplicated rows found


In [10]:
### 1. Validate sub-categories against the allowed list

In [None]:
# ---------------------------------------------------------------------------
# Master list of allowed sub-categories, keyed by top-level category (taken
# from the classification prompt). The values are plain lists because extra
# accepted spellings are appended to them further down in this cell.
ALLOWED_SUBCATEGORIES: dict[str, List[str]] = {
    "Use-case": [
        "Motor Vehicles / Automotive",
        "Aerospace",
        "Industrial / Business Machines",
        "Consumer Products / Electronics",
        "Medical / Dental",
        "Academic Institutions",
        "Government / Military",
        "Architectural",
        "Power / Energy",
        "Home & DIY (Consumer / Hobbyist)",
        "Other",
    ],
    "Business-relevant": [
        "Supply Chain, Manufacturing & Logistics",
        "Cost Models & Pricing",
        "Intellectual Property & Patents",
        "Mergers, Acquisitions & Partnerships",
        "Investment & Financing",
        "Business Models",
        "Customer Adoption & Demand Dynamics",
        "Sustainability & Circular Economy",
        "Market Trends & Forecast",
        "Other",
    ],
    "Technological": [
        "Materials",
        "Printing Processes",
        # Both hardware spellings ("Hardware & Equipment" and
        # "Hardware, Sensors & Equipment") are accepted.
        "Hardware & Equipment",
        "Software, Firmware & Design Tools",
        "Hardware, Sensors & Equipment",
        "Process Monitoring & Control",
        "Post-Processing Techniques",
        "AI & Digital Twin Integration",
        "Other",
    ],
    # The "N/A" category only allows the "N/A" sub-category.
    "N/A": ["N/A"],
}

def slug(text: str) -> str:
    """Build a lowercase, underscore-separated token from a label.

    Examples: 'Use-case' → 'use_case', 'Technological' → 'technological'.
    """
    normalized = text.strip().lower()
    # Every run of non-word characters collapses into a single underscore;
    # underscores left dangling at either end are trimmed afterwards.
    underscored = re.sub(r"[^\w]+", "_", normalized)
    return underscored.strip("_")
    
# ── Add the shorter variant so both spellings are accepted ──────────────
ALLOWED_SUBCATEGORIES["Use-case"].append("Home & DIY")
ALLOWED_SUBCATEGORIES["Use-case"].append("Business Machines")
ALLOWED_SUBCATEGORIES["Technological"].append("Design Tools")
# NOTE(review): the next line is a bare string expression — it has no effect,
# so this spelling is NOT accepted by the validation. If it was meant to be
# allowed it needs its own ALLOWED_SUBCATEGORIES[...].append(...) call;
# confirm the intent before changing it.
"Processes Monitoring & Control"
# ---------------------------------------------------------------------------


def find_invalid_subcategories(
    df: pd.DataFrame,
    main_category: str,
    valid_subcategories: Iterable[str] | None = None,
    *,
    main_col: str = "category",
    sub_col: str = "subcategory",
) -> pd.DataFrame:
    """
    Return all records where `main_col` == main_category but `sub_col`
    is NOT in the allowed list.

    Parameters
    ----------
    df : pandas.DataFrame
        Your processed tweets.
    main_category : str
        The top-level category to test (e.g. "Use-case").
    valid_subcategories : Iterable[str] | None, default None
        Explicit list of allowed subs.  If None, the function will look up
        `ALLOWED_SUBCATEGORIES[main_category]`.
    main_col : str, default "classification"
        Column holding the top-level category.
    sub_col : str,  default "subclassification"
        Column holding the sub-category.

    Returns
    -------
    pandas.DataFrame
        All violating rows (same columns as the original `df`).
    """
    if valid_subcategories is None:
        try:
            valid_subcategories = ALLOWED_SUBCATEGORIES[main_category]
        except KeyError as e:
            raise ValueError(
                f"No built-in sub-category list for {main_category!r}; "
                "please supply `valid_subcategories` explicitly."
            ) from e

    mask_main = df[main_col] == main_category
    mask_bad = ~df[sub_col].isin(valid_subcategories)
    return df.loc[mask_main & mask_bad]


def save_bad_rows(
    df: pd.DataFrame,
    main_category: str,
    *,
    main_col: str = "category",
    sub_col: str = "subcategory",
) -> tuple[str, pd.DataFrame]:
    """
    Find the invalid sub-category rows for `main_category` and dump them to CSV.

    • Runs find_invalid_subcategories() for `main_category`
    • Saves the result as CSV named  bad_<slug>_v2_<YYYY-MM-DD_HHMMSS>.csv
      (relative path, UTF-8, without the index); the file is written even
      when no rows are invalid
    • Returns both the file name and the offending rows

    Returns
    -------
    tuple[str, pandas.DataFrame]
        `(fname, bad_rows)` — the CSV file name that was written and the
        frame of violating rows.
    """
    bad_rows = find_invalid_subcategories(
        df,
        main_category,
        main_col=main_col,
        sub_col=sub_col,
    )

    # Second-resolution local timestamp keeps successive dumps in separate files.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    fname = f"bad_{slug(main_category)}_v2_{timestamp}.csv"
    bad_rows.to_csv(fname, index=False, encoding="utf-8")
    return fname, bad_rows


In [13]:
# Ordered clean-up rules. Every mask is recomputed from the current frame, so
# a rewrite made by an earlier rule is visible to the rules after it.

# Spelling variant without a space after the comma → canonical spelling.
processed_data_df.loc[
    processed_data_df["subcategory"].eq("Hardware,Sensors & Equipment"),
    "subcategory"
] = "Hardware, Sensors & Equipment"

# Category mentions "marketing" (case-insensitive) → both levels become "N/A".
processed_data_df.loc[
    processed_data_df["category"].str.contains(r"marketing", na=False, case=False),
    ["category", "subcategory"]
] = ["N/A", "N/A"]

# Business-relevant + subcategory mentions "Regulatory" (case-insensitive)
# → subcategory "Other".
processed_data_df.loc[
    processed_data_df["category"].eq("Business-relevant")
    & processed_data_df["subcategory"].str.contains(r"Regulatory", na=False, case=False),
    "subcategory"
] = "Other"

# Business-relevant + subcategory mentions "Manufacturing" (case-insensitive)
# → "Supply Chain, Manufacturing & Logistics".
processed_data_df.loc[
    processed_data_df["category"].eq("Business-relevant")
    & processed_data_df["subcategory"].str.contains(r"Manufacturing", na=False, case=False),
    "subcategory"
] = "Supply Chain, Manufacturing & Logistics"

# Business-relevant + subcategory mentions "Partnerships" (case-insensitive)
# → "Mergers, Acquisitions & Partnerships".
processed_data_df.loc[
    processed_data_df["category"].eq("Business-relevant")
    & processed_data_df["subcategory"].str.contains(r"Partnerships", na=False, case=False),
    "subcategory"
] = "Mergers, Acquisitions & Partnerships"

# Business-relevant + subcategory mentions "marketing" or "promotion"
# (case-insensitive) → both levels become "N/A".
processed_data_df.loc[
    processed_data_df["category"].eq("Business-relevant")
    & processed_data_df["subcategory"].str.contains(r"(?:marketing|promotion)", na=False, case=False),
    ["category", "subcategory"]
] = ["N/A", "N/A"]

# Category name repeated as the subcategory ("Use-case"/"Use-case")
# → subcategory "Other".
processed_data_df.loc[
    processed_data_df["category"].eq("Use-case")
    & processed_data_df["subcategory"].eq("Use-case"),
    "subcategory"
] = "Other"


In [14]:

# Dump the "Technological" rows whose subcategory is not in the allowed list
# to CSV and display them.
fname, inaccurate_rows_technological = save_bad_rows(processed_data_df, "Technological")
inaccurate_rows_technological

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language
49,@TXInstruments_2021-10-29T14:15:09.000Z,2779893,Technological,Intellectual Property & Patents,Neutral,English
331,@printing3Dnews_2015-02-21T11:43:32.000Z,4415768,Technological,Sustainability & Circular Economy,Positive,English
333,@wareFLO_2017-12-26T16:50:03.000Z,2242682,Technological,Use-case,Neutral,English
697,@bjeaglefeather_2019-11-26T12:51:07.000Z,2359909,Technological,Safety & Health,Neutral,English
712,@keith_developer_2018-07-18T20:33:51.000Z,3390675,Technological,Safety & Health,Neutral,English
...,...,...,...,...,...,...
10948,@txvoodoo_2012-12-03 22:57:50+00:00,21299,Technological,Technological,Positive,English
11009,@3D_Innovations_2015-03-25T19:12:23.000Z,3751054,Technological,Consumer Products / Electronics,Positive,English
11033,@TheRockstarCEO_2015-06-25T15:57:06.000Z,4455131,Technological,Consumer Products / Electronics,Positive,English
11079,@ProdymSolutions_2015-06-01T08:58:15.000Z,4572880,Technological,Testing & Certification,Neutral,English


In [15]:

# Dump the "Use-case" rows whose subcategory is not in the allowed list to CSV
# and display them.
fname, inaccurate_rows_usecase = save_bad_rows(processed_data_df, "Use-case")
inaccurate_rows_usecase

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language
5,@madewithlayers_2021-12-29T20:02:06.000Z,2956041,Use-case,,Negative,English
81,@PrintGamii_2022-03-07T09:35:18.000Z,6445610,Use-case,Business-relevant,Neutral,English
83,@sebrlibrary_2020-04-14T14:25:48.000Z,579289,Use-case,Community / Hobbyist,Positive,English
94,@GunnerGale_2015-11-06T04:39:58.000Z,4358472,Use-case,Sustainability & Circular Economy,Positive,English
95,@UniOfHull_2021-12-09T12:42:02.000Z,3020708,Use-case,Sustainability & Circular Economy,Positive,English
...,...,...,...,...,...,...
11706,@SnoerenStephan_2023-05-20T11:59:14.000Z,6399546,Use-case,Sustainability & Circular Economy,Neutral,English
11707,@OFF3R_2016-08-24T12:15:02.000Z,1701627,Use-case,Business Models,Positive,English
11721,@pegx__2015-04-05T16:49:55.000Z,4488437,Use-case,Sustainability & Circular Economy,Positive,German
11780,@c4ssdotorg_2014-02-10T21:01:12.000Z,4892270,Use-case,Sustainability & Circular Economy,Neutral,English


In [None]:

# Dump the "Business-relevant" rows whose subcategory is not in the allowed
# list to CSV and display them.
fname, inaccurate_rows_business = save_bad_rows(processed_data_df, "Business-relevant")
inaccurate_rows_business

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language
7,@NewtownAction_2018-07-30T15:31:03.000Z,3348746,Business-relevant,Government / Military,Negative,English
8,@GIFFORDS_org_2018-08-06T01:01:00.000Z,3172038,Business-relevant,Government / Military,Negative,English
9,@HuffPostPol_2018-07-27T09:04:46.000Z,3208280,Business-relevant,Government / Military,Negative,English
10,@tizzywoman_2022-04-08T13:04:15.000Z,6673665,Business-relevant,Government / Military,Negative,English
11,@naomibrockwell_2018-08-27T20:06:56.000Z,3586765,Business-relevant,Government / Military,Neutral,English
...,...,...,...,...,...,...
11603,@TracTerrorism_2023-10-04T22:08:21.000Z,6325817,Business-relevant,Government / Military,Negative,English
11604,@MichaelBenenati_2016-09-22T21:10:13.000Z,2040813,Business-relevant,Government / Military,Neutral,English
11605,@akiatech_2016-09-21T23:11:47.000Z,956746,Business-relevant,Government / Military,Neutral,English
11716,@MitraMalek_2016-02-11T15:32:03.000Z,1590765,Business-relevant,Startup & Early Stage,Neutral,English


In [17]:
# Reset every row flagged above to "N/A" on both levels — this covers the
# invalid rows of all three categories, not only Business-relevant.
# The flagged frames keep processed_data_df's index labels, so selecting by
# their .index targets the same rows.
processed_data_df.loc[inaccurate_rows_business.index, ["category", "subcategory"]] = "N/A"
processed_data_df.loc[inaccurate_rows_usecase.index, ["category", "subcategory"]] = "N/A"
processed_data_df.loc[inaccurate_rows_technological.index, ["category", "subcategory"]] = "N/A"


In [18]:
# Re-run the Business-relevant check; an empty frame means the reset worked.
fname, inaccurate_rows_business = save_bad_rows(processed_data_df, "Business-relevant")
inaccurate_rows_business

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [19]:
# Re-run the Use-case check; an empty frame means the reset worked.
fname, inaccurate_rows_usecase = save_bad_rows(processed_data_df, "Use-case")
inaccurate_rows_usecase

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [20]:

# Re-run the Technological check; an empty frame means the reset worked.
fname, inaccurate_rows_technological = save_bad_rows(processed_data_df, "Technological")
inaccurate_rows_technological

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [21]:
# Spelling variant without a space after the comma → canonical spelling.
processed_data_df.loc[
    processed_data_df["subcategory"].eq("Hardware,Sensors & Equipment"),
    "subcategory"
] = "Hardware, Sensors & Equipment"


## 2. Find Invalid Categories

In [22]:
def find_invalid_categories(
    df: pd.DataFrame,
    *,
    main_col: str = "category",
    allowed: Iterable[str] | None = None,
) -> pd.DataFrame:
    """
    Return every row whose top-level `category` is NOT one of the 3 valid values.

    Parameters
    ----------
    df : pandas.DataFrame
    main_col : str            column holding the top-level category (default: "category")
    allowed : iterable[str]   list or set of valid categories; if None, defaults to
                              {"Use-case", "Business-relevant", "Technological"}

    Returns
    -------
    pandas.DataFrame          rows with an out-of-scope top-level category
    """
    if allowed is None:
        allowed: Set[str] = {"Use-case", "Business-relevant", "Technological", "N/A"}

    return df.loc[~df[main_col].isin(allowed)].copy()


In [23]:
# List rows whose top-level category is outside the accepted set.
bad_cat_rows = find_invalid_categories(processed_data_df)

bad_cat_rows

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language
214,@Nexa3D_2018-12-13T17:15:33.000Z,3546273,Academic Institutions,Academic Institutions,Positive,English
1258,@GeorgiaTechISyE_2019-10-16T19:20:09.000Z,2749147,Academic Institutions,Academic Institutions,Neutral,English
1259,@3dtemplates_2015-08-08T07:32:04.000Z,4637386,Academic Institutions,Academic Institutions,Positive,English
1262,@ph055a_2019-09-27T14:12:32.000Z,2606983,Government / Military,Government / Military,Neutral,English
1264,@wychwoodgeog_2021-06-18T14:26:02.000Z,2922809,Academic Institutions,Academic Institutions,Positive,English
...,...,...,...,...,...,...
11657,@CharlesJanosick_2018-12-16T11:45:02.000Z,3242608,Academic Institutions,Academic Institutions,Neutral,English
11665,@3DAdept_2018-07-24T06:40:00.000Z,3634028,Academic Institutions,Academic Institutions,Positive,English
11671,@dixonchan_2018-07-30T16:31:28.000Z,3349021,Government / Military,Government / Military,Neutral,English
11672,@marcdaalder_2019-10-31T21:53:02.000Z,2426630,Government / Military,Government / Military,Neutral,English


## 2.2 Categories Standardization

In [24]:
# "Academic Institutions" used as the top-level category (with subcategory
# "Academic Institutions" or "Use-case") → category "Use-case".
# Only the category column is rewritten here.
processed_data_df.loc[
    processed_data_df["category"].eq("Academic Institutions")
    & processed_data_df["subcategory"].isin(["Academic Institutions", "Use-case"]),
    "category"
] = "Use-case"


In [25]:
# "Government / Military" used as the top-level category (with subcategory
# "Government / Military" or "Use-case") → category "Use-case".
# Only the category column is rewritten here.
processed_data_df.loc[
    processed_data_df["category"].eq("Government / Military")
    & processed_data_df["subcategory"].isin(["Government / Military", "Use-case"]),
    "category"
] = "Use-case"


In [26]:
# A top-level category of "Other"/"Others" is reset to "N/A".
# The subcategory is reset as well: the allowed list only permits "N/A" under
# the "N/A" category, and no later check validates "N/A" rows, so a leftover
# subcategory would reach the exported file unnoticed.
processed_data_df.loc[
    processed_data_df["category"].isin(["Other", "Others"]),
    ["category", "subcategory"]
] = ["N/A", "N/A"]


In [27]:
# "Academic Institutions" as a top-level category → Use-case / Academic Institutions.
processed_data_df.loc[
    processed_data_df["category"].eq("Academic Institutions"),
    ["category", "subcategory"]
] = ["Use-case", "Academic Institutions"]


In [28]:
# "Software, Firmware & Design Tools" as a top-level category
# → Technological / Software, Firmware & Design Tools.
processed_data_df.loc[
    processed_data_df["category"].eq("Software, Firmware & Design Tools"),
    ["category", "subcategory"]
] = ["Technological", "Software, Firmware & Design Tools"]


In [29]:
# "Government / Military" as a top-level category → Use-case / Government / Military.
processed_data_df.loc[
    processed_data_df["category"].eq("Government / Military"),
    ["category", "subcategory"]
] = ["Use-case", "Government / Military"]


In [30]:
# Category mentions "Intellectual Property" (case-insensitive)
# → Business-relevant / Intellectual Property & Patents.
processed_data_df.loc[
    processed_data_df["category"].str.contains(r"Intellectual Property", na=False, case=False),
    ["category", "subcategory"]
] = ["Business-relevant", "Intellectual Property & Patents"]


In [31]:
# Category mentions "hardware", "sensor" or "equipment" (case-insensitive)
# → Technological / Hardware, Sensors & Equipment.
processed_data_df.loc[
    processed_data_df["category"].str.contains(r"hardware|sensor|equipment", na=False, case=False),
    ["category", "subcategory"]
] = ["Technological", "Hardware, Sensors & Equipment"]


In [32]:
# Category mentions "mergers", "acquisition" or "partnerships" (case-insensitive)
# → Business-relevant / Mergers, Acquisitions & Partnerships.
processed_data_df.loc[
    processed_data_df["category"].str.contains(r"mergers|acquisition|partnerships", na=False, case=False),
    ["category", "subcategory"]
] = ["Business-relevant", "Mergers, Acquisitions & Partnerships"]


In [33]:
# "Architectural" as a top-level category → Use-case / Architectural.
processed_data_df.loc[
    processed_data_df["category"].eq("Architectural"),
    ["category", "subcategory"]
] = ["Use-case", "Architectural"]


In [34]:
# Category exactly "Educational" → Use-case / Academic Institutions.
processed_data_df.loc[
    processed_data_df["category"].eq("Educational"),
    ["category", "subcategory"]
] = ["Use-case", "Academic Institutions"]


In [35]:
# "Consumer Products / Electronics" as a top-level category
# → Use-case / Consumer Products / Electronics.
processed_data_df.loc[
    processed_data_df["category"].eq("Consumer Products / Electronics"),
    ["category", "subcategory"]
] = ["Use-case", "Consumer Products / Electronics"]


In [36]:
# "Medical / Dental" as a top-level category → Use-case / Medical / Dental.
processed_data_df.loc[
    processed_data_df["category"].eq("Medical / Dental"),
    ["category", "subcategory"]
] = ["Use-case", "Medical / Dental"]


In [37]:
# "Home & DIY (Consumer / Hobbyist)" as a top-level category
# → Use-case / Home & DIY (Consumer / Hobbyist).
processed_data_df.loc[
    processed_data_df["category"].eq("Home & DIY (Consumer / Hobbyist)"),
    ["category", "subcategory"]
] = ["Use-case", "Home & DIY (Consumer / Hobbyist)"]


In [38]:
# Category mentions "technology" (case-insensitive) → "Technological".
# The subcategory is left as it is.
processed_data_df.loc[
    processed_data_df["category"].str.contains(r"technology", na=False, case=False),
    "category"
] = "Technological"


In [39]:
# "Investment & Financing" as a top-level category
# → Business-relevant / Investment & Financing.
processed_data_df.loc[
    processed_data_df["category"].eq("Investment & Financing"),
    ["category", "subcategory"]
] = ["Business-relevant", "Investment & Financing"]


In [40]:
# "Market Trends & Forecast" as a top-level category
# → Business-relevant / Market Trends & Forecast.
processed_data_df.loc[
    processed_data_df["category"].eq("Market Trends & Forecast"),
    ["category", "subcategory"]
] = ["Business-relevant", "Market Trends & Forecast"]


In [41]:
# "Materials" as a top-level category → Technological / Materials.
processed_data_df.loc[
    processed_data_df["category"].eq("Materials"),
    ["category", "subcategory"]
] = ["Technological", "Materials"]


In [42]:
# Category mentions "educational" (case-insensitive)
# → Use-case / Academic Institutions.
processed_data_df.loc[
    processed_data_df["category"].str.contains(r"educational", na=False, case=False),
    ["category", "subcategory"]
] = ["Use-case", "Academic Institutions"]


In [43]:
# "Aerospace" as a top-level category → Use-case / Aerospace.
processed_data_df.loc[
    processed_data_df["category"].eq("Aerospace"),
    ["category", "subcategory"]
] = ["Use-case", "Aerospace"]


In [44]:
# "Customer Adoption & Demand Dynamics" as a top-level category
# → Business-relevant / Customer Adoption & Demand Dynamics.
processed_data_df.loc[
    processed_data_df["category"].eq("Customer Adoption & Demand Dynamics"),
    ["category", "subcategory"]
] = ["Business-relevant", "Customer Adoption & Demand Dynamics"]

# "Sustainability & Circular Economy" as a top-level category
# → Business-relevant / Sustainability & Circular Economy.
processed_data_df.loc[
    processed_data_df["category"].eq("Sustainability & Circular Economy"),
    ["category", "subcategory"]
] = ["Business-relevant", "Sustainability & Circular Economy"]


In [45]:
# "Motor Vehicles / Automotive" as a top-level category
# → Use-case / Motor Vehicles / Automotive.
processed_data_df.loc[
    processed_data_df["category"].eq("Motor Vehicles / Automotive"),
    ["category", "subcategory"]
] = ["Use-case", "Motor Vehicles / Automotive"]


In [46]:
# Spelling variant without a space after the comma → canonical spelling.
processed_data_df.loc[
    processed_data_df["subcategory"] == "Hardware,Sensors & Equipment",
    "subcategory"
] = "Hardware, Sensors & Equipment"

# Drop rows whose category holds a sentiment label ("Positive", "Neutral",
# "Negative"). One isin() filter replaces three chained != filters and keeps
# the same rows (missing categories are still kept).
# .copy() makes the result an independent frame, so the label-based .loc
# assignments in later cells do not operate on a filtered slice
# (SettingWithCopyWarning). Index labels are intentionally left untouched.
processed_data_df = processed_data_df[
    ~processed_data_df["category"].isin(["Positive", "Neutral", "Negative"])
].copy()



In [47]:
# Re-run the category check to see what the rewrite rules above did not cover.
bad_cat_rows = find_invalid_categories(processed_data_df)

bad_cat_rows

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language
3845,@edeiller_2016-05-19T15:08:12.000Z,959007,Historical / Other,Other,Positive,English


In [48]:
# Whatever is still invalid is reset to "N/A" on both levels. bad_cat_rows was
# computed from processed_data_df and keeps its index labels, so selecting by
# bad_cat_rows.index targets exactly those rows.
processed_data_df.loc[bad_cat_rows.index, ["category", "subcategory"]] = ["N/A", "N/A"]


In [49]:
# Confirm the reset: this should now come back empty.
bad_cat_rows = find_invalid_categories(processed_data_df)

bad_cat_rows

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [50]:
# Preview the frame after category and subcategory standardization.
processed_data_df

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language
0,@rveenewman_2019-01-09T13:25:07.000Z,2447524,Use-case,Consumer Products / Electronics,Positive,English
1,@ThomasDangAB_2020-04-13T21:00:41.000Z,420939,Use-case,Medical / Dental,Positive,English
2,@nano_2021-10-10T17:00:11.000Z,2888729,Use-case,Architectural,Positive,English
3,@EOS3DPrinting_2017-10-18T14:58:19.000Z,1314705,Business-relevant,"Supply Chain, Manufacturing & Logistics",Positive,English
4,@DrOlubamiji_2021-02-02T20:30:51.000Z,3008460,Use-case,Academic Institutions,Positive,English
...,...,...,...,...,...,...
11828,@ProHacktivo_2021-07-09T17:51:32.000Z,2907693,Business-relevant,Other,Neutral,English
11829,@DRSCornwall_2020-06-21T22:45:12.000Z,411560,Use-case,Motor Vehicles / Automotive,Positive,English
11830,@TurboKitsdotcom_2020-04-13T16:10:05.000Z,579709,Use-case,Motor Vehicles / Automotive,Positive,English
11831,@_Brandor_2016-08-05T01:04:46.000Z,1580187,Use-case,Other,Positive,English


## 3. Merge with Input Data Set and Validate Tweet ID as well as Row Number

In [51]:
import pandas as pd

def find_rownum_mismatches_df(
    processed_df: pd.DataFrame,
    cleaned_df: pd.DataFrame,
    *,
    tweet_id_col: str = "tweet_id",
    processed_row_col: str = "row_num",
    cleaned_row_col: str = "row_num",
) -> pd.DataFrame:
    """
    Report tweets present in both frames whose row numbers disagree.

    Parameters
    ----------
    processed_df : pandas.DataFrame
        Output of `get_processed_metadata()` (or equivalent).
    cleaned_df   : pandas.DataFrame
        The original ‘clean’ dataset already loaded into memory.
    tweet_id_col : str
        Column name holding the tweet ID (must exist in both frames).
    processed_row_col : str
        Column holding row numbers in `processed_df`.
    cleaned_row_col   : str
        Column holding row numbers in `cleaned_df`.

    Returns
    -------
    pandas.DataFrame
        Rows where tweet_id matches but row_num differs, indexed by their
        position in the inner join.  Columns: all `processed_df` columns
        (row number renamed to ``row_num_processed``) followed by
        ``row_num_cleaned``.

    Notes
    -----
    The join uses ``validate="one_to_one"``, so pandas raises if either frame
    contains a duplicated tweet ID.  Neither input frame is modified.
    """
    # rename() returns a new frame, so the caller's data is left untouched.
    processed_side = processed_df.rename(
        columns={processed_row_col: "row_num_processed"}
    )
    # Only the ID and the row number are needed from the cleaned data.
    cleaned_side = cleaned_df[[tweet_id_col, cleaned_row_col]].rename(
        columns={cleaned_row_col: "row_num_cleaned"}
    )

    joined = processed_side.merge(
        cleaned_side,
        on=tweet_id_col,
        how="inner",
        validate="one_to_one",
    )

    differs = joined["row_num_processed"].ne(joined["row_num_cleaned"])
    return joined[differs].copy()


In [52]:

# Load the input data set that the processed rows are validated against.
cleaned_df = pd.read_csv("merged_bad_and_cleaned.csv")

In [53]:
# Show every row of the input data whose tweet_id occurs more than once
# (keep=False marks all occurrences, not just the repeats).
dupe_rows = cleaned_df[cleaned_df.duplicated("tweet_id", keep=False)].copy()
dupe_rows

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,langdetect_is_english


In [54]:
# Keep the first row per tweet_id; find_rownum_mismatches_df merges with
# validate="one_to_one" and would raise on duplicated IDs.
cleaned_df = cleaned_df.drop_duplicates(subset="tweet_id", keep="first").reset_index(drop=True)

In [55]:
# Tweets found in both frames whose row numbers disagree.
rownum_mismatches = find_rownum_mismatches_df(processed_data_df, cleaned_df)
rownum_mismatches

Unnamed: 0,tweet_id,row_num_processed,category,subcategory,sentiment,tweet_language,row_num_cleaned
11013,@uttpal25993_2013-09-22T21:11:50.000Z,11024,Use-case,Consumer Products / Electronics,Positive,English,6169632


In [56]:
# Remove every processed row whose tweet_id showed a row-number mismatch,
# then rebuild a 0..n-1 index.
bad_ids = rownum_mismatches["tweet_id"].unique().tolist()
processed_data_df = (
    processed_data_df
    .loc[~processed_data_df["tweet_id"].isin(bad_ids)]
    .reset_index(drop=True)
)

In [57]:
# Re-run the comparison; an empty frame confirms the mismatches are gone.
rownum_mismatches = find_rownum_mismatches_df(processed_data_df, cleaned_df)
rownum_mismatches

Unnamed: 0,tweet_id,row_num_processed,category,subcategory,sentiment,tweet_language,row_num_cleaned


## Check For Hallucinated Tweet IDs

In [58]:
def clean_tweet_ids(df: pd.DataFrame, tweet_id_col: str) -> pd.Series:
    """Return the tweet-ID column as whitespace-stripped strings with any
    trailing run of commas / double quotes removed."""
    as_text = df[tweet_id_col].astype(str)
    trimmed = as_text.str.strip()
    # The pattern is anchored at the end of the string, so commas or quotes
    # inside an ID are left alone.
    return trimmed.str.replace(r'[",]+$', '', regex=True)  # removes trailing , and "
def check_hallucinated_rows(
    processed_df: pd.DataFrame,
    cleaned_df: pd.DataFrame,
    *,
    tweet_id_col: str = "tweet_id"
) -> pd.DataFrame:
    """
    Find rows of `processed_df` whose tweet ID does not occur in `cleaned_df`.

    IDs on both sides are normalized with `clean_tweet_ids` (stripped, no
    trailing commas/quotes) before they are compared.

    Returns a copy of the unmatched `processed_df` rows; its `tweet_id_col`
    holds the normalized IDs.  Neither input frame is modified.
    """
    normalized = processed_df.copy()
    normalized[tweet_id_col] = clean_tweet_ids(processed_df, tweet_id_col)

    # Only the ID column of cleaned_df is needed for the membership test.
    known_ids = clean_tweet_ids(cleaned_df, tweet_id_col)

    unknown = ~normalized[tweet_id_col].isin(known_ids)
    return normalized[unknown].copy()


In [59]:
# Processed rows whose tweet ID does not exist in the input data; an empty
# frame means no such rows were found.
extra_rows = check_hallucinated_rows(
    processed_data_df,
    cleaned_df,
    tweet_id_col="tweet_id"
)
extra_rows

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [60]:
# Spot check: look up one specific tweet ID in the cleaned INPUT data
# (despite its name, the variable holds rows of cleaned_df).
# regex=False matches the ID literally; as a regex, each "." in the timestamp
# would match any character.
maker_row_processed = cleaned_df[
    cleaned_df["tweet_id"].astype(str).str.contains(
        "@Clement_MENGUE_2014-05-09T07:30:04.000Z", regex=False, na=False
    )
]
maker_row_processed

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,langdetect_is_english


## Check Unprocessed Rows

In [61]:
def check_missing_cleaned_rows(
    processed_df: pd.DataFrame,
    cleaned_df: pd.DataFrame,
    *,
    tweet_id_col: str = "tweet_id"
) -> pd.DataFrame:
    """
    Find rows of `cleaned_df` whose tweet ID does not occur in `processed_df`.

    IDs on both sides are normalized with `clean_tweet_ids` (stripped, no
    trailing commas/quotes) before they are compared.

    Returns a copy of the unmatched `cleaned_df` rows; its `tweet_id_col`
    holds the normalized IDs.  Neither input frame is modified.
    """
    # Only the ID column of processed_df is needed for the membership test.
    processed_ids = clean_tweet_ids(processed_df, tweet_id_col)

    normalized = cleaned_df.copy()
    normalized[tweet_id_col] = clean_tweet_ids(cleaned_df, tweet_id_col)

    unprocessed = ~normalized[tweet_id_col].isin(processed_ids)
    return normalized[unprocessed].copy()


In [62]:
# Input rows that have no counterpart in the processed data.
missing_from_processed = check_missing_cleaned_rows(
    processed_df=processed_data_df,
    cleaned_df=cleaned_df,
    tweet_id_col="tweet_id"
)
missing_from_processed

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,langdetect_is_english
1427,@PrinterBot,2,2013-09-03 18:39:13+00:00,@PrinterBot_2013-09-03T18:39:13.000Z,6208233,3d printing of titanium aluminide turbocharger...,True
4951,@F1John,0,2013-05-28 10:15:19+00:00,@F1John_2013-05-28T10:15:19.000Z,6208529,where can i buy a 3d printed 3d printer ? #3dp...,True
4952,@DanFiorella,0,2013-06-29 02:09:50+00:00,@DanFiorella_2013-06-29T02:09:50.000Z,5670999,where can i buy chocolate cartridges for my 3d...,True
4953,@PORTdesign,0,2015-07-27 21:13:15+00:00,@PORTdesign_2015-07-27T21:13:15.000Z,4266392,when you work for a company that develops soft...,True
4954,@skelstar,0,2018-12-06 18:22:26+00:00,@skelstar_2018-12-06T18:22:26.000Z,3241186,when your bl-touch mount breaks and you only h...,True
4955,@L_Benetton,0,2018-06-12 14:23:32+00:00,@L_Benetton_2018-06-12T14:23:32.000Z,3547397,"when your metal part is done 3d printing , you...",True
4956,@signoreD,0,2014-02-18 00:22:15+00:00,@signoreD_2014-02-18T00:22:15.000Z,5231886,when i meet w/ clients to discuss new (physica...,True
4957,@Mikebit3d,0,2019-02-19 12:29:16+00:00,@Mikebit3d_2019-02-19T12:29:16.000Z,2347413,when was the last time you cleaned/ serviced y...,True
4958,@Mikebit3d,0,2019-02-19 12:26:30+00:00,@Mikebit3d_2019-02-19T12:26:30.000Z,2347315,when was the last time you cleaned/ serviced y...,True
4959,@NexFlowCorp,0,2012-10-10 12:34:03+00:00,@NexFlowCorp_2012-10-10 12:34:03+00:00,58377,when old meets new – can additive manufacturin...,True


In [63]:
# Number of input rows without a processed counterpart.
len(missing_from_processed)

35

In [64]:
# Spot check: search the processed tweet IDs for the fragment "mirrrroring_".
matches = processed_data_df[
    processed_data_df["tweet_id"].astype(str).str.contains("mirrrroring_", na=False)
]

print(matches[["tweet_id"]])


Empty DataFrame
Columns: [tweet_id]
Index: []


In [65]:
# Sanity check: rows whose category is missing or blank after stripping.
null_or_empty_category_df = processed_data_df[
    processed_data_df["category"].isnull()
    | processed_data_df["category"].astype(str).str.strip().eq("")
    # "N/A" rows are not counted as empty here (that condition is left disabled).
]
null_or_empty_category_df

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


## Final Sanity Checks

In [66]:

# Final check: Business-relevant rows with a disallowed subcategory (expected: none).
fname, inaccurate_rows_business = save_bad_rows(processed_data_df, "Business-relevant")
inaccurate_rows_business

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [71]:
# Category name repeated as the subcategory ("Use-case"/"Use-case")
# → subcategory "Other".
processed_data_df.loc[
    processed_data_df["category"].eq("Use-case")
    & processed_data_df["subcategory"].eq("Use-case"),
    "subcategory"
] = "Other"


In [72]:
# Final check: Use-case rows with a disallowed subcategory (expected: none).
fname, inaccurate_rows_usecase = save_bad_rows(processed_data_df, "Use-case")
inaccurate_rows_usecase

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [68]:
# Final check: Technological rows with a disallowed subcategory (expected: none).
fname, inaccurate_rows_technological = save_bad_rows(processed_data_df, "Technological")
inaccurate_rows_technological

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [69]:
# Final check: rows with a top-level category outside the accepted set (expected: none).
bad_cat_rows = find_invalid_categories(processed_data_df)
bad_cat_rows

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [None]:
from datetime import datetime

# Build a file name carrying the current local time (second resolution) so
# that separate exports end up in separate files.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"CLEANED_POST-LLM_DATASET_v2__{timestamp}.csv"

# Write the validated frame to CSV in the working directory, without the index.
processed_data_df.to_csv(filename, index=False)

print(f"DataFrame successfully written to {filename}")
