## Analysis

In [11]:
import pandas as pd
from datetime import datetime

# 1. Load the three “post‐LLM” CSV exports (v1, v2, v3, in that order) and stack
#    them into one DataFrame; ignore_index=True renumbers the combined rows 0..n-1.
df_v1, df_v2, df_v3 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "CLEANED_POST-LLM_DATASET_v1__20250603_111851.csv",
        "CLEANED_POST-LLM_DATASET_v2__20250603_093547.csv",
        "CLEANED_POST-LLM_DATASET_v3__20250603_144231.csv",
    )
)
merged_post_llm = pd.concat([df_v1, df_v2, df_v3], ignore_index=True)

# 2. Build a timestamped filename for the merged export.
#    The write itself is currently disabled (commented out).
now_ts = datetime.now().strftime("%Y%m%d_%H%M%S")
merged_filename = f"CLEANED_POST-LLM_DATASET_v1_v2_v3_{now_ts}.csv"
#merged_post_llm.to_csv(merged_filename, index=False)

# 3. Load the “master” cleaned dataset
cleaned_master = pd.read_csv("CLEANED_DATASET_20250531_004116.csv")

# 4. Master rows whose tweet_id never occurs in merged_post_llm.
#    Only tweet_id is compared here; row_num is not part of this check.
missing_by_id = cleaned_master.loc[
    lambda master: ~master["tweet_id"].isin(merged_post_llm["tweet_id"])
]

# 5. Restrict those “missing” rows to the columns of interest
cols_to_keep = [
    "Author ID", "Total Engagement", "Date",
    "tweet_id", "row_num",
    "Normalized Text", "langdetect_is_english",
]
missing_selected = missing_by_id.loc[:, cols_to_keep]

# 6. Export to “merged_bad_and_cleaned_v2.csv” (currently disabled)
#missing_selected.to_csv("merged_bad_and_cleaned_v2.csv", index=False)

# Show the selected rows as the cell output for a visual check
missing_selected


Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,langdetect_is_english
28467,@mirrrroring,75,2020-01-07 17:50:51+00:00,@mirrrroring_2020-01-07T17:50:51.000Z,524007,i’ve decided to start giving away works that m...,True
839125,@mirrrroring,2,2013-09-11 20:07:14+00:00,@mirrrroring_2013-09-11T20:07:14.000Z,5959718,wanted: 3d printing nerd who can make me finge...,True
900081,@ZamaZingTimmy,1,2015-07-23 17:23:10+00:00,@ZamaZingTimmy_2015-07-23T17:23:10.000Z,4619863,"great job lani! you crushed it, just as you ar...",True
1182046,@maralkalajian,0,2014-09-07 10:36:17+00:00,@maralkalajian_2014-09-07T10:36:17.000Z,4777125,mt @eric_gervet : the magic of 3d printing :) ...,True
1560358,@lcitdynamics,0,2015-08-27 06:36:32+00:00,@lcitdynamics_2015-08-27T06:36:32.000Z,4570458,shut up and take my money already! its amazing...,True
1599906,@Info3Dprinter,0,2013-06-26 19:13:41+00:00,@Info3Dprinter_2013-06-26T19:13:41.000Z,6060161,"re: "" 3d printing is a gimmick"" - uog, 3d prin...",True
1795123,@reisanar,0,2014-09-20 17:51:32+00:00,@reisanar_2014-09-20T17:51:32.000Z,5251081,3d printing is freaking awesome #makerfaire201...,True
1978061,@Best3dprint,0,2016-01-17 20:21:37+00:00,@Best3dprint_2016-01-17T20:21:37.000Z,1545085,@abstractunderground | #3dprint | the #bridge ...,True
1978062,@Best3dprint,0,2016-01-14 05:05:26+00:00,@Best3dprint_2016-01-14T05:05:26.000Z,2099809,@abstractunderground | #3dprint | on that stud...,True
1978397,@Best3dprint,0,2013-12-24 15:42:28+00:00,@Best3dprint_2013-12-24T15:42:28.000Z,5913471,@akbcustoms | new button for masada buttstock ...,True


In [12]:
# Number of master rows whose tweet_id has no match in the post-LLM data
missing_selected.shape[0]

34

In [15]:
# Row count of the concatenated post-LLM frame (before any de-duplication)
merged_post_llm.shape[0]

2836611

In [17]:
# Row count of the master cleaned dataset, for comparison with merged_post_llm
cleaned_master.shape[0]

2835759

## Duplicate Check Before Merging

In [18]:
# 1. Rows of merged_post_llm whose (tweet_id, row_num) pair occurs more than once.
#    keep=False flags every member of a duplicate group, not just the repeats.
dupes_merged = merged_post_llm.loc[
    lambda frame: frame.duplicated(subset=["tweet_id", "row_num"], keep=False)
]
print(
    f"Duplicates in merged_post_llm ({len(dupes_merged)} rows):",
    dupes_merged,
    sep="\n",
)

# 2. The same check on cleaned_master, using the same key columns
dupes_cleaned = cleaned_master.loc[
    lambda frame: frame.duplicated(subset=["tweet_id", "row_num"], keep=False)
]
print(
    f"\nDuplicates in cleaned_master ({len(dupes_cleaned)} rows):",
    dupes_cleaned,
    sep="\n",
)


Duplicates in merged_post_llm (1224 rows):
                                          tweet_id    row_num  category  \
600               @tollsy_2014-03-29T16:51:04.000Z  4667710.0  Use-case   
8385          @ACSPhysics_2023-11-22T14:13:11.000Z  6405482.0  Use-case   
8404     @mllevanessadias_2016-10-06T16:30:00.000Z  1422754.0  Use-case   
8418              @KB1REQ_2013-11-07T02:09:29.000Z  5740290.0  Use-case   
46192        @Machenideas_2019-12-04T14:47:44.000Z  2625885.0  Use-case   
...                                            ...        ...       ...   
2824653  @Cosmeti08841509_2019-02-27T13:21:54.000Z  2364816.0  Use-case   
2824654        @ElliotOJH_2023-07-26T15:37:14.000Z  6359771.0  Use-case   
2824656    @derbycollege_2012-08-06 13:18:30+00:00   114668.0  Use-case   
2824756       @mirrroring_2020-01-07T17:50:51.000Z   524007.0  Use-case   
2825251       @mirrroring_2013-09-11T20:07:14.000Z  5959718.0       NaN   

                   subcategory sentiment tweet_language 

In [19]:
# Show the duplicated post-LLM rows as the cell's rich output; this is easier to
# scan than the column-wrapped text that print() produces for the same frame.
dupes_merged

Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language
600,@tollsy_2014-03-29T16:51:04.000Z,4667710.0,Use-case,Use-case,Neutral,English
8385,@ACSPhysics_2023-11-22T14:13:11.000Z,6405482.0,Use-case,Use-case,Positive,English
8404,@mllevanessadias_2016-10-06T16:30:00.000Z,1422754.0,Use-case,Use-case,Positive,English
8418,@KB1REQ_2013-11-07T02:09:29.000Z,5740290.0,Use-case,Use-case,Neutral,English
46192,@Machenideas_2019-12-04T14:47:44.000Z,2625885.0,Use-case,Use-case,Neutral,English
...,...,...,...,...,...,...
2824653,@Cosmeti08841509_2019-02-27T13:21:54.000Z,2364816.0,Use-case,Medical / Dental,Neutral,English
2824654,@ElliotOJH_2023-07-26T15:37:14.000Z,6359771.0,Use-case,Medical / Dental,Positive,English
2824656,@derbycollege_2012-08-06 13:18:30+00:00,114668.0,Use-case,Academic Institutions,Positive,English
2824756,@mirrroring_2020-01-07T17:50:51.000Z,524007.0,Use-case,Architectural,Neutral,English


In [21]:
# Narrow dupes_merged (computed in an earlier cell) to the duplicate groups whose
# members disagree on “category”. nunique() ignores NaN by default, so a group
# only qualifies when it holds more than one distinct non-null category.
dupes_diff_category = dupes_merged.groupby(["tweet_id", "row_num"]).filter(
    lambda members: members["category"].nunique() > 1
)
# Sort so that the rows of each duplicate group sit next to each other
dupes_diff_category = dupes_diff_category.sort_values(["tweet_id", "row_num"])

print(f"{len(dupes_diff_category)} rows where duplicates have differing categories:")
dupes_diff_category


82 rows where duplicates have differing categories:


Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language
1984971,@3DPelectronics_2016-01-08T12:10:33.000Z,1259002.0,Use-case,Use-case,Neutral,English
2821344,@3DPelectronics_2016-01-08T12:10:33.000Z,1259002.0,Technological,Materials,Positive,English
1609296,@3DPrintGirl_2018-05-09T21:08:36.000Z,3574464.0,Use-case,Use-case,Positive,English
2813859,@3DPrintGirl_2018-05-09T21:08:36.000Z,3574464.0,Technological,"Software, Firmware & Design Tools",Neutral,English
2691208,@3Dable_2019-02-26T10:43:11.000Z,2617485.0,Use-case,Use-case,Neutral,English
...,...,...,...,...,...,...
2813557,@tradelawnews_2020-01-24T02:55:08.000Z,429531.0,Business-relevant,Intellectual Property & Patents,Neutral,English
356420,@trendonwords_2018-08-28T23:20:17.000Z,3303927.0,Use-case,Use-case,Negative,English
2816331,@trendonwords_2018-08-28T23:20:17.000Z,3303927.0,Business-relevant,Other,Negative,English
356384,@zonamya1_2018-07-12T05:19:10.000Z,3238439.0,Use-case,Use-case,Neutral,English


In [24]:
# 1. Refresh dupes_merged from the current state of merged_post_llm
dupes_merged = merged_post_llm.loc[
    lambda frame: frame.duplicated(subset=["tweet_id", "row_num"], keep=False)
]

# 2. Distinct (tweet_id, row_num) keys whose duplicate rows disagree on category.
#    NOTE(review): nunique() skips NaN and groupby skips NaN keys by default, while
#    duplicated() treats NaN keys as equal, so a labelled-vs-NaN pair or a group with
#    a missing row_num is not flagged here. Confirm that this is intended.
keys_diff = dupes_merged.groupby(["tweet_id", "row_num"]).filter(
    lambda members: members["category"].nunique() > 1
)
keys_diff = keys_diff[["tweet_id", "row_num"]].drop_duplicates()

# 3. Turn the conflicting keys into a MultiIndex for membership testing
idx_diff = pd.MultiIndex.from_frame(keys_diff)

# 4. Boolean mask, aligned positionally with merged_post_llm, marking every row
#    whose (tweet_id, row_num) is one of the conflicting keys
mask = pd.MultiIndex.from_frame(
    merged_post_llm[["tweet_id", "row_num"]]
).isin(idx_diff)

# 5. Overwrite category and subcategory with "N/A" on every masked row.
#    This mutates merged_post_llm in place.
#    NOTE(review): pandas.read_csv parses the string "N/A" as NaN by default, so the
#    marker will not survive a CSV round trip unless keep_default_na / na_values
#    are adjusted when reading the file back.
merged_post_llm.loc[mask, ["category", "subcategory"]] = "N/A"

# 6. (Verify) Re-run the conflict detection; no group should remain
dupes_merged = merged_post_llm.loc[
    lambda frame: frame.duplicated(subset=["tweet_id", "row_num"], keep=False)
]
dupes_diff_category = dupes_merged.groupby(["tweet_id", "row_num"]).filter(
    lambda members: members["category"].nunique() > 1
)
print(f"{len(dupes_diff_category)} rows still have differing categories:")
dupes_diff_category


0 rows still have differing categories:


Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [25]:
# Repeat the differing-category check on the current dupes_merged: keep only the
# duplicate groups that hold more than one distinct non-null “category” value
# (nunique() ignores NaN by default).
dupes_diff_category = dupes_merged.groupby(["tweet_id", "row_num"]).filter(
    lambda members: members["category"].nunique() > 1
)
# Sort so that the rows of each duplicate group sit next to each other
dupes_diff_category = dupes_diff_category.sort_values(["tweet_id", "row_num"])

print(f"{len(dupes_diff_category)} rows where duplicates have differing categories:")
dupes_diff_category


0 rows where duplicates have differing categories:


Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language


In [26]:
# 1. Re-list the rows of merged_post_llm that share a (tweet_id, row_num) pair.
#    keep=False flags every member of a duplicate group, not just the repeats.
dupes_merged = merged_post_llm.loc[
    lambda frame: frame.duplicated(subset=["tweet_id", "row_num"], keep=False)
]
print(
    f"Duplicates in merged_post_llm ({len(dupes_merged)} rows):",
    dupes_merged,
    sep="\n",
)

# 2. Same report for cleaned_master, using the same key columns
dupes_cleaned = cleaned_master.loc[
    lambda frame: frame.duplicated(subset=["tweet_id", "row_num"], keep=False)
]
print(
    f"\nDuplicates in cleaned_master ({len(dupes_cleaned)} rows):",
    dupes_cleaned,
    sep="\n",
)


Duplicates in merged_post_llm (1224 rows):
                                          tweet_id    row_num  category  \
600               @tollsy_2014-03-29T16:51:04.000Z  4667710.0  Use-case   
8385          @ACSPhysics_2023-11-22T14:13:11.000Z  6405482.0  Use-case   
8404     @mllevanessadias_2016-10-06T16:30:00.000Z  1422754.0  Use-case   
8418              @KB1REQ_2013-11-07T02:09:29.000Z  5740290.0  Use-case   
46192        @Machenideas_2019-12-04T14:47:44.000Z  2625885.0  Use-case   
...                                            ...        ...       ...   
2824653  @Cosmeti08841509_2019-02-27T13:21:54.000Z  2364816.0  Use-case   
2824654        @ElliotOJH_2023-07-26T15:37:14.000Z  6359771.0  Use-case   
2824656    @derbycollege_2012-08-06 13:18:30+00:00   114668.0  Use-case   
2824756       @mirrroring_2020-01-07T17:50:51.000Z   524007.0  Use-case   
2825251       @mirrroring_2013-09-11T20:07:14.000Z  5959718.0       NaN   

                   subcategory sentiment tweet_language 

### Drop Duplicates

In [27]:
# De-duplicate merged_post_llm on (tweet_id, row_num): the first occurrence of each
# key is kept, later repeats are removed, and the index is renumbered from 0.
# NOTE(review): "first" follows the row order of the concatenated frame, so which
# copy survives (and whose subcategory/sentiment is kept) depends on that order.
merged_post_llm = merged_post_llm.loc[
    lambda frame: ~frame.duplicated(subset=["tweet_id", "row_num"], keep="first")
].reset_index(drop=True)


## Merge Inference Output Dataset with Cleaned Master Dataset

In [28]:
# Timestamp for the optional export filename below
now_ts = datetime.now().strftime("%Y%m%d_%H%M%S")

# Inner-join the post-LLM labels with the master dataset on (tweet_id, row_num),
# pulling Date, Normalized Text and Total Engagement in from cleaned_master.
# With how="inner", rows lacking a matching key on either side are dropped.
merged_final = pd.merge(
    merged_post_llm,
    cleaned_master[["tweet_id", "row_num", "Date", "Normalized Text", "Total Engagement"]],
    on=["tweet_id", "row_num"],
    how="inner",
)


# (Optional) Export of the merged result (currently disabled)
#merged_final.to_csv(f"CLEANED_POST-LLM_MERGED_WITH_MASTER_{now_ts}.csv", index=False)
merged_final


Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language,Date,Normalized Text,Total Engagement
0,@3HTi__2017-04-13T21:13:00.000Z,1938218.0,Business-relevant,Intellectual Property & Patents,Neutral,English,2017-04-13 21:13:00+00:00,disney files patent for a 3d printed soft robo...,0
1,@benzuria_2017-06-12T16:33:14.000Z,1614375.0,Business-relevant,Intellectual Property & Patents,Neutral,English,2017-06-12 16:33:14+00:00,disney files patent for 'anti-scanning' materi...,0
2,@stephcurcio_2016-02-22T16:27:33.000Z,904891.0,Business-relevant,Intellectual Property & Patents,Positive,English,2016-02-22 16:27:33+00:00,disney files patent application for high resol...,0
3,@patesalo_e_2017-06-10T10:26:13.000Z,697479.0,Business-relevant,Intellectual Property & Patents,Neutral,English,2017-06-10 10:26:13+00:00,"disney files patent application for ""anti-scan...",0
4,@eTeknix_2015-04-19T18:04:07.000Z,4456221.0,Use-case,Consumer Products / Electronics,Positive,English,2015-04-19 18:04:07+00:00,disney fabric 3d printer – creations you’ll wa...,0
...,...,...,...,...,...,...,...,...,...
2835719,@technws_2016-08-22T19:24:23.000Z,1110601.0,Use-case,Consumer Products / Electronics,Neutral,English,2016-08-22 19:24:23+00:00,crunchgear: roll your own business cards with ...,0
2835720,@supermineadam_2014-10-04T06:40:11.000Z,5527728.0,Technological,"Hardware, Sensors & Equipment",Positive,English,2014-10-04 06:40:11+00:00,cubepro 3d printer : 3d printing . real. pro. ...,0
2835721,@MadeGrid_2013-07-16T11:24:16.000Z,5978567.0,Use-case,Consumer Products / Electronics,Neutral,English,2013-07-16 11:24:16+00:00,create your own products with ebay exact. (inc...,0
2835722,@lilli_eye_2017-02-07T12:33:08.000Z,1698758.0,Business-relevant,Other,Neutral,English,2017-02-07 12:33:08+00:00,daniela bertol at mad: shapeways designer in r...,0


In [29]:
# Difference in row counts between the master dataset and the inner-merge result
cleaned_master.shape[0] - merged_final.shape[0]

35

In [30]:
# Sanity check: collect any rows of merged_final that still share a
# (tweet_id, row_num) key, ordered so that members of a group are adjacent.
dupes = merged_final.loc[
    lambda frame: frame.duplicated(subset=["tweet_id", "row_num"], keep=False)
].sort_values(["tweet_id", "row_num"])

# Report the count and preview the first rows
print(f"{len(dupes)} duplicate rows found in merged_final")
dupes.head()


0 duplicate rows found in merged_final


Unnamed: 0,tweet_id,row_num,category,subcategory,sentiment,tweet_language,Date,Normalized Text,Total Engagement
