fix(corprep): add 'cluster' column to DataFrame in similar_docs functions
entelecheia · Jul 27, 2023 · 1903b6f
1 parent fd21b9e
commit 1903b6f
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 2 deletions.
diff --git a/src/corprep/conf/run/find_similar_docs.yaml b/src/corprep/conf/run/find_similar_docs.yaml
@@ -10,6 +10,7 @@ date_col: createdDt
 token_col: nouns
 id_col: newsId
 ordering_col: createdDt_int
+cluster_col: cluster
 duplicate_col: duplicate
 fig_col: fig_filename
 output_dir: .

diff --git a/src/corprep/datasets/similarity.py b/src/corprep/datasets/similarity.py
@@ -138,6 +138,7 @@ def process_batch(
     token_col: str = "nouns",
     id_col: str = "newsId",
     ordering_col: str = "createdDt_int",
+    cluster_col: str = "cluster",
     duplicate_col: str = "duplicate",
     fig_col: str = "fig_filename",
     output_dir: str = ".",
@@ -191,17 +192,18 @@ def process_batch(
 
     # Create DataFrame df with cluster labels and Unix timestamp createdDt
     df = pd.DataFrame(
-        {"cluster": ac.labels_, ordering_col: batch_data[ordering_col]},
+        {cluster_col: ac.labels_, ordering_col: batch_data[ordering_col]},
         index=batch_data[id_col],
     )
+    batch_data[cluster_col] = batch_data[id_col].map(df[cluster_col])
 
     # Convert createdDt_int to string and concatenate with newsId
     df[ordering_col] = df[ordering_col].astype(str)
     concat_col = f"{ordering_col}_{id_col}"
     df[concat_col] = df[ordering_col] + "|" + df.index
 
     # Find the minimum createdDt_newsId for each cluster
-    min_createdDt_newsId = df.groupby("cluster")[concat_col].min()
+    min_createdDt_newsId = df.groupby(cluster_col)[concat_col].min()
 
     # Extract newsId from the minimum createdDt_newsId
     earliest_doc_indices = min_createdDt_newsId.str.split("|").str[-1]
@@ -223,6 +225,7 @@ def find_similar_docs(
     token_col: str = "nouns",
     id_col: str = "newsId",
     ordering_col: str = "createdDt_int",
+    cluster_col: str = "cluster",
     duplicate_col: str = "duplicate",
     fig_col: str = "fig_filename",
     output_dir: str = ".",
@@ -238,6 +241,7 @@ def find_similar_docs(
     # Convert createdDt to Unix timestamp and store in createdDt_int
     data[ordering_col] = data[date_col].astype(np.int64) // 10**9
     data.set_index(date_col, inplace=True)
+    data[cluster_col] = None
     data[duplicate_col] = False
     data[fig_col] = None
 
@@ -259,6 +263,7 @@ def find_similar_docs(
         token_col=token_col,
         id_col=id_col,
         ordering_col=ordering_col,
+        cluster_col=cluster_col,
         duplicate_col=duplicate_col,
         fig_col=fig_col,
         output_dir=output_dir,