From f5fc5283a244d958a5b99dfe7427b74ed033e7d1 Mon Sep 17 00:00:00 2001 From: e11sy <130844513+e11sy@users.noreply.github.com> Date: Mon, 4 Aug 2025 20:29:18 +0300 Subject: [PATCH 1/4] imp(): update node version (#429) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e9b98e8f2..73f4faba5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM node:16.20-slim as build-stage +FROM node:18.20-slim as build-stage RUN apt update RUN apt install git -y From 694712913c386ce99eeaf927665b80d71f3713c2 Mon Sep 17 00:00:00 2001 From: e11sy <130844513+e11sy@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:00:23 +0300 Subject: [PATCH 2/4] chore: imp convertor script (#431) --- convertors/move-timestamp-out-of-payload.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convertors/move-timestamp-out-of-payload.js b/convertors/move-timestamp-out-of-payload.js index 3bea5bf60..97f6b369d 100644 --- a/convertors/move-timestamp-out-of-payload.js +++ b/convertors/move-timestamp-out-of-payload.js @@ -1,4 +1,5 @@ require('dotenv').config(); +require('process'); const { MongoClient } = require('mongodb'); /** @@ -126,7 +127,7 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec * Method that runs convertor script */ async function run() { - const fullUri = 'mongodb://hawk_new:evieg9bauK0ahs2youhoh7aer7kohT@rc1d-2jltinutse1eadfs.mdb.yandexcloud.net:27018/hawk_events?authSource=admin&replicaSet=rs01&tls=true&tlsInsecure=true'; + const fullUri = process.env.MONGO_EVENTS_DATABASE_URI; // Parse the Mongo URL manually const mongoUrl = new URL(fullUri); From c98b769ea7f0b6414c67552b741d48b70220ff73 Mon Sep 17 00:00:00 2001 From: e11sy <130844513+e11sy@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:27:49 +0300 Subject: [PATCH 3/4] chore(): disable grouping by levenstein (#433) * chore(): disable grouping by levenstein * chore(): rm redundant test * chore: lint fix --- convertors/move-timestamp-out-of-payload.js | 59 ++++++++++------- workers/grouper/src/index.ts | 70 ++++++++++----------- workers/grouper/tests/index.test.ts | 18 +++--- 3 files changed, 82 insertions(+), 65 deletions(-) diff --git a/convertors/move-timestamp-out-of-payload.js b/convertors/move-timestamp-out-of-payload.js index 97f6b369d..225b4ad85 100644 --- a/convertors/move-timestamp-out-of-payload.js +++ b/convertors/move-timestamp-out-of-payload.js @@ -16,7 +16,12 @@ async function movePayloadTimestampToEventLevel(db, collectionName) { const docsToUpdate = collection.find( { timestamp: { $exists: false } }, - { projection: { _id: 1, 'payload.timestamp': 1 } } + { + projection: { + _id: 1, + 'payload.timestamp': 1, + }, + } ).limit(documentsSelectionLimit); const batchedOps = []; @@ -34,11 +39,11 @@ async function movePayloadTimestampToEventLevel(db, collectionName) { updateOne: { filter: { _id: doc._id }, update: { - $set: { timestamp: Number(doc.payload.timestamp)}, - $unset: {'payload.timestamp': ''}, - } - } - }) + $set: { timestamp: Number(doc.payload.timestamp) }, + $unset: { 'payload.timestamp': '' }, + }, + }, + }); currentCount++; } @@ -47,7 +52,7 @@ async function movePayloadTimestampToEventLevel(db, collectionName) { await collection.bulkWrite(batchedOps); } - return currentCount + return currentCount; } /** * @param db - mongo db instance @@ -58,15 +63,21 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec const repetitions = db.collection(repetitionCollectionName); const events = db.collection(`events:${projectId}`); - let bulkOps = []; + const bulkOps = []; let repetitionCount = 1; const repetitionsList = await repetitions.find( { timestamp: { $exists: false }, }, - { projection: { _id: 1, groupHash: 1 } } - ).limit(documentsSelectionLimit).toArray(); + { + projection: { + _id: 1, + groupHash: 1, + }, + } + ).limit(documentsSelectionLimit) + .toArray(); const groupHashList = []; @@ -78,14 +89,19 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec const relatedEvents = await events.find( { groupHash: { $in: groupHashList } }, - { projection: { timestamp: 1, groupHash: 1 } } + { + projection: { + timestamp: 1, + groupHash: 1, + }, + } ).toArray(); - const relatedEventsMap = new Map() + const relatedEventsMap = new Map(); relatedEvents.forEach(e => { relatedEventsMap.set(e.groupHash, e); - }) + }); for (const repetition of repetitionsList) { const relatedEvent = relatedEventsMap.get(repetition.groupHash); @@ -93,9 +109,9 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec if (!relatedEvent) { bulkOps.push({ deleteOne: { - filter: { _id: repetition._id } - } - }) + filter: { _id: repetition._id }, + }, + }); } else if (relatedEvent?.timestamp !== null) { bulkOps.push({ updateOne: { @@ -112,11 +128,12 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec const result = await repetitions.bulkWrite(bulkOps); const updated = result.modifiedCount; const deleted = result.deletedCount; + processed = bulkOps.length; console.log(` updates (${processed} processed, ${updated} updated, ${deleted} deleted)`); if (updated + deleted === 0) { - repetitionCollectionsToCheck.filter(collection => collection !== repetition) + repetitionCollectionsToCheck.filter(collection => collection !== repetition); } } @@ -175,13 +192,13 @@ async function run() { // Convert events let i = 1; - let documentsUpdatedCount = 1 + let documentsUpdatedCount = 1; while (documentsUpdatedCount != 0) { documentsUpdatedCount = 0; i = 1; const collectionsToUpdateCount = eventCollectionsToCheck.length; - + for (const collectionName of eventCollectionsToCheck) { console.log(`[${i}/${collectionsToUpdateCount}] Processing ${collectionName}`); const updated = await movePayloadTimestampToEventLevel(db, collectionName); @@ -190,10 +207,10 @@ async function run() { eventCollectionsToCheck = eventCollectionsToCheck.filter(collection => collection !== collectionName); } - documentsUpdatedCount += updated + documentsUpdatedCount += updated; i++; } - } + } // Convert repetitions + backfill from events documentsUpdatedCount = 1; diff --git a/workers/grouper/src/index.ts b/workers/grouper/src/index.ts index d6a01caf0..4a2809aad 100644 --- a/workers/grouper/src/index.ts +++ b/workers/grouper/src/index.ts @@ -23,7 +23,7 @@ import HawkCatcher from '@hawk.so/nodejs'; import { MS_IN_SEC } from '../../../lib/utils/consts'; import DataFilter from './data-filter'; import RedisHelper from './redisHelper'; -import levenshtein from 'js-levenshtein'; +// import levenshtein from 'js-levenshtein'; import { computeDelta } from './utils/repetitionDiff'; import TimeMs from '../../../lib/utils/time'; import { rightTrim } from '../../../lib/utils/string'; @@ -109,7 +109,7 @@ export default class GrouperWorker extends Worker { let existedEvent = await this.getEvent(task.projectId, uniqueEventHash); /** - * If we couldn't group by group hash (title), try grouping by Levenshtein distance or patterns + * If we couldn't group by group hash (title), try grouping by patterns */ if (!existedEvent) { const similarEvent = await this.findSimilarEvent(task.projectId, task.payload); @@ -287,35 +287,35 @@ export default class GrouperWorker extends Worker { * @param event - event to compare */ private async findSimilarEvent(projectId: string, event: EventData): Promise { - const eventsCountToCompare = 60; - const diffTreshold = 0.35; + // const eventsCountToCompare = 60; + // const diffTreshold = 0.35; - const lastUniqueEvents = await this.findLastEvents(projectId, eventsCountToCompare); + // const lastUniqueEvents = await this.findLastEvents(projectId, eventsCountToCompare); /** * Trim titles to reduce CPU usage for Levenshtein comparison */ - const trimmedEventTitle = hasValue(event.title) ? rightTrim(event.title, MAX_CODE_LINE_LENGTH) : ''; + // const trimmedEventTitle = hasValue(event.title) ? rightTrim(event.title, MAX_CODE_LINE_LENGTH) : ''; /** * First try to find by Levenshtein distance */ - const similarByLevenshtein = lastUniqueEvents.filter(prevEvent => { - const trimmedPrevTitle = hasValue(prevEvent.payload.title) ? rightTrim(prevEvent.payload.title, MAX_CODE_LINE_LENGTH) : ''; + // const similarByLevenshtein = lastUniqueEvents.filter(prevEvent => { + // const trimmedPrevTitle = hasValue(prevEvent.payload.title) ? rightTrim(prevEvent.payload.title, MAX_CODE_LINE_LENGTH) : ''; - if (trimmedEventTitle === '' || trimmedPrevTitle === '') { - return false; - } + // if (trimmedEventTitle === '' || trimmedPrevTitle === '') { + // return false; + // } - const distance = levenshtein(trimmedEventTitle, trimmedPrevTitle); - const threshold = trimmedEventTitle.length * diffTreshold; + // const distance = levenshtein(trimmedEventTitle, trimmedPrevTitle); + // const threshold = trimmedEventTitle.length * diffTreshold; - return distance < threshold; - }).pop(); + // return distance < threshold; + // }).pop(); - if (similarByLevenshtein) { - return similarByLevenshtein; - } + // if (similarByLevenshtein) { + // return similarByLevenshtein; + // } /** * If no match by Levenshtein, try matching by patterns @@ -402,23 +402,23 @@ export default class GrouperWorker extends Worker { * @param count - how many events to return * @returns {GroupedEventDBScheme[]} list of the last N unique events */ - private findLastEvents(projectId: string, count: number): Promise { - return this.cache.get(`last:${count}:eventsOf:${projectId}`, async () => { - return this.eventsDb.getConnection() - .collection(`events:${projectId}`) - .find() - .sort({ - _id: 1, - }) - .limit(count) - .toArray(); - }, - /** - * TimeMs class stores time intervals in milliseconds, however NodeCache ttl needs to be specified in seconds - */ - /* eslint-disable-next-line @typescript-eslint/no-magic-numbers */ - TimeMs.MINUTE / 1000); - } + // private findLastEvents(projectId: string, count: number): Promise { + // return this.cache.get(`last:${count}:eventsOf:${projectId}`, async () => { + // return this.eventsDb.getConnection() + // .collection(`events:${projectId}`) + // .find() + // .sort({ + // _id: 1, + // }) + // .limit(count) + // .toArray(); + // }, + // /** + // * TimeMs class stores time intervals in milliseconds, however NodeCache ttl needs to be specified in seconds + // */ + // /* eslint-disable-next-line @typescript-eslint/no-magic-numbers */ + // TimeMs.MINUTE / 1000); + // } /** * Decides whether to increase the number of affected users for the repetition and the daily aggregation diff --git a/workers/grouper/tests/index.test.ts b/workers/grouper/tests/index.test.ts index bf3ea84e5..d3f5649e1 100644 --- a/workers/grouper/tests/index.test.ts +++ b/workers/grouper/tests/index.test.ts @@ -457,17 +457,17 @@ describe('GrouperWorker', () => { }); describe('Grouping', () => { - test('should group events with partially different titles', async () => { - await worker.handle(generateTask({ title: 'Some error (but not filly identical) example' })); - await worker.handle(generateTask({ title: 'Some error (yes, it is not the identical) example' })); - await worker.handle(generateTask({ title: 'Some error (and it is not identical) example' })); + // test('should group events with partially different titles', async () => { + // await worker.handle(generateTask({ title: 'Some error (but not filly identical) example' })); + // await worker.handle(generateTask({ title: 'Some error (yes, it is not the identical) example' })); + // await worker.handle(generateTask({ title: 'Some error (and it is not identical) example' })); - const originalEvent = await eventsCollection.findOne({}); + // const originalEvent = await eventsCollection.findOne({}); - expect((await repetitionsCollection.find({ - groupHash: originalEvent.groupHash, - }).toArray()).length).toBe(2); - }); + // expect((await repetitionsCollection.find({ + // groupHash: originalEvent.groupHash, + // }).toArray()).length).toBe(2); + // }); describe('Pattern matching', () => { beforeEach(() => { From 1dfb4a55340205a84e21bc484c4739d2f27b006a Mon Sep 17 00:00:00 2001 From: Tatiana Fomina Date: Wed, 6 Aug 2025 18:07:55 +0300 Subject: [PATCH 4/4] fix(groupper): Remove sending duplicate key error (#438) * Remove sending error * Lint --- workers/grouper/src/index.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/workers/grouper/src/index.ts b/workers/grouper/src/index.ts index 4a2809aad..b1eb7883c 100644 --- a/workers/grouper/src/index.ts +++ b/workers/grouper/src/index.ts @@ -19,7 +19,6 @@ import type { import type { RepetitionDBScheme } from '../types/repetition'; import { DatabaseReadWriteError, DiffCalculationError, ValidationError } from '../../../lib/workerErrors'; import { decodeUnsafeFields, encodeUnsafeFields } from '../../../lib/utils/unsafeFields'; -import HawkCatcher from '@hawk.so/nodejs'; import { MS_IN_SEC } from '../../../lib/utils/consts'; import DataFilter from './data-filter'; import RedisHelper from './redisHelper'; @@ -177,7 +176,6 @@ export default class GrouperWorker extends Worker { * and we need to process this event as repetition */ if (e.code?.toString() === DB_DUPLICATE_KEY_ERROR) { - HawkCatcher.send(new Error('[Grouper] MongoError: E11000 duplicate key error collection')); await this.handle(task); return;