Skip to content

Commit

Permalink
Remove UTM tags from url and guid
Browse files Browse the repository at this point in the history
This helps prevent duplicate entries from appearing -- some with UTM tags
and others without.
  • Loading branch information
brutasse committed Oct 16, 2016
1 parent 3f21740 commit 2422bf0
Show file tree
Hide file tree
Showing 4 changed files with 369 additions and 1 deletion.
3 changes: 2 additions & 1 deletion feedhq/feeds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
from .tasks import (ensure_subscribed, store_entries, update_favicon,
update_feed)
from .utils import (epoch_to_utc, FAVICON_FETCHER, get_job, is_feed,
JobNotFound, USER_AGENT)
JobNotFound, remove_utm_tags, USER_AGENT)
from .. import es
from ..storage import OverwritingStorage
from ..tasks import enqueue
Expand Down Expand Up @@ -341,6 +341,7 @@ def entry_data(cls, entry, parsed):
data['guid'] = entry.title
if not data['guid']:
return
data['guid'] = remove_utm_tags(data['guid'])
if 'description' in entry:
data['subtitle'] = entry.description
if 'summary' in entry:
Expand Down
10 changes: 10 additions & 0 deletions feedhq/feeds/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import datetime
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

from django.utils import timezone

Expand Down Expand Up @@ -36,3 +37,12 @@ def get_job(name):
if not redis.exists(key):
raise JobNotFound
return job_details(name, connection=redis)


def remove_utm_tags(guid):
    """Return ``guid`` with every ``utm_*`` query parameter removed.

    The string is split with ``urlsplit``, the query component is parsed,
    parameters whose name starts with ``utm_`` are dropped, and the pieces
    are put back together with ``urlunsplit``.

    Side effects of the round-trip that callers should be aware of:

    * the remaining parameters are re-encoded sorted by name, so the output
      is stable regardless of the order they appeared in;
    * parameters with a blank value are discarded, because ``parse_qs`` is
      called with its default ``keep_blank_values=False``.

    The input is not necessarily a well-formed URL. If ``urlsplit`` rejects
    it with ``ValueError`` (for example an unbalanced ``[`` in the host
    part), the value is returned unchanged instead of letting the exception
    propagate to the caller.
    """
    try:
        parts = list(urlsplit(guid))
    except ValueError:
        # Not parseable as a URL: there is nothing we can safely strip.
        return guid
    qs = parse_qs(parts[3])  # [3] is the query component
    # Keys are unique in the parsed dict, so sorting the (key, values)
    # pairs only ever compares keys.
    filtered = sorted((k, v) for k, v in qs.items()
                      if not k.startswith('utm_'))
    parts[3] = urlencode(filtered, doseq=True)
    return urlunsplit(parts)
Loading

0 comments on commit 2422bf0

Please sign in to comment.