Permalink
Browse files

Revamp importer logic to drop URLs

We've once again been bitten by the bug wherein we depend on the URLs
and end up causing duplicate posts or comment imports. While it was nice
to depend on those URLs, we don't need to.

This commit changes it so that our import_source lines are just
slash-separated tuples that we can generate from information we already
know plus information the remote site must provide (itemids, notably).

This commit also takes care of a long-standing issue by updating the
grabbed_until columns for jobs, so that long-running jobs won't get
handed off to a new worker while they're still busy.
  • Loading branch information...
1 parent e557ee2 commit 0196aafc5ba82baaf84dcdcbf5b9ebc00bacca57 @zorkian zorkian committed Nov 3, 2012
View
@@ -4,3 +4,4 @@
/ext
/etc/config*.pl
/build
+*.sw?
@@ -0,0 +1,65 @@
+#!/usr/bin/perl
+#
+# One-off maintenance script: for a given user, delete every entry that was
+# brought in by the content importer -- but only if all of the comments on
+# that entry were also imported.  Entries with any locally-posted comment are
+# left alone.  Destructive action requires an explicit codeword (see below).
+
+use v5.10;
+use strict;
+use lib "$ENV{LJHOME}/cgi-bin";
+BEGIN { require 'ljlib.pl'; }
+use LJ::Talk;
+use DW::Worker::ContentImporter::Local::Entries;
+use DW::Worker::ContentImporter::Local::Comments;
+
+use Getopt::Long;
+
+# -u/--user: account to operate on; -c/--confirm: codeword enabling deletion.
+my ( $user, $confirm );
+GetOptions(
+ 'user=s' => \$user,
+ 'confirm=s' => \$confirm,
+);
+
+my $u = LJ::load_user( $user )
+ or die "Usage: $0 -u USER -c CODEWORD\n";
+# Only the magic codeword arms actual deletion; anything else is a dry run.
+$confirm = $confirm && $confirm eq 'b00p' ? 1 : 0;
+
+# Select posts that were imported
+# Map appears to be keyed by import source with jitemid values -- see the
+# inverse usage below.  (NOTE(review): confirm against get_entry_map.)
+my %map = %{ DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {} };
+unless ( scalar keys %map > 0 ) {
+ say 'Account has no imported entries, nothing to do.';
+ exit 0;
+}
+
+# Nuke all entries that have been imported.
+# Invert the comment map so we can look up by jtalkid: any jtalkid present
+# as a key in %csrc is a comment that came from the importer.
+my %csrc_in = %{ DW::Worker::ContentImporter::Local::Comments->get_comment_map( $u ) || {} };
+my %csrc;
+$csrc{$csrc_in{$_}} = $_ foreach keys %csrc_in; # Invert it.
+
+foreach my $val ( keys %map ) {
+ my $jitemid = $map{$val};
+ say "$val (jitemid $jitemid) ...";
+
+ # Assume deletable until we find a comment that was NOT imported.
+ my $nuke = 1;
+ # 'L' is the talk nodetype used here for entry comments -- TODO confirm.
+ my %cmts = %{ LJ::Talk::get_talk_data( $u, 'L', $jitemid ) || {} };
+ foreach my $jtalkid ( keys %cmts ) {
+ next if exists $csrc{$jtalkid};
+
+ say " ... non-imported comment: $jtalkid";
+ $nuke = 0;
+ }
+
+ unless ( $nuke ) {
+ say ' ... NOT DELETING';
+ next;
+ }
+
+ # Deleting the entry; comments presumably go with it -- verify
+ # LJ::delete_entry semantics before relying on that.
+ if ( $confirm ) {
+ my $rv = LJ::delete_entry( $u, $jitemid, 0, undef );
+ if ( $rv ) {
+ say ' ... deleted';
+ } else {
+ say ' ... FAILED TO DELETE';
+ }
+ } else {
+ say ' ... no action, confirmation not set';
+ }
+}
+
+exit 0;
@@ -158,11 +158,6 @@ sub try_work {
$log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) );
$log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident/1024/1024 );
- # and xpost map
- my $xpost_map = $class->get_xpost_map( $u, $data ) || {};
- $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) );
- $log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident/1024/1024 );
-
# now backfill into jitemid_map
my ( %entry_source, %jitemid_map );
$log->( 'Filtering parameters: hostname=[%s], username=[%s].', $data->{hostname}, $data->{username} );
@@ -175,7 +170,7 @@ sub try_work {
( $turl =~ /\b$data->{username}\b/ ||
( $data->{usejournal} && $turl =~ /\b$data->{usejournal}\b/ ) );
- if ( $url =~ m!/(\d+)\.html$! ) {
+ if ( $url =~ m!/(\d+)(?:\.html)?$! ) {
my $jitemid = $1 >> 8;
$jitemid_map{$jitemid} = $entry_map->{$url};
$entry_source{$jitemid_map{$jitemid}} = $url;
@@ -184,6 +179,11 @@ sub try_work {
$log->( 'Entry map has %d entries post-prune.', scalar( keys %$entry_map ) );
$log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident/1024/1024 );
+ # now prepare the xpost map
+ my $xpost_map = $class->get_xpost_map( $u, $data ) || {};
+ $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) );
+ $log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident/1024/1024 );
+
foreach my $jitemid ( keys %$xpost_map ) {
$jitemid_map{$jitemid} = $xpost_map->{$jitemid};
$entry_source{$jitemid_map{$jitemid}} = "CROSSPOSTER " . $data->{hostname} . " " . $data->{username} . " $jitemid "
@@ -205,7 +205,7 @@ sub try_work {
( $turl =~ /\b$data->{username}\b/ ||
( $data->{usejournal} && $turl =~ /\b$data->{usejournal}\b/ ) );
- if ( $url =~ m!thread=(\d+)$! ) {
+ if ( $url =~ m!(?:thread=|/)(\d+)$! ) {
my $jtalkid = $1 >> 8;
$jtalkid_map->{$jtalkid} = $talk_map->{$url};
}
@@ -261,6 +261,10 @@ sub try_work {
# hit up the server for metadata
while ( defined $server_next_id && $server_next_id =~ /^\d+$/ ) {
+ # let them know we're still working
+ $job->grabbed_until( time() + 3600 );
+ $job->save;
+
$log->( 'Fetching metadata; max_id = %d, next_id = %d.', $server_max_id || 0, $server_next_id || 0 );
$title->( 'meta-fetch from id %d', $server_next_id );
@@ -459,6 +463,10 @@ sub try_work {
# start looping to fetch all of the comment bodies
while ( $lastid < $server_max_id ) {
+ # let them know we're still working
+ $job->grabbed_until( time() + 3600 );
+ $job->save;
+
$log->( 'Fetching bodydata; last_id = %d, max_id = %d.', $lastid || 0, $server_max_id || 0 );
my ( $reset_lastid, $reset_curid ) = ( $lastid, $curid );
@@ -142,7 +142,7 @@ sub try_work {
next unless $url =~ /\Q$data->{hostname}\E/ &&
$url =~ /\b$data->{username}\b/;
- unless ( $url =~ m!/(\d+)\.html$! ) {
+ unless ( $url =~ m!/(\d+)(?:\.html)?$! ) {
$log->( 'URL %s not of expected format in prune.', $url );
next;
}
@@ -167,22 +167,16 @@ sub try_work {
$title->( 'post-prune' );
- # used below for automatically determining prefixes
- my $url_prefix = 'http://' . $data->{username} . '.' . $data->{hostname};
- $url_prefix =~ s/_/-/g; # URLs use '-'
-
# this is a useful helper sub we use
my $count = 0;
my $process_entry = sub {
my $evt = $_[0];
- # URL remapping. It seems that sometimes LJ is returning a URL that
- # is prefixed with a hash mark, which is causing our duplicate check
- # to do report an all-clear, leading to dupes. It also makes comments
- # fail to import.
- $evt->{url} =~ s/^#/$url_prefix/;
+ # URL remapping. We know the username and the site, so we set this to
+ # something that is dependable.
+ $evt->{key} = $evt->{url} = $data->{hostname} . '/' . $data->{username} . '/' .
+ ( $evt->{itemid} * 256 + $evt->{anum} );
- $evt->{key} = $evt->{url};
$count++;
$log->( ' %d %s %s; mapped = %d (import_source) || %d (xpost).',
$evt->{itemid}, $evt->{url}, $evt->{logtime}, $entry_map->{$evt->{key}},
@@ -274,6 +268,10 @@ sub try_work {
# helper to load some events
my $fetch_events = sub {
+ # let them know we're still working
+ $job->grabbed_until( time() + 3600 );
+ $job->save;
+
$log->( 'Fetching %d items.', scalar @_ );
$title->( 'getevents - %d to %d', $_[0], $_[-1] );
@@ -152,7 +152,7 @@ sub insert_comment {
# load the data we need to make this comment
my $jitem = $EntryCache->{$cmt->{jitemid}} ||
LJ::Entry->new( $u, jitemid => $cmt->{jitemid} );
- my $source = ( $cmt->{entry_source} || $jitem->prop( "import_source" ) ) . "?thread=" . ( $cmt->{orig_id} << 8 );
+ my $source = ( $cmt->{entry_source} || $jitem->prop( "import_source" ) ) . "/" . ( $cmt->{orig_id} << 8 );
my $user = $cmt->{posterid} ? ( $UserCache->{$cmt->{posterid}} || LJ::load_userid( $cmt->{posterid} ) ) : undef;
# fix the XML timestamp to a useful timestamp

0 comments on commit 0196aaf

Please sign in to comment.