Skip to content

Commit

Permalink
Improve Telligent importer
Browse files Browse the repository at this point in the history
* Try multiple filenames and do lots of guessing when searching for attachments
* Unescape HTML in filenames and replace invalid characters in filenames
* Existing permalinks prevented resuming of import
* Prevent duplicate attachments in same post
  • Loading branch information
gschlager committed Aug 13, 2018
1 parent dadbf2e commit 7f4ef3d
Showing 1 changed file with 60 additions and 16 deletions.
76 changes: 60 additions & 16 deletions script/import_scripts/telligent.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
require_relative 'base'
require 'tiny_tds'

# Import script for Telligent communities
#
# Users are currently imported from a temp table. This will need some
# work the next time this import script is used, because that table
# won't exist. Also, it's really hard to find all attachments, but
# the script tries to do it anyway.

class ImportScripts::Telligent < ImportScripts::Base
BATCH_SIZE ||= 1000
# Avatar stored in the local Telligent file store ("~/...-avatars/<path>/<filename>").
LOCAL_AVATAR_REGEX ||= /\A~\/.*(?<directory>communityserver-components-(?:selectable)?avatars)\/(?<path>[^\/]+)\/(?<filename>.+)/i
# Avatar referenced by an absolute http(s) URL.
REMOTE_AVATAR_REGEX ||= /\Ahttps?:\/\//i
# Link to an attachment embedded in a post body. There must be exactly one
# definition of this constant: with `||=` a second assignment is a silent no-op.
# `filename1` is the file name taken from the href, `filename2` is the link text;
# both are tried when looking for the file on disk.
EMBEDDED_ATTACHMENT_REGEX ||= /<a href="\/cfs-file(?:\.ashx)?\/__key\/(?<directory>[^\/]+)\/(?<path>[^\/]+)\/(?<filename1>.+)">(?<filename2>.*?)<\/a>/i

CATEGORY_LINK_NORMALIZATION = '/.*?(f\/\d+)$/\1'
TOPIC_LINK_NORMALIZATION = '/.*?(f\/\d+\/t\/\d+)$/\1'
Expand Down Expand Up @@ -174,7 +181,8 @@ def import_categories

if category_id = replace_with_category_id(row, child_categories, parent_category_id)
add_category(row['ForumId'], Category.find_by_id(category_id))
Permalink.create(url: "f/#{row['ForumId']}", category_id: category_id)
url = "f/#{row['ForumId']}"
Permalink.create(url: url, category_id: category_id) unless Permalink.exists?(url: url)
nil
else
{
Expand Down Expand Up @@ -268,7 +276,8 @@ def import_topics
post_create_action: proc do |post|
topic = post.topic
Jobs.enqueue_at(topic.pinned_until, :unpin_topic, topic_id: topic.id) if topic.pinned_until
Permalink.create(url: "f/#{row['ForumId']}/t/#{row['ThreadId']}", topic_id: topic.id)
url = "f/#{row['ForumId']}/t/#{row['ThreadId']}"
Permalink.create(url: url, topic_id: topic.id) unless Permalink.exists?(url: url)
end
}

Expand Down Expand Up @@ -345,7 +354,7 @@ def import_posts
end

def raw_with_attachment(row, user_id)
raw, embedded_paths = replace_embedded_attachments(row["Body"], user_id)
raw, embedded_paths, upload_ids = replace_embedded_attachments(row["Body"], user_id)
raw = html_to_markdown(raw) || ""

filename = row["FileName"]
Expand All @@ -358,13 +367,16 @@ def raw_with_attachment(row, user_id)
"%02d" % row["ApplicationId"],
"%02d" % row["ApplicationContentTypeId"],
("%010d" % row["ContentId"]).scan(/.{2}/),
filename
clean_filename(filename)
)

unless embedded_paths.include?(path)
if File.exists?(path)
upload = @uploader.create_upload(user_id, path, filename)
raw << "\n" << @uploader.html_for_upload(upload, filename) if upload.present? && upload.persisted?

if upload.present? && upload.persisted? && !upload_ids.include?(upload.id)
raw << "\n" << @uploader.html_for_upload(upload, filename)
end
else
STDERR.puts "Could not find file: #{path}"
end
Expand All @@ -375,31 +387,63 @@ def raw_with_attachment(row, user_id)

# Replaces every embedded attachment link in +raw+ with the HTML for an
# uploaded copy of the referenced file.
#
# raw     - HTML body of the post.
# user_id - id passed to the uploader as the owner of created uploads.
#
# Returns [raw, paths, upload_ids]: the rewritten HTML, the on-disk paths of
# all files that were uploaded, and the ids of the resulting uploads. Callers
# can use the last two to avoid attaching the same file twice.
def replace_embedded_attachments(raw, user_id)
  paths = []
  upload_ids = []

  raw = raw.gsub(EMBEDDED_ATTACHMENT_REGEX) do
    filename, path = attachment_path(Regexp.last_match)

    # File.exist? instead of File.exists?, which no longer exists in Ruby 3.2+.
    if File.exist?(path)
      upload = @uploader.create_upload(user_id, path, filename)

      if upload.present? && upload.persisted?
        paths << path
        upload_ids << upload.id
        @uploader.html_for_upload(upload, filename)
      end
    else
      # NOTE: the block yields nil here (and when the upload failed), so the
      # original link is removed from the post body.
      STDERR.puts "Could not find file: #{path}"
    end
  end

  [raw, paths, upload_ids]
end

# Turns a filename found in post HTML into a candidate for a name on disk:
# HTML entities are decoded first, then characters that are not allowed in
# filenames become underscores, and finally the listed "_XXXX" hex escape
# sequences are removed.
def clean_filename(filename)
  decoded = CGI.unescapeHTML(filename)
  sanitized = decoded.gsub(/[\x00\/\\:\*\?\"<>\|]/, '_')
  sanitized.gsub(/_(?:2B00|2E00|2D00|5B00|5D00|5F00)/, '')
end

# Resolves the filename and on-disk path for an embedded attachment.
#
# match_data - match of the embedded attachment pattern; must provide the
#              groups read by join_attachment_path.
#
# The link text (filename index 2) is tried first. If no file exists at the
# resulting path, the name taken from the href (filename index 1) is used
# instead, whether or not that file exists.
#
# Returns [filename, path].
def attachment_path(match_data)
  filename, path = join_attachment_path(match_data, filename_index: 2)
  # File.exist? instead of File.exists?, which no longer exists in Ruby 3.2+.
  return [filename, path] if File.exist?(path)

  join_attachment_path(match_data, filename_index: 1)
end

# Filenames are a total mess - try to guess the correct filename.
# Works for 70% of all files.
#
# match_data     - match providing the groups :directory, :path and
#                  :"filename<filename_index>".
# filename_index - which filename group to use (the callers pass 1 or 2).
#
# The base directory is built from ENV["FILE_BASE_DIR"], the directory group
# with "-" replaced by "." and the path group split on "-". The following
# names are then probed in order, and the first one that exists wins:
#   1. the cleaned filename as is
#   2. the same with "-" replaced by " "
#   3. variant 2 with "_" replaced by "-"
#
# Returns [filename, path]. When nothing is found, the cleaned filename and
# its (non-existent) path are returned.
def join_attachment_path(match_data, filename_index:)
  filename = clean_filename(match_data[:"filename#{filename_index}"])
  base_path = File.join(
    ENV["FILE_BASE_DIR"],
    match_data[:directory].gsub("-", "."),
    match_data[:path].split("-")
  )

  with_spaces = filename.gsub("-", " ")
  candidates = [filename, with_spaces, with_spaces.gsub("_", "-")]

  # File.exist? instead of File.exists?, which no longer exists in Ruby 3.2+.
  chosen = candidates.find { |candidate| File.exist?(File.join(base_path, candidate)) } || filename
  [chosen, File.join(base_path, chosen)]
end

def mark_topics_as_solved
Expand Down

0 comments on commit 7f4ef3d

Please sign in to comment.