Skip to content

Commit

Permalink
Utils: +runtime checks of config lists' uniqueness; fix the many config errors exposed by the new checks
Browse files Browse the repository at this point in the history
  • Loading branch information
gwern committed Sep 3, 2023
1 parent 7799fcf commit 8896f36
Show file tree
Hide file tree
Showing 14 changed files with 1,448 additions and 1,567 deletions.
3 changes: 2 additions & 1 deletion build/Config/GenerateSimilar.hs
@@ -1,6 +1,7 @@
module Config.GenerateSimilar where

import Data.List (isPrefixOf, isSuffixOf)
import Utils (isUniqueList)

-- how many results do we want?
bestNEmbeddings :: Int
Expand All @@ -27,5 +28,5 @@ embeddingsPath = "metadata/embeddings.bin"

-- some weird cases: for example, “Estimating the effect-size of gene dosage on cognitive ability across the coding genome” is somehow close to *every* embedding...?
-- Is path/URL `p` blacklisted? True when `p` is exactly one of the hand-listed
-- entries below, or when it starts with "/doc/" and ends with "/index".
--
-- The hand-maintained list is passed through `isUniqueList` (imported from Utils).
-- NOTE(review): presumably `isUniqueList` returns the list unchanged and fails at
-- runtime if an entry is duplicated -- confirm against its definition in Utils.
blackList :: String -> Bool
blackList p = p `elem` blackListed || ("/doc/" `isPrefixOf` p && "/index" `isSuffixOf` p)
  where
    -- exact-match entries; keep one per line so duplicates are easy to spot in diffs
    blackListed :: [String]
    blackListed = isUniqueList
      [ "https://www.biorxiv.org/content/10.1101/2020.04.03.024554.full"
      , "/doc/genetics/heritable/correlation/2019-kandler.pdf"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4210287/"
      , "https://www.wired.com/1996/12/ffglass/"
      , "https://andrewmayneblog.wordpress.com/2021/05/18/a-simple-method-to-keep-gpt-3-focused-in-a-conversation/"
      , "https://www.dutchnews.nl/news/2022/07/german-fighter-pilot-identified-after-79-years-from-dna-on-envelope/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1065034/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2653069/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2925254/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2998793/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4763788/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4921196/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6022844/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8931369/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9232116/"
      , "https://www.statnews.com/2022/07/28/abandoned-technique-revived-in-effort-to-make-artificial-human-eggs/"
      , "https://www.thenationalnews.com/health/2022/09/07/woman-who-can-smell-parkinsons-helps-scientists-develop-new-test-for-condition/"
      , "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4898064/"
      ]
815 changes: 320 additions & 495 deletions build/Config/Inflation.hs

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions build/Config/Interwiki.hs
Expand Up @@ -2,12 +2,13 @@
module Config.Interwiki where

import qualified Data.Text as T (Text)
import Utils (isUniqueList, isUniqueKeys)

-- Hand-maintained list of names that each contain an apostrophe (eg. "Kinko's").
-- NOTE(review): presumably these are titles whose quote character must be treated
-- specially by the interwiki code -- confirm against the callers of this list.
--
-- The list is passed through `isUniqueList` (imported from Utils).
-- NOTE(review): presumably that returns the list unchanged and fails at runtime
-- on a duplicated entry -- confirm against its definition in Utils.
quoteOverrides :: [T.Text]
quoteOverrides = isUniqueList
  [ "Antoine's"
  , "Bloomingdale's"
  , "Collier's"
  , "Kinko's"
  , "Mzoli's"
  , "Security_hacker#Birth_of_subculture_and_entering_mainstream:_1960's-1980's"
  , "Security hacker#Birth of subculture and entering mainstream: 1960's-1980's"
  ]

redirectDB :: [(T.Text, T.Text)]
redirectDB = [
redirectDB = isUniqueKeys [
("https://en.wikipedia.org/wiki/WP:RS", "https://en.wikipedia.org/wiki/Wikipedia:Reliable_sources")
, ("https://en.wikipedia.org/wiki/1000_Genomes", "https://en.wikipedia.org/wiki/1000_Genomes_Project")
, ("https://en.wikipedia.org/wiki/120_Days_of_Sodom", "https://en.wikipedia.org/wiki/The_120_Days_of_Sodom")
Expand Down
8 changes: 2 additions & 6 deletions build/Config/LinkArchive.hs
@@ -1,6 +1,6 @@
module Config.LinkArchive where

import Utils (sed, anyInfix, anyPrefix, anySuffix, replace)
import Utils (sed, anyInfix, anyPrefix, anySuffix, replace, isUniqueList)

archiveDelay, archivePerRunN :: Integer
archiveDelay = 60
Expand Down Expand Up @@ -94,7 +94,7 @@ whiteList url
| anyInfix url ["citeseerx.ist.psu.edu"] = False -- TODO: after fixing all existing Citeseer links, set this rule to False
| anyPrefix url ["/", "./", "../", "https://gwern.net", "#", "!", "$", "mailto", "irc", "/metadata/", "/doc/"] = True
| anySuffix url [".pdf", "/pdf", ".pdf#"] = False
| anyInfix url [
| anyInfix url $ isUniqueList [
"archive.org"
, "web.archive.org" -- TODO: we want to avoid IA links long-term (see <https://gwern.net/archiving#why-not-internet-archive>), so once all the regular links are archived, remove IA from the whitelist so they start archiving too
, ".txt" -- TODO: generalize the PDF download to handle all non-HTML filetypes
Expand All @@ -106,7 +106,6 @@ whiteList url
, ".png"
, ".ogg"
, ".jpg"
, ".ogg"
, "halshs.archives-ouvertes.fr/"
, "apenwarr.ca"
, "distill.pub"
Expand Down Expand Up @@ -140,7 +139,6 @@ whiteList url
, "cdlib.org"
, "econlib.org"
, "ssgac.org"
, "stlouisfed.org"
, "davidsongifted.org"
, "projecteuclid.org"
, "erowid.org"
Expand Down Expand Up @@ -531,7 +529,6 @@ whiteList url
, "touhouwiki.net" -- stable
, "epjournal.net"
, "gwern.net" -- redundant
, "gwern.net" -- redundant
, "lwn.net" -- stable
, "incompleteideas.net" -- stable
, "videolectures.net" -- service/interactive
Expand Down Expand Up @@ -923,7 +920,6 @@ whiteList url
, "https://universome.github.io/stylegan-v" -- video embed
, "https://openaipublic.blob.core.windows.net/webgpt-answer-viewer/index.html" -- interactive
, "http://recur-env.eba-rm3fchmn.us-east-2.elasticbeanstalk.com/" -- interactive
, "http://recur-env.eba-rm3fchmn.us-east-2.elasticbeanstalk.com/" -- interactive
, "https://pandoc.org/" -- homepage
, "https://caniuse.com/" -- updated
, "https://www.vesta.earth/" -- homepage
Expand Down
3 changes: 2 additions & 1 deletion build/Config/LinkAuto.hs
Expand Up @@ -3,11 +3,12 @@ module Config.LinkAuto where

import Data.List (sortBy)
import qualified Data.Text as T (length, Text)
import Utils (isUniqueAll)

-- descending order, longest match to shortest (for regex priority):
-- WARNING: we appear to be hitting some sort of exponential slowdown despite the optimizations. From now on, delete at least one rewrite for every added rewrite. Many are unnecessary.
custom :: [(T.Text, T.Text)]
custom = sortBy (\a b -> compare (T.length $ fst b) (T.length $ fst a)) [
custom = sortBy (\a b -> compare (T.length $ fst b) (T.length $ fst a)) $ isUniqueAll [
("(1-Lipschitz|Lipschitz)", "https://en.wikipedia.org/wiki/Lipschitz_continuity")
, ("(A2C|A3C|[Aa]synchronous [Aa]dvantage [Aa]ctor-[Cc]ritic)", "https://arxiv.org/abs/1602.01783#deepmind")
, ("(ADHD|[Aa]ttention[ -][Dd]eficit [Hh]yperactivity [Dd]isorder)s?", "https://en.wikipedia.org/wiki/Attention_deficit_hyperactivity_disorder")
Expand Down

0 comments on commit 8896f36

Please sign in to comment.