# Make JSON files with metadata from existing files

In [2]:
import koalas as kl

## Common words (Gutenberg)

In [2]:
# Load the Wiktionary/Project Gutenberg frequency list, then give the
# 'count (per billion)' column the shorter name 'frequency'.
wordlist = kl.read_csv('original_wordlists/words10000.wiktionary200604.csv')
wordlist = wordlist.rename(columns={'count (per billion)': 'frequency'})
# Show the frame as the cell output.
wordlist

Unnamed: 0,rank,word,frequency
0,1,the,5.627187e+07
1,2,of,3.395006e+07
2,3,and,2.994418e+07
3,4,to,2.595610e+07
4,5,in,1.742064e+07
5,6,i,1.176480e+07
6,7,that,1.107332e+07
7,8,was,1.007824e+07
8,9,his,8.799755e+06
9,10,he,8.397205e+06


In [3]:
# Show the rows whose 'word' value is missing (NaN).
wordlist[wordlist['word'].isna()]

Unnamed: 0,rank,word,frequency
14660,14661,,1465.97


In [5]:
# Row 14660 is the one row reported with a missing word in the previous cell.
# Presumably the literal word "null" was parsed as a missing value when the
# CSV was read (TODO confirm against the source file); put the string back.
wordlist.loc[14660, 'word'] = 'null'

In [6]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'word'  # column holding the list entries
name = 'common words Gutenberg'
provenance = dict(
    title='The 36663 most common words in Project Gutenberg',
    category='high frequency words',
    source='Wiktionary',
    link='https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#Project_Gutenberg',
    description='''Most common words in Project Gutenberg: These lists are the most frequent words, when performing a simple, straight (obvious) frequency count of all the books found on Project Gutenberg. The list of books was downloaded in July 2005, and "rsynced" monthly thereafter. These are mostly English words, with some other languages finding representation to a lesser extent. Many Project Gutenberg books are scanned once their copyright expires, typically book editions published before 1923, so the language does not necessarily always represent current usage. For example, "thy" is listed as the 280th most common word. Also, with 24,000+ books, the text of the boilerplate warning for Project Gutenberg appears on each of them.''',
    licence='Creative Commons Attribution-ShareAlike License',
    version='as of 2006-04-16',
)

In [7]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Common words (Google)

In [5]:
# Read the Google word list; the file has no header row, so the single
# column is named 'word' explicitly.
wordlist = kl.read_csv('sources/GoogleTop20000.txt', names=['word'])
# Show the frame as the cell output.
wordlist

Unnamed: 0,word
0,the
1,of
2,and
3,to
4,a
5,in
6,for
7,is
8,on
9,that


In [6]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'word'  # column holding the list entries
name = 'common words Google'
provenance = dict(
    title='20000 most common words according to Google',
    category='high frequency words',
    source='Google',
    link='https://github.com/first20hours/google-10000-english/blob/master/20k.txt',
    # Description text still speaks of 10,000 words although this is the 20k file.
    description='''This repo contains a list of the 10,000 most common English words in order of frequency, as determined by n-gram frequency analysis of the Google's Trillion Word Corpus. This repo is useful as a corpus for typing training programs. According to analysis of the Oxford English Corpus, the 7,000 most common English lemmas account for approximately 90% of usage, so a 10,000 word training corpus is more than sufficient for practical training applications.''',
    licence='Web 1T 5-gram Version 1 Agreement',
    version='2016/07/18 (commit: cb727a8ef0932d7d9519e8fe3973ad0a9600170d)',
)

In [7]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Common words (BYU)

In [36]:
# Load the BYU frequency list from the original spreadsheet.
wordlist = kl.read_excel('original_wordlists/WordFrequencyMostFrequ5000.xlsx')
# Write a copy with whitespace removed from the 'word' column to clean.csv.
# NOTE(review): the result of remove_whitespace is not assigned back to
# `wordlist`; unless the method works in place, the frame saved two cells
# below still carries the original whitespace — confirm this is intended.
wordlist.remove_whitespace(on='word').to_csv('clean.csv')

In [37]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'word'  # column holding the list entries
name = 'common words BYU'
provenance = dict(
    title='5000 Most Frequent words in a balanced corpus of American English',
    category='high frequency words',
    source='Brigham Young University',
    link='https://www.wordfrequency.info/free.asp',
    description='''The 5,000-60,000 word lists are based on the only large, genre-balanced, up-to-date corpus of American English -- the 560 million word Corpus of Contemporary American English (COCA).''',
    licence='free to use',
    version='unknown, fetched 2018/04/19',
)

In [38]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Stop words (Humboldt)

In [29]:
# Read the stop word list; the file has no header row, so the single column
# is named 'word' explicitly.
wordlist = kl.read_csv('original_wordlists/StopwordsList.txt', names=['word'])
# Show the frame as the cell output.
wordlist

Unnamed: 0,word
0,a
1,about
2,above
3,across
4,after
5,afterwards
6,again
7,against
8,all
9,almost


In [30]:
# Values handed to kl.lists.save(...) in the following cell.
# NOTE(review): the strings spell the source "Humbolt" while the section
# heading says "Humboldt"; left unchanged because `name` identifies the saved list.
termColumn = 'word'  # column holding the list entries
name = 'stopwords Humbolt'
provenance = dict(
    title='A list of 319 stopwords (Humbolt Stopwords)',
    category='stopwords',
    source='HumboltStopwords',
    link='http://xpo6.com/list-of-english-stop-words/',
    description='Stop Words are words which do not contain important significance to be used in Search Queries. Usually these words are filtered out from search queries because they return vast amount of unnecessary information.',
    licence='unknown',
    version='as of 2006-04-16',
)

In [31]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Stop words (Stanford)

In [23]:
# Read the Stanford CoreNLP stop word list; the file has no header row, so
# the single column is named 'word' explicitly.
wordlist = kl.read_csv('original_wordlists/StandfordNLPcoreNLPstopwords.txt', names=['word'])
# Show the frame as the cell output.
wordlist

Unnamed: 0,word
0,!!
1,?!
2,??
3,!?
4,`
5,``
6,''
7,-lrb-
8,-rrb-
9,-lsb-


In [24]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'word'  # column holding the list entries
name = 'stopwords Stanford'
provenance = dict(
    # NOTE(review): "Standford" in the title looks like a typo for "Stanford".
    title='257 Stopwords used in the Standford Core NLP functionalities',
    category='stopwords',
    source='Stanford NLP',
    link='https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt',
    description='Single words including punctuation and contracted words with apostrophe.',
    licence='GNU GPL v3',
    version='2016/01/23 (commit: ceefe81491183e557cd047755a5a88a90bfeb2af)',
)

In [25]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Stop words (NLTK)

In [20]:
# Read the NLTK stop word list; the file has no header row, so the single
# column is named 'word' explicitly.
wordlist = kl.read_csv('original_wordlists/NLTKstopwords.txt', names=['word'])
# Show the frame as the cell output.
wordlist

Unnamed: 0,word
0,i
1,me
2,my
3,myself
4,we
5,our
6,ours
7,ourselves
8,you
9,you're


In [21]:
# Values handed to kl.lists.save(...) in the following cell.
# The name must be specific to this list: it was previously 'stopwords Stanford'
# (copied from the preceding section), so the NLTK list was saved under the
# Stanford list's name.
name = 'stopwords NLTK'
provenance = {'title': '179 Stopwords used in NLTK',
              'category': 'stopwords',
              'source': 'NLTK',
              'link': 'http://www.nltk.org/nltk_data/',
              'description': 'Single words including contracted words with apostrophe.',
              'licence': 'unknown',
              'version': 'unknown'}
# Column holding the list entries.
termColumn = 'word'

In [22]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Stop words (MySQL)

In [23]:
# Read the MySQL stop word list; the file has no header row, so the single
# column is named 'word' explicitly.
wordlist = kl.read_csv('sources/MySQLstopWords.txt', names=['word'])
# Show the frame as the cell output.
wordlist

Unnamed: 0,word
0,a's
1,able
2,about
3,above
4,according
5,accordingly
6,across
7,actually
8,after
9,afterwards


In [24]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'word'  # column holding the list entries
name = 'stopwords MySQL'
provenance = dict(
    title='543 Stopwords used in MySQL',
    category='stopwords',
    source='Ranks.nl',
    link='https://www.ranks.nl/stopwords',
    description='Single words including contracted words with apostrophe.',
    licence='unknown',
    version='unknown',
)

In [25]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## British and American English

In [36]:
# Read the British/American spelling pairs (columns 'british' and 'american').
wordlist = kl.read_csv('original_wordlists/British_American_English.csv')
# Show the frame as the cell output.
wordlist

Unnamed: 0,british,american
0,accessorise,accessorize
1,accessorised,accessorized
2,accessorises,accessorizes
3,accessorising,accessorizing
4,acclimatisation,acclimatization
5,acclimatise,acclimatize
6,acclimatised,acclimatized
7,acclimatises,acclimatizes
8,acclimatising,acclimatizing
9,accoutrements,accouterments


In [37]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'british'           # column holding the terms
substitutionColumn = 'american'  # column holding the paired American spelling
name = 'British English'
provenance = dict(
    title='British English and American English variants',
    # NOTE(review): same category as the frequency lists; confirm it is
    # intended for a spelling-variant list.
    category='high frequency words',
    source='unknown',
    link='unknown',
    description='A list of words whose spelling differs between British and American English.',
    licence='unknown',
    version='unknown',
)

In [38]:
# Hand the frame, its name, the provenance metadata, the term column and the
# substitution column to kl.lists.save (presumably this writes the JSON file —
# confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn,substitutionColumn)

## Cities

In [32]:
# Read the tab-separated city list (columns 'City' and 'Nation').
wordlist = kl.read_csv('sources/WikipediaTopCities.txt', sep='\t')
# Show the frame as the cell output.
wordlist

Unnamed: 0,City,Nation
0,Chongqing,China
1,Shanghai,China
2,Delhi,India
3,Beijing,China
4,Mumbai,India
5,Lagos,Nigeria
6,Chengdu,China
7,Karachi,Pakistan
8,Guangzhou,China
9,Istanbul,Turkey


In [33]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'City'  # column holding the list entries
name = 'cities'
provenance = dict(
    title='List of the 223 largest cities',
    category='geo-political entities',
    source='Wikipedia',
    link='https://en.wikipedia.org/wiki/List_of_largest_cities',
    description='A list of large cities and the countries they lie in.',
    licence='Creative Commons Attribution-ShareAlike License',
    version='04:40, 18 April 2018',
)

In [34]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Demonyms

In [8]:
# Load the demonym table, shorten the column names, then clean each of the
# three columns in turn.
# NOTE(review): each `.str.replace` presumably acts on the column named in the
# preceding `remove_qualifiers(on=...)` call — confirm against the koalas API.
wordlist = (kl.read_csv('original_wordlists/demonyms.csv')
              .rename(columns={'Adjectivals': 'adjective', 'Demonyms': 'demonym', 'Country name': 'country'})
              # country: drop everything from the first comma or slash onward
              .remove_qualifiers(on='country')
              .str.replace(r',.*|/.*', '')
              # adjective: same truncation, then turn ' or ' into ' and '
              .remove_qualifiers(on='adjective')
              .str.replace(r',.*|/.*', '')
              .str.replace(r' or ', ' and ')
              # demonym: same as adjective, plus removal of a trailing 's'
              .remove_qualifiers(on='demonym')
              .str.replace(r',.*|/.*', '')
              .str.replace(r' or ', ' and ')
              .str.replace(r's$', '')
           )
# Show the frame as the cell output.
wordlist

Unnamed: 0,country,adjective,demonym
0,Abkhazia,Abkhaz,Abkhazian
1,Afghanistan,Afghan,Afghan
2,Åland Islands,Åland Island,Åland Islander
3,Albania,Albanian,Albanian
4,Algeria,Algerian,Algerian
5,American Samoa,American Samoan,American Samoan
6,Andorra,Andorran,Andorran
7,Angola,Angolan,Angolan
8,Anguilla,Anguillan,Anguillan
9,Antarctica,Antarctic,Antarctic resident


In [11]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'adjective'  # column holding the list entries
name = 'demonyms'
provenance = dict(
    title='List of countries and demonyms',
    category='geo-political entities',
    source='Wikipedia',
    link='https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations',
    description='List of countries and territories with adjectives and names of their inhabitants',
    licence='Creative Commons Attribution-ShareAlike License',
    version='06:57, 13 May 2018',
)

In [12]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Diseases

In [38]:
# Read the disease list; the file has no header row, so the single column is
# named 'term' explicitly.
wordlist = kl.read_csv('sources/Diseases.txt', names=['term'])
# Show the frame as the cell output.
wordlist

Unnamed: 0,term
0,Abdominal Aortic Aneurysm
1,Acanthamoeba
2,Acanthamoeba
3,Acanthamoeba Infection
4,ACE
5,Acinetobacter Infection
6,Acquired Immune Deficiency Syndrome
7,Acquired Immunodeficiency Syndrome
8,Adenovirus Infection
9,Adenovirus Vaccination


In [39]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'term'  # column holding the list entries
name = 'diseases'
provenance = dict(
    title='Diseases',
    category='domain specific terms',
    source='unknown',
    link='unknown',
    description='A list of 938 diseases including abbreviations and spelling variations.',
    licence='unknown',
    version='unknown',
)

In [40]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Chemical elements

In [2]:
# Build the element list inline: hydrogen through oganesson in order, with two
# spellings given for aluminium/aluminum and caesium/cesium.
wordlist = kl.WordFrame({'term': ['hydrogen','helium','lithium','beryllium','boron','carbon','nitrogen','oxygen','fluorine',
                'neon','sodium','magnesium','aluminium', 'aluminum','silicon','phosphorus','sulfur','chlorine','argon',
                'potassium','calcium','scandium','titanium','vanadium','chromium','manganese','iron','cobalt',
                'nickel','copper','zinc','gallium','germanium','arsenic','selenium','bromine','krypton',
                'rubidium','strontium','yttrium','zirconium','niobium','molybdenum','technetium','ruthenium',
                'rhodium','palladium','silver','cadmium','indium','tin','antimony','tellurium','iodine','xenon',
                'caesium', 'cesium','barium','lanthanum','cerium','praseodymium','neodymium','promethium','samarium',
                'europium','gadolinium','terbium','dysprosium','holmium','erbium','thulium','ytterbium',
                'lutetium','hafnium','tantalum','tungsten','rhenium','osmium','iridium','platinum','gold','mercury',
                'thallium','lead','bismuth','polonium','astatine','radon','francium','radium','actinium','thorium',
                'protactinium','uranium','neptunium','plutonium','americium','curium','berkelium','californium',
                'einsteinium','fermium','mendelevium','nobelium','lawrencium','rutherfordium','dubnium','seaborgium',
                'bohrium','hassium','meitnerium','darmstadtium','roentgenium','copernicium','nihonium','flerovium',
                'moscovium','livermorium','tennessine','oganesson']})
# Show the frame as the cell output.
wordlist

Unnamed: 0,term
0,hydrogen
1,helium
2,lithium
3,beryllium
4,boron
5,carbon
6,nitrogen
7,oxygen
8,fluorine
9,neon


In [3]:
# Values handed to kl.lists.save(...) in the following cell.
name = 'chemical elements'
# The description previously claimed "the first 119 chemical elements"; the
# list built above holds the 118 elements hydrogen..oganesson plus the two
# alternative spellings 'aluminum' and 'cesium' (120 terms in total).
provenance = {'title': 'Chemical elements',
              'category': 'domain specific terms',
              'source': 'unknown',
              'link': 'unknown',
              'description': 'A list of the 118 chemical elements, including the alternative spellings aluminum and cesium.',
              'licence': 'unknown',
              'version': 'unknown'}
# Column holding the list entries.
termColumn = 'term'

In [4]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)

## Accented characters

In [44]:
# Read the accented-character table (columns 'character' and 'replacement').
wordlist = kl.read_csv('sources/chars.csv')
# Show the frame as the cell output.
wordlist

Unnamed: 0,character,replacement
0,À,A
1,à,a
2,Á,A
3,á,a
4,Â,A
5,â,a
6,Ã,A
7,ã,a
8,Ä,A
9,ä,a


In [45]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'character'            # column holding the accented characters
substitutionColumn = 'replacement'  # column holding the ASCII replacements
name = 'accented characters'
provenance = dict(
    title='Accented characters',
    category='characters',
    source='AnAGram',
    link='unknown',
    description='A list of accented characters and ASCII characters that should be used to replace them.',
    licence='unknown',
    version='unknown',
)

In [46]:
# Hand the frame, its name, the provenance metadata, the term column and the
# substitution column to kl.lists.save (presumably this writes the JSON file —
# confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn, substitutionColumn)

## Greek characters

In [46]:
import string

# Latin-script names for the 25 lower-case code points 945..969 (alpha..omega).
# 'sigma' appears twice because that range contains both final sigma (962)
# and regular sigma (963).
mins_english = ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 'sigma', 'tau', 'ypsilon', 'phi', 'chi', 'psi', 'omega']

# Capitalised names for the upper-case letters. This used to iterate over an
# undefined name `english` (NameError); the source list is `mins_english`.
caps_english = [string.capwords(i) for i in mins_english]
# There is no capital final sigma, so drop the duplicate 'Sigma' ...
del caps_english[17]

# ... and the matching code point 930, which is not an assigned letter.
caps = [chr(i) for i in range(913, 938)]
del caps[17]

# All 25 lower-case code points, final sigma included.
mins = [chr(i) for i in range(945, 970)]

# Pair every Greek letter (capitals first, then lower case) with its
# Latin-script name.
wordlist = kl.WordFrame({'character': caps+mins, 'replacement': caps_english+mins_english})
# Show the frame as the cell output.
wordlist

Unnamed: 0,character,replacement
0,Α,Alpha
1,Β,Beta
2,Γ,Gamma
3,Δ,Delta
4,Ε,Epsilon
5,Ζ,Zeta
6,Η,Eta
7,Θ,Theta
8,Ι,Iota
9,Κ,Kappa


In [47]:
# Values handed to kl.lists.save(...) in the following cell.
termColumn = 'character'            # column holding the Greek letters
substitutionColumn = 'replacement'  # column holding the Latin-script names
name = 'greek characters'
provenance = dict(
    title='Greek characters',
    category='characters',
    source='Wikipedia',
    link='https://en.wikipedia.org/wiki/Greek_alphabet',
    description='A list of greek characters and their names in Latin script.',
    licence='free',
    version='800 B.C.',
)

In [48]:
# Hand the frame, its name, the provenance metadata, the term column and the
# substitution column to kl.lists.save (presumably this writes the JSON file —
# confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn, substitutionColumn)

## Academic words

In [7]:
# Read the Academic Word List (columns 'sublist' and 'word').
wordlist = kl.read_csv('original_wordlists/AWL_sublists.csv')
# Show the frame as the cell output.
wordlist

Unnamed: 0,sublist,word
0,1,analyse
1,1,analysed
2,1,analyser
3,1,analysers
4,1,analyses
5,1,analysing
6,1,analysis
7,1,analyst
8,1,analysts
9,1,analytic


In [8]:
name = 'academic words'
provenance = {'title': 'Academic words',
              'category': 'domain specific terms',
              'source': 'Dr Averil Coxhead, Victoria University of Wellington',
              'link': 'https://www.victoria.ac.nz/lals/resources/academicwordlist',
              'description': 'The list contains 570 word families which were selected according to principles. The list does not include words that are in the most frequent 2000 words of English. The AWL was primarily made so that it could be used by teachers as part of a programme preparing learners for tertiary level study or used by students working alone to learn the words most needed to study at tertiary institutions. The word list has been divided into sublists based on the frequency of occurrence of the words in the Academic Corpus. The words in Sublist 1 occur more frequently in the corpus than the other words in the list. Sublist 2 occurs with the next highest frequency.',
              'licence': 'unknown',
              'version': '2000'}
termColumn = 'word'

In [9]:
# Hand the frame, its name, the provenance metadata and the term column to
# kl.lists.save (presumably this writes the JSON file — confirm in koalas).
kl.lists.save(wordlist, name, provenance, termColumn)