Prepare a file to generate the [`Wordle`](https://gher-ulg.github.io/Liege-Colloquium/topicwordle.html) using the list of topics (file [topic.md](https://github.com/gher-ulg/gher-ulg.github.io/blob/master/Liege-Colloquium/topics.md)).

In [16]:
import os
import re
import operator
from collections import Counter

File to read titles from:

In [17]:
datafile = "/home/ctroupin/ULiege/gher-ulg.github.io/Liege-Colloquium/topics.md"
jsonfile = "../data/topicsWordle.js"
jsonfile2 = "/home/ctroupin/ULiege/gher-ulg.github.io/data/topicsWordle.js"

In [18]:
topiclist = []
with open(datafile, "r") as f:
    for lines in f:
        # Split line with | as separator
        topic = lines.split("|")[2]
        # Convert to lower case
        topic = [x.lower() for x in topic.split()]
        # Remove final 's' (assuming it's plural)
        # topic = [x.rstrip()[:-1] if (x.rstrip()[-1] == "s") else x.rstrip() for x in topic]
        # Add to list
        topiclist.extend(topic)
# Convert list to string
wordstring = " ".join(topiclist)

## Remove punctuation and common words

In [54]:
def clean_words(wordstring):
    
    punctuation = [":", ".", "–", '"', ",", "'"]

    removelist = ["\n", "the", "in", "a", "of", "les",
                  "from" "le", "à", "but",
                  "and","to", "for", "at", "on", 
                  "no", "specific", "topic", "identifiable", "contents",
                  "several", "papers", "related", "easily"
                  "contents"]

    substdict = {"l’océan": "ocean",
                  "from": " ",
                  "équations": "equations",
                  "mesoscale/synoptic": "mesoscale synoptic",
                  "modeling": "modelling", "no specific topic": " ",
                  "sub-mesoscale": "submesoscale", 
                  "hydrodynamically": 'hydrodynamics', 
                  "hydrodynamical": 'hydrodynamics', 
                  "hydrodynamiques": 'hydrodynamics', 
                  "hydrodynamic": 'hydrodynamics', 
                  "seas": "sea", "changes": "change",
                  "waters": "water", 
                  "ecosystems": "ecosystem",
                  "estuarine": "estuaries",
                  "applicable": "applicable",
                  "tracers": "tracer",
                  "models": "modelling",
                  "re³-visited": "revisited",
                  "re-revisited": "revisited",
                  "environmental": "environments",
                  "remote sensing": "remote-sensing",
                  "bio-geo-chemical": "biogeochemical",
                  "analyses": "analysis",
                  "interaction": "interactions",
                  "prediction": "predictions"
                   }

    for symbols in punctuation:
        wordstring = wordstring.replace(symbols, ' ')
    for words in removelist:
        wordstring = re.sub(r"\b{}\b".format(words), ' ', wordstring)
    for oldvalue, newvalue in substdict.items():
        wordstring = re.sub(r"{}\b".format(oldvalue), newvalue, wordstring)
        
    return wordstring

## Count word frequency and sort

In [20]:
worddict = Counter(wordstring.split())
sortedworddict = sorted(worddict.items(), key=operator.itemgetter(1), reverse=True)

# Convert the dictionary to json
We should get something like this: 
```
var words = [
  {text: 'have', size: 102},
  {text: 'Oliver', size: 47},
  {text: 'say', size: 46},
  {text: 'said', size: 36}
]
```

In [21]:
with open(jsonfile, "w") as jf:
    jf.write("var words = [\n")
    for lines in sortedworddict:
        jf.write("".join(("\t{text: '", lines[0], "', size: ", str(lines[1]), "},\n")))
    jf.write("]")

# Build dictionary with the years for each word

In [22]:
clqdict = {}
with open(datafile, "r") as f:
    for lines in f:
        # Split line with | as separator
        year = int(lines.split("|")[1])
        topic = lines.split("|")[2]
        
        for wordstring in topic.split():
            # Convert to lower case
            wordstring = wordstring.lower()
            
            for symbols in punctuation:
                wordstring = wordstring.replace(symbols, ' ')
            for words in removelist:
                wordstring = re.sub(r"\b{}\b".format(words), ' ', wordstring)
            for oldvalue, newvalue in substdict.items():
                wordstring = re.sub(r"{}\b".format(oldvalue), newvalue, wordstring)
            
            for w in wordstring.split():
                if w not in removelist:
                    if w in clqdict.keys():
                        clqdict[w].insert(0, year)
                    else:
                        clqdict[w] = [year]

## Convert dictionary to json file

In [23]:
with open(jsonfile2, "w") as jf:
    jf.write("var words = [\n")
    jf.write("\t{text: 'colloquium', size: 102, yearlist: '1969-2018'},\n")
    for k, v in clqdict.items():
        jf.write("".join(("\t{text: '", k, "', size: ", str(len(v)), ", yearlist: ", str(v),"},\n")))
    jf.write("]")
print("Written file {}".format(jsonfile2))

Written file /home/ctroupin/ULiege/gher-ulg.github.io/data/topicsWordle.js


# Counting by decade
We will create a dictionary where the keys are the years.

In [83]:
topicdict = {}
with open(datafile, "r") as f:
    for lines in f:
        year = int(lines.split("|")[1].strip())
        # Split line with | as separator
        topic = lines.split("|")[2]
        # Convert to lower case
        topic = [x.lower() for x in topic.split()]
        # Remove final 's' (assuming it's plural)
        # topic = [x.rstrip()[:-1] if (x.rstrip()[-1] == "s") else x.rstrip() for x in topic]
        # Add to dictionary
        topicdict[year] = topic

Some years had an inclear title, so we hard-code it:

In [84]:
topicdict[1974] = ["phytoplankton",]
topicdict[1973] = ""
topicdict[1972] = ["Turbulence", "mixing", "internal", "waves"]

Loop on the years:

In [86]:
for yearmin in range(1971, 2020, 10):
    yearmax = yearmin + 10
    print("Working on decade {} - {}".format(yearmin, yearmax-1))
    topiclistdecade = []
    for yyyy in range(yearmin, yearmax):
        topiclistdecade.extend(topicdict[yyyy])
         
    wordstring = " ".join(topiclistdecade)
    wordstring2 = clean_words(wordstring)
    
    worddict = Counter(wordstring2.split())
    sortedworddict = sorted(worddict.items(), key=operator.itemgetter(1), reverse=True)
    print(sortedworddict)
    print("")

Working on decade 1971 - 1980
[('hydrodynamics', 3), ('modelling', 2), ('turbulence', 2), ('marine', 2), ('biochemical', 1), ('state', 1), ('variables', 1), ('evolution', 1), ('equations', 1), ('mathematical', 1), ('sea', 1), ('pollution', 1), ('Turbulence', 1), ('mixing', 1), ('internal', 1), ('waves', 1), ('phytoplankton', 1), ('continental', 1), ('shelf', 1), ('dynamics', 1), ('bottom', 1), ('estuaries', 1), ('fjords', 1), ('forecasting', 1), ('predictability', 1), ('ocean', 1), ('ecohydrodynamics', 1)]

Working on decade 1981 - 1990
[('hydrodynamics', 3), ('sea', 3), ('modelling', 3), ('ocean', 2), ('coupled', 2), ('ocean-atmosphere', 2), ('marine', 2), ('turbulence', 2), ('ice', 2), ('semi-enclosed', 1), ('equatorial', 1), ('remote-sensing', 1), ('shelf', 1), ('interfaces', 1), ('ecohydrodynamics', 1), ('three-dimensional', 1), ('estuaries', 1), ('dynamics', 1), ('small-scale', 1), ('mixing', 1), ('mesoscale', 1), ('synoptic', 1), ('coherent', 1), ('structures', 1), ('geophysical'