# Chargement des commentaires depuis le xml 

In [1]:
import xml.etree.ElementTree as ET

# Parse straight from the path: ElementTree opens and closes the file itself and
# decodes it according to the XML document's own encoding, whereas a bare
# open(...).read() leaves the handle open and decodes with the platform default.
root = ET.parse('le-blog-voyage-delida-et-florian-2020-05-10T16 48 25.440680-all.xml').getroot()

# Display the root element to confirm the export was parsed.
root

<Element '{http://disqus.com}disqus' at 0x10caa61d0>

# Extraction des différents threads 

In [2]:
from collections import defaultdict

# Prefix map handed to every ElementTree lookup on the export.
namespaces = {'dsq': 'http://disqus.com'}


def _first_attribute(element):
    """Return the value of the first attribute carried by ``element``.

    NOTE(review): presumably this is the element's Disqus id -- confirm
    against the export format.
    """
    return list(element.attrib.values())[0]


# Maps a thread identifier to its posts, each stored as the tuple
# (author, timestamp, message, parent, post_id).
threads = defaultdict(list)

for post in root.findall('dsq:post', namespaces):
    message = post.find('dsq:message', namespaces=namespaces).text

    # A post without a <parent> element is recorded with an empty parent id.
    parent_element = post.find('dsq:parent', namespaces=namespaces)
    parent = '' if parent_element is None else _first_attribute(parent_element)

    author_element = post.find('dsq:author', namespaces=namespaces)
    author = author_element.find('dsq:name', namespaces=namespaces).text

    timestamp = post.find('dsq:createdAt', namespaces=namespaces).text

    thread = _first_attribute(post.find('dsq:thread', namespaces=namespaces))

    post_id = _first_attribute(post)

    threads[thread].append((author, timestamp, message, parent, post_id))

In [3]:
for thread, thread_posts in threads.items():
    # Order each thread's posts by parent id (tuple slot 3). The sort is stable
    # and done in place, so posts sharing a parent keep their relative order.
    thread_posts.sort(key=lambda entry: entry[3])

In [4]:
# Inspect the posts of whichever thread id `thread` was last bound to by the loops above.
threads[thread]

[('Rola Priatel',
  '2018-10-28T00:40:10Z',
  '<p>So nice to read and see Beautiful British Columbia through your eyes! It was such joy to have you and Elida here for two weeks from the moment you landed and I drove the car without the keys until you had to hop in a car and drive us and our friends to the Belgian Restaurant Chambar ( an institution here in Vancouver for years) and we made you join us before you headed back to the house...and all the wonderful guitar lessons and life lessons you ended up teaching our boys by them just hanging out with you! We will never forget those beautiful days and we still talk about you two! Now on this blog you were too modest in not writing more about the treacherous Grouse Grind and you must have a photo from the top after you finished  the straight vertical climb!  Also I wish we had a photo of the full turkey on the Thanksgiving photo 😄 <br>As you say it was a shame that we didn’t get the chance to go to the island and the Juan de Fuca passage

# Construction d'un arbre  

In [5]:
from anytree import Node, findall, RenderTree

In [6]:
# Build one anytree per thread: a synthetic root named after the thread id, with
# every post attached under its parent post (or under the root when top-level).
thread_roots = []
for key, posts in threads.items():
    thread_root = Node(key)

    # Exact post-id -> node lookup. Parents are matched by equality; a substring
    # test such as `name in parent_id` could attach a reply to the wrong node.
    nodes_by_id = {}
    # Ids present in this thread, used to recognise replies whose parent is
    # missing from the thread.
    known_ids = {entry[4] for entry in posts}

    pending = list(posts)  # work on a copy so `threads` is left untouched
    while pending:
        deferred = []
        for author, timestamp, message, parent, post_id in pending:
            if parent == '' or parent not in known_ids:
                # Top-level post, or a reply whose parent is absent from this
                # thread: hang it under the thread root instead of waiting
                # forever for a parent that will never appear.
                parent_node = thread_root
            else:
                parent_node = nodes_by_id.get(parent)

            if parent_node is None:
                # Parent exists but has not been created yet: retry next pass.
                deferred.append((author, timestamp, message, parent, post_id))
                continue

            nodes_by_id[post_id] = Node(
                post_id,
                parent=parent_node,
                author=author,
                timestamp=timestamp,
                message=message,
            )

        if len(deferred) == len(pending):
            # No post could be placed during a whole pass (only possible if the
            # parent links form a cycle). Attach the remainder to the root so
            # the loop always terminates.
            for author, timestamp, message, parent, post_id in deferred:
                nodes_by_id[post_id] = Node(
                    post_id,
                    parent=thread_root,
                    author=author,
                    timestamp=timestamp,
                    message=message,
                )
            break
        pending = deferred

    thread_roots.append(thread_root)

In [7]:
# Print the comment tree of the first thread as text.
print(RenderTree(thread_roots[0]))

Node('/6645732233')
├── Node('/6645732233/3882847907', author='Thibaud', message="<p>Ça a l'air de bien fonctionner mais peut-être que tu auras envie de modifier ton CSS pour centrer les photos et les espacer légèrement :D Tu pourrais faire un article sur le CSS ensuite :)</p>", timestamp='2018-05-02T16:40:25Z')
│   └── Node('/6645732233/3882847907/3885383220', author='Florian LB', message='<p>Salut Thibaud ! Déjà, merci pour le commentaire. Bravo, tu es le premier ! Le CSS, c\'est un sujet sensible, mais j\'ai demandé à une experte de nous prêter main forte pour améliorer ça. L\'idée de faire un article sur "comment ce blog est fait" me trotte dans la tête, mais je vais sans doute attendre encore pas mal de temps avant de l\'écrire. A bientôt !</p>', timestamp='2018-05-04T08:37:24Z')
├── Node('/6645732233/3883874178', author='ZRC', message="<p>Tu trouveras peut-être que la photographie, c'est comme l'optimisation : quand on optimise dans un ensemble limité de solutions, l'optimum est 

In [8]:
# All nodes below the first thread's root, as a flat tuple.
thread_roots[0].descendants

(Node('/6645732233/3882847907', author='Thibaud', message="<p>Ça a l'air de bien fonctionner mais peut-être que tu auras envie de modifier ton CSS pour centrer les photos et les espacer légèrement :D Tu pourrais faire un article sur le CSS ensuite :)</p>", timestamp='2018-05-02T16:40:25Z'),
 Node('/6645732233/3882847907/3885383220', author='Florian LB', message='<p>Salut Thibaud ! Déjà, merci pour le commentaire. Bravo, tu es le premier ! Le CSS, c\'est un sujet sensible, mais j\'ai demandé à une experte de nous prêter main forte pour améliorer ça. L\'idée de faire un article sur "comment ce blog est fait" me trotte dans la tête, mais je vais sans doute attendre encore pas mal de temps avant de l\'écrire. A bientôt !</p>', timestamp='2018-05-04T08:37:24Z'),
 Node('/6645732233/3883874178', author='ZRC', message="<p>Tu trouveras peut-être que la photographie, c'est comme l'optimisation : quand on optimise dans un ensemble limité de solutions, l'optimum est facile à atteindre, mais quand 

# Export des threads vers du html 

In [9]:
import pandas as pd
from collections import OrderedDict
from html import escape

# Maps each thread id (the name of the tree root) to an HTML fragment listing
# all of the thread's comments.
htmls = OrderedDict()
for thread_root in thread_roots:
    # Collect the pieces in a list and join once instead of growing a string.
    parts = ["<h2>Commentaires</h2><ul>"]
    for post in thread_root.descendants:
        timestamp = pd.Timestamp(post.timestamp).to_pydatetime().strftime('%Y-%m-%d %Hh%M')
        # The author name is escaped before being placed in markup so that
        # characters such as & or < cannot break or inject HTML. str() keeps
        # the previous rendering of a missing name.
        author = escape(str(post.author), quote=False)
        parts.append(f"<li><div>{author}, <em>{timestamp}</em></div>")
        # The message is inserted verbatim.
        # NOTE(review): it appears to already be HTML markup -- confirm.
        parts.append(f"<p>{post.message}</p></li>")
    parts.append("</ul>")
    html = "".join(parts)
    htmls[thread_root.name] = html

# Preview de tous les commentaires 

In [10]:
from IPython.display import HTML
# Concatenate every thread's HTML fragment and render the result inline.
HTML("".join(htmls.values()))

In [11]:
# Thread ids for which an HTML fragment was generated.
htmls.keys()

odict_keys(['6645732233', '6641311775', '6650763697', '6654637667', '6676813876', '6684882319', '6687556570', '6695191914', '6708926675', '6703528651', '6741046420', '6753644651', '6753759827', '6727164403', '6810510720', '6827540548', '6846800535', '6858178959', '6912768754', '6924405594', '7000727802'])

# Ecriture des fichiers html sur le disque 

Next, we iterate over the thread elements of the export and write one HTML file for each thread that has comments.

In [12]:
# Names of the HTML files written below, in the order the threads appear in the export.
html_files = []
for thread_elem in root.findall('dsq:thread', namespaces):
    thread_id = list(thread_elem.attrib.values())[0]
    url = thread_elem.find('dsq:link', namespaces).text
    # Last path component of the URL, cut at its first dot.
    slug = url.split('/')[-1].split('.')[0]
    # Only threads that actually received comments have an HTML fragment.
    if thread_id in htmls:
        print(f'writing {slug}, id: {thread_id}')
        fname = slug + '.html'
        html_files.append(fname)
        # The page declares charset=utf-8, so write it as UTF-8 explicitly
        # rather than with the platform default encoding, which may differ and
        # fail on accented characters or emoji.
        with open(fname, 'w', encoding='utf-8') as f:
            f.write(f"""<html>
            <head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>
            <body>{htmls[thread_id]}</body>
            </html>""")
            

writing bonjour-monde, id: 6641311775
writing photographie-en-voyage, id: 6645732233
writing pont-Occident-Orient, id: 6650763697
writing bonjour-du-village, id: 6654637667
writing elections-cedres, id: 6676813876
writing tripoli-beyrouth-qadisha-saida, id: 6684882319
writing manger-au-liban, id: 6687556570
writing arrivee-en-russie, id: 6695191914
writing au-revoir-liban, id: 6703528651
writing crapahutages-moscova, id: 6708926675
writing empire-du-milieu, id: 6727164403
writing chine-pas-de-course, id: 6741046420
writing impressions-chinoises, id: 6753644651
writing konnichiwa, id: 6753759827
writing nagano-osaka, id: 6810510720
writing great-ocean-road, id: 6827540548
writing animaux-australie, id: 6846800535
writing polynesie-tahiti, id: 6858178959
writing santiago-desert-atacama, id: 6912768754
writing perou-machu-picchu, id: 6924405594
writing vancouver-colombie-britannique, id: 7000727802


# Conversion en .tex 

And now using Pandoc to convert html to .tex for comment inclusion!

In [13]:
import os.path as op
import subprocess
import sys

# One "comment_<slug>.tex" target per HTML file written earlier.
comment_output_files = ["comment_" + comment_file.split('.html')[0] + '.tex' for comment_file in html_files]

for inp, outp in zip(html_files, comment_output_files):
    # Pass an argument list and skip the shell: file names containing spaces or
    # shell metacharacters are then handed to pandoc untouched.
    try:
        returncode = subprocess.call(["pandoc", inp, "-o", outp])
    except OSError as exc:
        # Typically raised when the pandoc executable cannot be found.
        print(f"could not run pandoc on {inp}: {exc}", file=sys.stderr)
    else:
        if returncode != 0:
            # Report the failure instead of silently dropping the exit status.
            print(f"pandoc failed on {inp} (exit code {returncode})", file=sys.stderr)

    # Emit the LaTeX include line for this file, without the ".tex" suffix.
    print("\\include{comments/" + outp[:-4] + "}")

\include{comments/comment_bonjour-monde}
\include{comments/comment_photographie-en-voyage}
\include{comments/comment_pont-Occident-Orient}
\include{comments/comment_bonjour-du-village}
\include{comments/comment_elections-cedres}
\include{comments/comment_tripoli-beyrouth-qadisha-saida}
\include{comments/comment_manger-au-liban}
\include{comments/comment_arrivee-en-russie}
\include{comments/comment_au-revoir-liban}
\include{comments/comment_crapahutages-moscova}
\include{comments/comment_empire-du-milieu}
\include{comments/comment_chine-pas-de-course}
\include{comments/comment_impressions-chinoises}
\include{comments/comment_konnichiwa}
\include{comments/comment_nagano-osaka}
\include{comments/comment_great-ocean-road}
\include{comments/comment_animaux-australie}
\include{comments/comment_polynesie-tahiti}
\include{comments/comment_santiago-desert-atacama}
\include{comments/comment_perou-machu-picchu}
\include{comments/comment_vancouver-colombie-britannique}


# Standalone latex files 

In [14]:
import os.path as op
import subprocess
import sys

# One "standalone_comment_<slug>.tex" target per HTML file written earlier.
standalone_comment_output_files = ["standalone_comment_" + comment_file.split('.html')[0] + '.tex' for comment_file in html_files]

for inp, outp in zip(html_files, standalone_comment_output_files):
    # Same conversion with pandoc's -s flag. An argument list is used and the
    # shell is skipped, so file names containing spaces or shell metacharacters
    # are handed to pandoc untouched.
    try:
        returncode = subprocess.call(["pandoc", inp, "-s", "-o", outp])
    except OSError as exc:
        # Typically raised when the pandoc executable cannot be found.
        print(f"could not run pandoc on {inp}: {exc}", file=sys.stderr)
    else:
        if returncode != 0:
            # Report the failure instead of silently dropping the exit status.
            print(f"pandoc failed on {inp} (exit code {returncode})", file=sys.stderr)

    # Emit the LaTeX include line for this file, without the ".tex" suffix.
    print("\\include{comments/" + outp[:-4] + "}")

\include{comments/standalone_comment_bonjour-monde}
\include{comments/standalone_comment_photographie-en-voyage}
\include{comments/standalone_comment_pont-Occident-Orient}
\include{comments/standalone_comment_bonjour-du-village}
\include{comments/standalone_comment_elections-cedres}
\include{comments/standalone_comment_tripoli-beyrouth-qadisha-saida}
\include{comments/standalone_comment_manger-au-liban}
\include{comments/standalone_comment_arrivee-en-russie}
\include{comments/standalone_comment_au-revoir-liban}
\include{comments/standalone_comment_crapahutages-moscova}
\include{comments/standalone_comment_empire-du-milieu}
\include{comments/standalone_comment_chine-pas-de-course}
\include{comments/standalone_comment_impressions-chinoises}
\include{comments/standalone_comment_konnichiwa}
\include{comments/standalone_comment_nagano-osaka}
\include{comments/standalone_comment_great-ocean-road}
\include{comments/standalone_comment_animaux-australie}
\include{comments/standalone_comment_pol