In this notebook, we use Dynamic Topic Modeling (DTM) to explore how the terms used in speeches have changed over time.

In [None]:
import pandas as pd
import numpy as np

import pickle
import logging

import ast
import gensim
from gensim.models import Phrases, LdaModel, LdaSeqModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv("amazon_clean_discourses.tsv", sep="\t", encoding="utf-8")
df = df.drop(["discourse_link", "session", "original_discourse", "phase", "speaker", "party", "state"], axis=1)

# parse the date once and derive the year column from the parsed values
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['year'] = df['date'].dt.year

# The per-year document counts computed later are used as DTM time slices,
# which are applied to the corpus in document order. Sort chronologically so
# that every document falls into the slice of its own year. The stable sort
# keeps the original relative order of documents sharing a date.
df = df.sort_values('date', kind='stable')

# set date as index (drop=False keeps the column as well)
df = df.set_index(['date'], drop=False)

# tokenizing column
# NOTE: this overwrites the raw text in "tokenized" with token lists, so the
# cell is meant to be run once per kernel session.
df["tokenized"] = [gensim.utils.simple_preprocess(line) for line in df["tokenized"]]

In [None]:
# Train the phrase detector on the tokenized discourses, then wrap it in a
# Phraser, which is the object used below to transform documents.
bigram = Phrases(df["tokenized"], min_count=5, threshold=90)
bigram_mod = gensim.models.phrases.Phraser(bigram)


def make_bigrams(text):
    """Return a list with every document of ``text`` passed through ``bigram_mod``.

    ``text`` is an iterable of token lists; the result has one entry per document.
    """
    transformed = []
    for doc in text:
        transformed.append(bigram_mod[doc])
    return transformed


# Store the transformed documents in the frame ...
df["bigrams"] = make_bigrams(df["tokenized"])

# ... and keep them as an array holding one list of tokens per document
# (discourse); this is the input of the dictionary and corpus built below.
bigrams = df['bigrams'].values

A very important input for DTM is `time_slice`: a list containing the number of documents in each time slice. The slices are applied to the corpus in order, so the documents must be sorted chronologically. In our case, each time slice is one year.

In [None]:
# Count the documents per year. groupby sorts its keys, so the counts come back
# in chronological order.
gp = df.groupby(by=['year'])
yearly_counts = gp.size()

# Take the year labels from the same grouping as the counts. A plain
# min..max range would contain more labels than there are slices whenever a
# year has no documents, shifting every later label onto the wrong slice.
time_stamps = [int(year) for year in yearly_counts.index.get_level_values(0)]

# find out the time slice: number of documents in each year
total_yearly_list = list(yearly_counts)

time_slice = total_yearly_list
print(time_slice)

[164, 797, 337, 1070, 898, 874, 575, 1180, 1188, 1342, 483, 726, 568, 700, 404, 438, 273, 641, 298, 859, 296, 491]


In [None]:
# Build the dictionary from the tokenized documents: every token is mapped to
# a unique integer id.
dictionary = Dictionary(bigrams)

# Prune the vocabulary:
#   no_below=5   -> keep only tokens contained in at least 5 documents
#   no_above=0.6 -> keep only tokens contained in no more than 60% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.6)
# print(dictionary.token2id)

# Bag-of-words corpus: for each document, the (token id, frequency) pairs.
# It has to be built after the dictionary has been filtered.
texts = bigrams
corpus = list(map(dictionary.doc2bow, texts))
#pickle.dump(corpus, open('corpus.pkl', 'wb'))

# dictionary = pickle.load(open('amazon_dictionary.pkl', 'rb'))
# corpus = pickle.load(open('corpus.pkl', 'rb'))

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 50454
Number of documents: 14602


In [None]:
# Show gensim's training progress. basicConfig does nothing when the root
# logger already has handlers, so the level is also set explicitly.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

# use LdaSeqModel to generate DTM results
# The former `lda_model=lda_model` argument was removed: `lda_model` is never
# defined in this notebook (NameError on a fresh kernel) and LdaSeqModel only
# uses that argument when initialize='own'. With the default initialization the
# model trains its own LDA first.
# NOTE(review): training is stochastic and no seed is set here, so reruns can
# give different topics.
ldaseq = LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=12,
                     passes=1, lda_inference_max_iter=8, em_min_iter=4,
                     em_max_iter=6, chunksize=30)

INFO : using symmetric eta at 0.08333333333333333
INFO : using serial LDA version on this node
INFO : running online (single-pass) LDA training, 12 topics, 1 passes over the supplied corpus of 14602 documents, updating model once every 2000 documents, evaluating perplexity every 14602 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/14602
INFO : merging changes from 2000 documents into a model of 14602 documents
INFO : topic #11 (0.010): 0.009*"região" + 0.007*"desenvolvimento" + 0.005*"federal" + 0.004*"recursos" + 0.003*"rondônia" + 0.003*"nacional" + 0.003*"projeto" + 0.003*"programa" + 0.003*"acre" + 0.003*"áreas"
INFO : topic #4 (0.010): 0.006*"região" + 0.005*"projeto" + 0.005*"desenvolvimento" + 0.004*"federal" + 0.004*"nacional" + 0.004*"água" + 0.003*"rio" + 0.003*"recursos" + 0.003*"rondônia" + 0.003*"nordeste"
INFO : topic #9 (0.010): 0.006*"região" + 0.005*"federal" + 0.005*"desenvolvimento" + 0.004*"projeto" + 0.0

INFO : topic #7 (0.010): 0.006*"indígenas" + 0.005*"nacional" + 0.004*"defesa" + 0.004*"direito" + 0.004*"mundo" + 0.003*"terra" + 0.003*"federal" + 0.003*"democracia" + 0.003*"direitos" + 0.003*"bolsonaro"
INFO : topic #8 (0.010): 0.006*"dilma" + 0.005*"lula" + 0.005*"nordeste" + 0.005*"roraima" + 0.005*"dinheiro" + 0.004*"federal" + 0.004*"partido" + 0.004*"nacional" + 0.003*"população" + 0.003*"economia"
INFO : topic #5 (0.010): 0.010*"desenvolvimento" + 0.008*"região" + 0.006*"obras" + 0.006*"nacional" + 0.006*"pará" + 0.005*"federal" + 0.005*"investimentos" + 0.005*"mato_grosso" + 0.005*"energia" + 0.005*"recursos"
INFO : topic diff=0.598425, rho=0.377964
INFO : -9.333 per-word bound, 644.8 perplexity estimate based on a held-out corpus of 602 documents with 89641 words
INFO : PROGRESS: pass 0, at document #14602/14602
INFO : merging changes from 602 documents into a model of 14602 documents
INFO : topic #1 (0.010): 0.012*"lei" + 0.008*"bolsonaro" + 0.008*"medida_provisória" + 0.0

INFO : Fitting topic number 4
INFO : Computing bound, all times
INFO : initial sslm bound is -1875215.530560
INFO : Computing bound, all times
INFO : iteration 1 iteration lda seq bound is -1859657.813165 convergence is 0.008296
INFO : Computing bound, all times
INFO : iteration 2 iteration lda seq bound is -1853230.714110 convergence is 0.003456
INFO : Fitting topic number 5
INFO : Computing bound, all times
INFO : initial sslm bound is -3615257.481036
INFO : Computing bound, all times
INFO : iteration 1 iteration lda seq bound is -3595269.230727 convergence is 0.005529
INFO : Computing bound, all times
INFO : iteration 2 iteration lda seq bound is -3585495.997808 convergence is 0.002718
INFO : Fitting topic number 6
INFO : Computing bound, all times
INFO : initial sslm bound is -3664396.889790
INFO : Computing bound, all times
INFO : iteration 1 iteration lda seq bound is -3645643.120402 convergence is 0.005118
INFO : Computing bound, all times
INFO : iteration 2 iteration lda seq bo

INFO : initial sslm bound is -3412161.978087
INFO : Computing bound, all times
INFO : iteration 1 iteration lda seq bound is -3407975.038591 convergence is 0.001227
INFO : Computing bound, all times
INFO : iteration 2 iteration lda seq bound is -3405682.265612 convergence is 0.000673
INFO : Fitting topic number 3
INFO : Computing bound, all times
INFO : initial sslm bound is -1716579.077895
INFO : Computing bound, all times
INFO : iteration 1 iteration lda seq bound is -1713590.479498 convergence is 0.001741
INFO : Computing bound, all times
INFO : iteration 2 iteration lda seq bound is -1712732.874313 convergence is 0.000500
INFO : Fitting topic number 4
INFO : Computing bound, all times
INFO : initial sslm bound is -1664731.244861
INFO : Computing bound, all times
INFO : iteration 1 iteration lda seq bound is -1662030.900436 convergence is 0.001622
INFO : Computing bound, all times
INFO : iteration 2 iteration lda seq bound is -1660564.872871 convergence is 0.000882
INFO : Fitting to

In [None]:
# Persisting the trained model (disabled):
# ldaseq.save("dtm_model")
# NOTE(review): the disabled save above targets "dtm_model", while the model
# loaded below comes from "dtm_amazon" - confirm which file is the intended one.

# Replace the in-memory model with the one stored on disk.
dtm_model_path = "dtm_amazon"
ldaseq = LdaSeqModel.load(dtm_model_path)

Let's find the coherence values for the DTM: we have to specify the time-slice we want to find coherence for.

In [None]:
# Topics of the DTM at one time slice (index 1), in the format expected by
# CoherenceModel.
topics_dtm = ldaseq.dtm_coherence(time=1)

# u_mass coherence is computed from the bag-of-words corpus
cmass_DTM = CoherenceModel(
    topics=topics_dtm,
    corpus=corpus,
    dictionary=dictionary,
    coherence='u_mass',
)
u_mass_score = cmass_DTM.get_coherence()
print("DTM u_mass:", u_mass_score)

# c_v coherence is computed from the tokenized texts
ccv_DTM = CoherenceModel(
    topics=topics_dtm,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
c_v_score = ccv_DTM.get_coherence()
print("DTM ccv_mass:", c_v_score)

DTM u_mass: -1.3532025923197004
DTM ccv_mass: 0.4527922597570973


In [None]:
def topic_time(topic, time_stamps):
    '''Collect one topic's terms over time in a single dataframe.

    Parameters
    ----------
    topic : sequence
        One entry per time slice; each entry is a sequence of
        ``(word, value)`` pairs for that slice.
    time_stamps : sequence
        Label of each time slice, in the same order as ``topic``; used as
        column names.

    Returns
    -------
    pandas.DataFrame
        A ``words`` column followed by one column per time slice. The slices
        are combined with inner merges on ``words``, so only words present in
        every slice are kept.

    Raises
    ------
    ValueError
        If ``topic`` contains no time slice.
    IndexError
        If ``time_stamps`` has fewer entries than ``topic``.
    '''
    # The number of slices comes from `topic` itself rather than from a
    # notebook-level variable, so the function only depends on its arguments.
    if len(topic) == 0:
        raise ValueError("topic must contain at least one time slice")

    frames = []
    for i, terms in enumerate(topic):
        frame = pd.DataFrame(terms)
        frame.columns = ['words', time_stamps[i]]
        frames.append(frame)

    # merge left to right; 'words' is the only column the frames share
    topic_words_time = frames[0]
    for frame in frames[1:]:
        topic_words_time = pd.merge(topic_words_time, frame)

    return topic_words_time

In [None]:
# Top 500 terms of each analysed topic, for every time slice
# (topics 1 and 8 are skipped).
DTM0, DTM2, DTM3, DTM4, DTM5, DTM6, DTM7, DTM9, DTM10, DTM11 = (
    ldaseq.print_topic_times(topic=topic_id, top_terms=500)
    for topic_id in (0, 2, 3, 4, 5, 6, 7, 9, 10, 11)
)

In [None]:
def _export_topic(terms_by_time, csv_path):
    """Build the term-by-year table of one topic and write it to ``csv_path``."""
    table = topic_time(terms_by_time, time_stamps)
    table.to_csv(csv_path, sep=",")
    return table


# One table (and one csv file) per analysed topic.
topic0 = _export_topic(DTM0, "dtm_topic0.csv")      # economical development
topic2 = _export_topic(DTM2, "dtm_topic2.csv")      # social welfare
topic3 = _export_topic(DTM3, "dtm_topic3.csv")      # foreign affairs
topic4 = _export_topic(DTM4, "dtm_topic4.csv")      # water transposition
topic5 = _export_topic(DTM5, "dtm_topic5.csv")      # road projects
topic6 = _export_topic(DTM6, "dtm_topic6.csv")      # agricultural expansion
topic7 = _export_topic(DTM7, "dtm_topic7.csv")      # indigenous rights
topic9 = _export_topic(DTM9, "dtm_topic9.csv")      # national sovereignty
topic10 = _export_topic(DTM10, "dtm_topic10.csv")   # land rights
topic11 = _export_topic(DTM11, "dtm_topic11.csv")   # environmental protection

In [None]:
# topic 0: economical development
# terms (as stored in the `words` column) selected for the plot
rows0 = ['recursos', 'desenvolvimento', 'zona_franca', 'empresas',
         'suframa', 'dnit', 'sudam', 'incentivos_fiscais',
         'rondônia', 'regional']

# keep the selected terms and reshape to long format: one row per (word, year)
df0 = topic0.loc[topic0['words'].isin(rows0)]
df0 = pd.melt(df0, id_vars=["words"], var_name="year")

ec = px.line(df0,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Amazon's Development topic selected terms probability over time")

# English legend labels for the selected terms
newnames = {"recursos": "resources", "zona_franca": "MFTZ", "rondônia": "rondônia", "regional": "regional",
            "desenvolvimento": "development", "empresas": "companies", "sudam": "sudam",
            "incentivos_fiscais": "tax benefits", "suframa": "suframa", "dnit": "dnit"}

# relabel every trace in the legend and in its hover text
ec.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

ec.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_economic_development.html" pointed at the
# filesystem root.
ec.write_html("dtm_economic_development.html")
ec.show()

In [None]:
# topic 2: social welfare
# terms (as stored in the `words` column) selected for the plot
rows2 = ['educação', 'saúde', 'recursos', 'trabalho', 'desenvolvimento',
         'federal', 'sociais', 'jovens', 'políticas', 'direito']

# keep the selected terms and reshape to long format: one row per (word, year)
df2 = topic2.loc[topic2['words'].isin(rows2)]
df2 = pd.melt(df2, id_vars=["words"], var_name="year")

sw = px.line(df2,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Social Welfare topic selected terms probability over time")

# English legend labels for the selected terms
newnames = {"educação": "education", "saúde": "healthcare", "sociais": "social",
            "recursos": "resources", "trabalho": "labour", "desenvolvimento": "development",
            "direito": "rights", "jovens": "youth", "federal": "federal", "políticas": "policies"}

# relabel every trace in the legend and in its hover text
sw.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

sw.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_social_welfare.html" pointed at the filesystem
# root.
sw.write_html("dtm_social_welfare.html")
sw.show()

In [None]:
# topic 3: foreign affairs
# terms (as stored in the `words` column) selected for the plot
rows3 = ['mundo', 'países', 'internacional', 'desenvolvimento', 'mercosul',
         'relações_exteriores', 'economia', 'europa', 'américa_sul', 'venezuela']

# keep the selected terms and reshape to long format: one row per (word, year)
df3 = topic3.loc[topic3['words'].isin(rows3)]
df3 = pd.melt(df3, id_vars=["words"], var_name="year")

fa = px.line(df3,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Foreign Affairs topic selected terms probability over time")

# English legend labels for the selected terms
newnames = {'mundo': 'world', 'países': 'countries', 'internacional': 'international',
            'desenvolvimento': 'development', 'mercosul': 'mercosul', 'relações_exteriores': 'foreign affairs',
            'economia': 'economic', 'europa': 'europe', 'américa_sul': 'south america', 'venezuela': 'venezuela'}

# relabel every trace in the legend and in its hover text
fa.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

fa.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_foreign_affairs.html" pointed at the filesystem
# root.
fa.write_html("dtm_foreign_affairs.html")
fa.show()

In [None]:
# topic 4: water transposition
# terms (as stored in the `words` column) selected for the plot
rows4 = ['rio', 'francisco', 'recursos_hídricos', 'transposição', 'seca',
         'integração', 'sustentável', 'saneamento', 'semi_árido', 'projeto',
         'desenvolvimento']

# keep the selected terms and reshape to long format: one row per (word, year)
df4 = topic4.loc[topic4['words'].isin(rows4)]
df4 = pd.melt(df4, id_vars=["words"], var_name="year")

wt = px.line(df4,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Water Transposition topic selected terms probability over time")

# English legend labels for the selected terms
newnames = {"rio": "river", "francisco": "francisco", "seca": "dry",
            "recursos_hídricos": "water resources", "sustentável": "sustainable",
            "saneamento": "sanitation", "semi_árido": "semiarid", "projeto": "project",
            "desenvolvimento": "development", "transposição": "transposition",
            "integração": "integration"}

# relabel every trace in the legend and in its hover text
wt.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

wt.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_water_transposition.html" pointed at the
# filesystem root.
wt.write_html("dtm_water_transposition.html")
wt.show()

In [142]:
# topic 5: road projects
# terms (as stored in the `words` column) selected for the plot
rows5 = ["investimentos", "infraestrutura", "construção",
         "recursos", "obras", "transportes", "aeroportos", "privatização",
         "turismo"]

# keep the selected terms and reshape to long format: one row per (word, year)
df5 = topic5.loc[topic5['words'].isin(rows5)]
df5 = pd.melt(df5, id_vars=["words"], var_name="year")

rp = px.line(df5,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Road Projects topic selected terms probability over time")

# English legend labels for the selected terms
# ("recursos" was previously misspelled as "recourses"; the other plots of this
# notebook label it "resources")
newnames = {"investimentos": "investments", "infraestrutura": "infrastructure",
            "construção": "construction", "recursos": "resources", "obras": "works",
            "transportes": "transport", "aeroportos": "airports", "privatização": "privatization",
            "turismo": "tourism"}

# relabel every trace in the legend and in its hover text
rp.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

rp.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_road_projects.html" pointed at the filesystem
# root.
rp.write_html("dtm_road_projects.html")
rp.show()

In [141]:
# topic 6: agricultural expansion (2)
# terms (as stored in the `words` column) selected for the plot
rows6 = ['clima', 'aquecimento_global', 'sustentável',
         'mudanças_climáticas',
         'natureza', 'meio_ambiente', 'efeito_estufa']

# keep the selected terms and reshape to long format: one row per (word, year)
df6 = topic6.loc[topic6['words'].isin(rows6)]
df6 = pd.melt(df6, id_vars=["words"], var_name="year")

ae = px.line(df6,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Agricultural Expansion topic selected terms probability over time")

# English legend labels for the selected terms
newnames = {"clima": "climate", "sustentável": "sustainable", "natureza": "nature",
            "mudanças_climáticas": "climate change", "aquecimento_global": "global warming",
            "efeito_estufa": "greenhouse effect", 'meio_ambiente': 'environment'}

# relabel every trace in the legend and in its hover text
ae.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

ae.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_agricultural_expansion_2.html" pointed at the
# filesystem root.
ae.write_html("dtm_agricultural_expansion_2.html")
ae.show()

In [146]:
# topic 6: agricultural expansion (1)
# terms (as stored in the `words` column) selected for the plot
rows6 = ["mercado", 'produtores', 'agricultura', 'agronegócio',
         'economia', 'agricultura_familiar', 'investimentos', 'tecnologia', 'alimentos']

# keep the selected terms and reshape to long format: one row per (word, year)
df6 = topic6.loc[topic6['words'].isin(rows6)]
df6 = pd.melt(df6, id_vars=["words"], var_name="year")

ae = px.line(df6,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Agricultural Expansion topic selected terms probability over time")

# English legend labels for the selected terms
# ("tecnologia" was previously misspelled as "tecnology")
newnames = {"agricultura": "agriculture", "mercado": "market",
            "produtores": "producers", "tecnologia": "technology",
            "alimentos": "food", "agronegócio": "agribusiness", "economia": "economy",
            "agricultura_familiar": "family farming", "investimentos": "investments"}

# relabel every trace in the legend and in its hover text
ae.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

ae.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

ae.write_html("dtm_agricultural_expansion_1.html")
ae.show()

In [None]:
# topic 7: indigenous rights
# terms (as stored in the `words` column) selected for the plot
# ('constituição' used to be listed twice)
rows7 = ['violência', 'direitos', 'funai', 'direitos_humanos', 'crime', 'indígenas', 'demarcação',
         'reservas', 'constituição', 'assassinato']

# keep the selected terms and reshape to long format: one row per (word, year)
df7 = topic7[topic7['words'].isin(rows7)]
df7 = pd.melt(df7, id_vars=["words"], var_name="year")

ir = px.line(df7,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Indigenous Rights topic selected terms probability over time")

# English legend labels for the selected terms
newnames = {"indígenas": "indigenous", "direitos_humanos": "human_rights",
            "crime": "crime", "violência": "violence",
            "assassinato": "murdered", "direitos": "rights",
            "demarcação": "demarcation", "constituição": "constitution",
            "funai": "funai", "reservas": "reserves"}

# relabel every trace in the legend and in its hover text
ir.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

ir.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_indigenous_rights.html" pointed at the
# filesystem root.
ir.write_html("dtm_indigenous_rights.html")
ir.show()

In [None]:
# topic 9: national sovereignty
# terms (as stored in the `words` column) selected for the plot
rows9 = ['defesa', 'exército', 'forças_armadas', 'território',
         'fronteiras', 'drogas', 'violência', 'guerra',
         'vigilância']

# keep the selected terms and reshape to long format: one row per (word, year)
df9 = topic9.loc[topic9['words'].isin(rows9)]
df9 = pd.melt(df9, id_vars=["words"], var_name="year")

ns = px.line(df9,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="National Sovereignty topic selected terms probability over time")

# English legend labels for the selected terms
newnames = {"defesa": "defence", "exército": "army", "forças_armadas": "military",
            "violência": "violence", "território": "territory", "fronteiras": "borders",
            "drogas": "drugs", "guerra": "war", "vigilância": "surveillance"}

# relabel every trace in the legend and in its hover text
ns.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

ns.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_national_sovereignty.html" pointed at the
# filesystem root.
ns.write_html("dtm_national_sovereignty.html")
ns.show()

In [None]:
# topic 10: land rights
# terms (as stored in the `words` column) selected for the plot
rows10 = ['terra', 'incra', 'reforma_agrária', 'assentamentos', 'reserva', 'produtor', 'regularização_fundiária',
          'rural', 'campo', 'empresa']

# keep the selected terms and reshape to long format: one row per (word, year)
df10 = topic10.loc[topic10['words'].isin(rows10)]
df10 = pd.melt(df10, id_vars=["words"], var_name="year")

lr = px.line(df10,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Land Rights topic selected terms probability over time")

# English legend labels for the selected terms
newnames = {"terra": "land", "incra": "incra", "reforma_agrária": "agrarian reform",
            "assentamentos": "settlements", "reserva": "reserve",
            "rural": "rural", "campo": "farm", "produtor": "producers",
            "regularização_fundiária": "land regularization", "empresa": "company"}

# relabel every trace in the legend and in its hover text
lr.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

lr.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_land_rights.html" pointed at the filesystem
# root.
lr.write_html("dtm_land_rights.html")
lr.show()

In [None]:
# topic 11: environmental protection
# terms (as stored in the `words` column) selected for the plot
rows11 = ['desenvolvimento', 'sustentável', 'preservação', 'desmatamento',
          'proteção', 'conservação', 'destruição',
          'queimadas', 'código_florestal', 'meio_ambiente']

# keep the selected terms and reshape to long format: one row per (word, year)
df11 = topic11.loc[topic11['words'].isin(rows11)]
df11 = pd.melt(df11, id_vars=["words"], var_name="year")

ep = px.line(df11,
             x="year",
             y="value",
             color='words',
             color_discrete_sequence=px.colors.qualitative.Prism,
             labels={"words": "", "value": "probability"},
             width=900,
             height=700,
             title="Environmental protection selected terms probability over time")

# English legend labels for the selected terms
newnames = {'desenvolvimento': 'development', 'sustentável': 'sustainable',
            'preservação': 'preservation', 'desmatamento': 'deforestation',
            'proteção': 'protection', 'conservação': 'conservation', 'meio_ambiente': 'environment',
            'destruição': 'destruction', 'queimadas': 'fires', 'código_florestal': 'forest code'}

# relabel every trace in the legend and in its hover text
ep.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                     legendgroup=newnames[t.name],
                                     hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))

ep.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2000, 2005, 2010, 2015, 2020]))

# Written relative to the working directory, like the csv exports of this
# notebook; the previous "/dtm_environmental_protection.html" pointed at the
# filesystem root.
ep.write_html("dtm_environmental_protection.html")
ep.show()