# Análisis de la transcripción del pleno del parlamento Andaluz.

## Extracción de datos

Primero, se ha tratado el PDF y se ha convertido a `.txt`.
A continuación, se lee y se almacena en una lista, línea por línea.

In [2]:
# Read the whole plain-text transcript and cut it into lines on "\n" only.
with open("sumtext.txt", "r", encoding="utf8") as transcript:
    lines = transcript.read().split("\n")

In [3]:
# Lines of the transcript that survive the clean-up below.
document = []
# A line containing any of these fragments is discarded entirely.
line_erasers = ["LENO","\x0c","úm. 14","XI LEGISLATURA","DSPA_11_014","Pág.","2 de mayo de 2019","11-19/CAU-000001","11-19"]
# These fragments are deleted from every line before the discard check.
take_out_words = ["\n","—","PLENO"]

for raw_line in lines:
    cleaned = raw_line
    for fragment in take_out_words:
        cleaned = cleaned.replace(fragment, "")
    if not any(marker in cleaned for marker in line_erasers):
        document.append(cleaned)

# 1 Núm. 14 XI LEGISLATURA 2 de mayo de 201 11-19/DVOT-000012

El objeto `Intervencion` servirá para guardar los fragmentos de conversación: la persona que habla, la línea de inicio, el texto y la línea final. Guardamos las líneas para poder encontrar luego las respuestas en el texto.

In [4]:
class Intervencion:
    """One uninterrupted speech fragment of the plenary session.

    Attributes are stored exactly as received:
    persona -- name of the speaker
    inicio  -- index of the line where the fragment starts
    texto   -- text of the fragment
    fin     -- index of the line where the fragment ends
    """

    def __init__(self, persona, inicio, texto, fin):
        self.persona, self.inicio = persona, inicio
        self.texto, self.fin = texto, fin

Se crea un diccionario `intervenciones` con una estructura `{persona: intervenciones}`, donde `persona` es el nombre de quien interviene e `intervenciones` es una lista que contiene todas sus intervenciones.

In [8]:
# Speaker headers start with one of these courtesy forms.
_PREFIJOS_ORADOR = ("La señora", "El señor")

# Exact spellings mapped to the canonical speaker name.
_NOMBRES_CORREGIDOS = {
    "MORA GRAND": "MORA GRANDE",
    "FERRIZ GOMEZ": "FÉRRIZ GÓMEZ",
    "SEGOVIA BROM": "SEGOVIA BROME",
}


def _es_cabecera_orador(line):
    """Return True when *line* looks like a speaker header.

    A header contains "El señor"/"La señora" and is otherwise written in
    capitals: more than 9 upper-case and fewer than 10 lower-case characters.
    """
    return (
        ("El señor" in line or "La señora" in line)
        and sum(map(str.isupper, line)) > 9
        and sum(map(str.islower, line)) < 10
    )


def _nombre_orador(line):
    """Extract the normalised speaker name from a header line.

    The courtesy form is removed as a *prefix*. (str.strip() takes a set of
    characters and removes them from both ends, which also eats leading or
    trailing letters of the surname, e.g. "LÓPEZ" -> "ÓPEZ".) Anything after
    the first comma is dropped.
    """
    nombre = line.strip()
    for prefijo in _PREFIJOS_ORADOR:
        if nombre.startswith(prefijo):
            nombre = nombre[len(prefijo):]
            break
    nombre = nombre.split(",")[0].strip()

    # Repairs kept for names that arrive already truncated or unaccented.
    if nombre.startswith("ÓPEZ"):
        nombre = "L" + nombre
    if nombre.startswith("SPINOSA"):
        nombre = "E" + nombre
    if "RAMIREZ DE ARELLANO" in nombre:
        nombre = "RAMÍREZ DE ARELLANO"
    nombre = _NOMBRES_CORREGIDOS.get(nombre, nombre)
    if nombre.startswith("OPIS"):
        nombre = "LL" + nombre
    return nombre


# speaker name -> list of Intervencion objects, in order of appearance
intervenciones = {}
i = 0            # number of speaker headers seen so far
texto = ""       # text accumulated for the intervention being read
dialogo = False  # True while lines belong to a speech
persona = ""
inicio = None
fin = None
for n_pag, line in enumerate(document):
    if _es_cabecera_orador(line):
        if i > 0:
            # A new header closes the previous speaker's intervention.
            fin = n_pag - 1
            intervenciones.setdefault(persona, []).append(
                Intervencion(persona, inicio, texto, fin))
        i += 1
        dialogo = True
        persona = _nombre_orador(line)
        inicio = n_pag
        texto = ""
    elif i > 0:
        # NOTE(review): lines containing "11-19" are already dropped when
        # `document` is built, so the first marker presumably never matches.
        if "11-19/DL-000001" in line or "DVOT-000012" in line:
            dialogo = False
        if dialogo:
            texto += line + " "

# Interventions are stored when the *next* header appears, so the last
# speaker has to be flushed explicitly once the document is exhausted.
if i > 0:
    fin = len(document) - 1
    intervenciones.setdefault(persona, []).append(
        Intervencion(persona, inicio, texto, fin))

A continuación se puede ver cómo se guardan los datos de cada objeto `Intervencion`, con un ejemplo de la presidenta de la cámara, Bosquet Aznar:

In [None]:
# Print one stored intervention: speaker, first line, last line and text.
test = intervenciones["BOSQUET AZNAR"][1]
campos = (test.persona, test.inicio, test.fin, test.texto)
print(" \n ".join(str(campo) for campo in campos) + " \n")


Una vez que tenemos el cuerpo del PDF procesado y organizado, puede comenzar el análisis. Gracias al uso de objetos, podrá elegirse qué tipo de intervenciones se quieren analizar en cada momento.

## Análisis superficial del pleno

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import pandas as pd
py.init_notebook_mode(connected=False)

# Number of interventions recorded for each speaker.
count_interv = {
    nombre: len(lista)
    for nombre, lista in intervenciones.items()
}

In [None]:
# One row per speaker: a 'count' column plus the speaker name repeated as a
# regular 'persona' column (it is also the index).
int_count_df = pd.DataFrame.from_dict(count_interv, orient='index')
int_count_df.columns = ['count']
int_count_df = int_count_df.assign(persona=int_count_df.index)

A continuación creamos una lista con el conteo de intervenciones de cada diputado del parlamento:

In [None]:
# Load the ';'-separated party table and join it with the intervention counts
# on 'persona' (default inner join), then show the first ten rows.
df = pd.read_csv("partidos.csv", sep=";").merge(int_count_df, on='persona')
df.head(10)

Como resulta evidente, la presidenta de la cámara tiene, con diferencia, la mayor cantidad de intervenciones.

### Gráfica 1. Intervenciones/persona (Agrupado en partidos)

**1.1 Con la presidenta**

In [None]:
# Bar colour used for each party in the charts.
colors = dict(
    PSOE='red',
    CIUDADANOS='orange',
    VOX='lightgreen',
    ADELANTE='green',
    PP='blue',
    AXSI='gray',
)

In [None]:
#https://plot.ly/python/bar-charts/
import plotly.graph_objs as go
# Short x-axis label: first word of the name plus the initial of the second
# word, e.g. "BOSQUET A."
partes_nombre = df.persona.str.split()
df['apellido1'] = (
    partes_nombre.str.get(0)
    + " "
    + partes_nombre.str.get(1).str.slice(stop=1)
    + "."
)

# One bar trace per party, coloured with the party colour.
bars = [
    go.Bar(
        x=partido_df['apellido1'],
        y=partido_df['count'],
        text=partido_df['persona'],
        name=partido,
        marker={'color': colors[partido]},
    )
    for partido, partido_df in df.groupby('partido')
]

go.FigureWidget(data = bars)

Como era de esperar, Ciudadanos tiene el mayor número de intervenciones, pero esto se debe a que la presidenta de la cámara pertenece a este partido. Para nivelar los resultados, se la elimina del conjunto.

**1.2 Sin la presidenta:**

In [None]:
# Same chart as before, leaving out the president of the chamber.
sin_presidenta = df[df.persona != 'BOSQUET AZNAR']
bars_n = [
    go.Bar(
        x=partido_df['apellido1'],
        y=partido_df['count'],
        text=partido_df['persona'],
        name=partido,
        marker={'color': colors[partido]},
    )
    for partido, partido_df in sin_presidenta.groupby('partido')
]

# Font of the y-axis title.
fuente_eje = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)
layout = go.Layout(
    title=go.layout.Title(xref='paper', x=0),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(text='Nº Intervenciones', font=fuente_eje)
    )
)
fig = go.FigureWidget(data = bars_n,layout=layout)
py.iplot(fig, filename='grouped-bar')

Para analizar el lenguaje de cada uno de los participantes, unimos todos sus discursos en un único corpus. Esto se utilizará primero para hacer una comparación entre la gráfica de intervenciones y la de palabras.

In [None]:
import string
import re
from nltk.corpus import stopwords
from spacy.lang.es.stop_words import STOP_WORDS
from collections import Counter

#remover = str.maketrans(dict.fromkeys(string.punctuation))

#def remover_punctuacion(text):
#      return text.translate(remover)
    
def limpiar_texto(text):
    """Return *text* reduced to lower-case content words.

    The text is lower-cased, only runs of letters (ASCII plus the accented
    Spanish letters listed in the pattern) are kept, and every word found in
    the NLTK Spanish stop-word list or in spaCy's Spanish STOP_WORDS is
    removed. The surviving words are joined with single spaces.
    """
    # Build the stop-word set once per call instead of once per word.
    palabras_vacias = set(stopwords.words('spanish')) | set(STOP_WORDS)

    text = text.lower().replace('_', ' ')
    palabras = re.findall('[a-zA-ZáéíñóúüÁÉÍÑÓÚÜ]+', text)
    return ' '.join(p for p in palabras if p not in palabras_vacias)


# speaker -> list of cleaned words of all their speeches together
textos = {}
# speaker -> the same cleaned corpus as one space-separated string
texto_unsplitted = {}

# List of lists: every speaker gets one tokenised list per intervention.
textos_intervenciones = {}

for p, inters in intervenciones.items():
    textos_intervenciones[p] = [
        limpiar_texto(inter.texto).split() for inter in inters
    ]
    texto_unsplitted[p] = limpiar_texto("".join(inter.texto for inter in inters))
    textos[p] = texto_unsplitted[p].split()
    
    
    
#for inter in inters:
#    palabras = limpiar_texto(inter.texto).split()
#    if len(palabras)>20:
#        textos[p]+=' '.join(palabras)
#        textos_intervenciones[p].append(palabras)

#texto_unsplitted[p] = ''.join(textos[p])
#textos[p] = textos[p].split()

In [None]:
# Total number of (cleaned) words spoken by each person.
n_palabras = {
    persona: sum(Counter(texto).values())
    for persona, texto in textos.items()
}

In [None]:
# Word totals as a table, joined with the speaker/party table on 'persona'.
df_palabras = pd.DataFrame.from_dict(n_palabras, orient='index')
df_palabras.columns = ['palabras']
df_palabras = df_palabras.assign(persona=df_palabras.index)
df_palabras = df.merge(df_palabras, on='persona')
df_palabras.head(5)

In [None]:
# One bar trace per party with the number of words of each speaker.
bars_2 = [
    go.Bar(
        x=partido_df['apellido1'],
        y=partido_df['palabras'],
        text=partido_df['persona'],
        name=partido,
        marker={'color': colors[partido]},
    )
    for partido, partido_df in df_palabras.groupby('partido')
]

# Font of the y-axis title.
fuente_eje = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)
layout = go.Layout(
    title=go.layout.Title(xref='paper', x=0),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(text='Nº palabras', font=fuente_eje)
    )
)
go.FigureWidget(data = bars_2,layout=layout)

## ANÁLISIS LINGÜÍSTICO DE LAS INTERVENCIONES

In [None]:
from collections import Counter

# Characters turned into spaces before splitting into words. The backslash is
# written explicitly: "\[" and "\]" are invalid escape sequences in a normal
# string literal.
separadores_aplausos = str.maketrans({c: ' ' for c in '-.,\n\\[]¿?'})

# speaker -> number of times the word "aplausos" appears in their speeches
aplausos = {}
for p, inters in intervenciones.items():
    aplausos[p] = sum(
        inter.texto.translate(separadores_aplausos).lower().split().count("aplausos")
        for inter in inters
    )

# Join with the speaker/party table and leave out the president of the chamber.
df_aplausos = pd.DataFrame.from_dict(aplausos,orient='index')
df_aplausos.columns = ['aplausos']
df_aplausos['persona'] = df_aplausos.index
df_aplausos = pd.merge(df,df_aplausos,on='persona')
df_aplausos = df_aplausos[df_aplausos.persona != 'BOSQUET AZNAR']


In [None]:
from collections import Counter

# Characters turned into spaces before splitting into words. The backslash is
# written explicitly: "\[" and "\]" are invalid escape sequences in a normal
# string literal.
separadores_rumores = str.maketrans({c: ' ' for c in '-.,\n\\[]¿?'})

# speaker -> number of times the word "rumores" appears in their speeches
interrup = {}
for p, inters in intervenciones.items():
    interrup[p] = sum(
        inter.texto.translate(separadores_rumores).lower().split().count("rumores")
        for inter in inters
    )

# Add the counts to the applause table (president of the chamber excluded).
df_inter = pd.DataFrame.from_dict(interrup,orient='index')
df_inter.columns = ['interrupciones']
df_inter['persona'] = df_inter.index
df_inter = pd.merge(df_aplausos,df_inter,on='persona')
df_inter = df_inter[df_inter.persona != 'BOSQUET AZNAR']

# Totals per party.
df_sum = df_inter.groupby(by="partido").sum()
df_sum['partido'] = df_sum.index

interrupciones = go.Bar(
    y = df_sum['interrupciones'],
    x = df_sum['partido'],
    name = 'Interrupciones'
)
aplausos = go.Bar(
    y = df_sum['aplausos'],
    x = df_sum['partido'],
    name = 'Aplausos'
)

data = [interrupciones, aplausos]

# Grouped bars: interruptions and applause side by side for each party.
layout = go.Layout(
    barmode='group',
    yaxis = go.layout.YAxis(
        title = go.layout.yaxis.Title(
            text='nº veces',
            font=dict(
                size=15,
            ))))

a = go.FigureWidget(data=data,layout=layout)
a

In [None]:
## NO CONSIGO PONER ESTO CON UN SIZE DECENTE
from IPython.display import IFrame

from plotly import tools


# One subplot column per party trace: interventions (president excluded).
fig = tools.make_subplots(rows=1, cols=len(bars_n),
                          shared_xaxes=True, shared_yaxes=True,
                          vertical_spacing=0.001)
for columna, traza in enumerate(bars_n, start=1):
    fig.append_trace(traza, 1, columna)

# Same layout for the word counts. The grid is sized from `bars_2`, the list
# that is actually plotted, so every trace has a column.
fig2 = tools.make_subplots(rows=1, cols=len(bars_2),
                           shared_xaxes=True, shared_yaxes=True,
                           vertical_spacing=0.001)
for columna, traza in enumerate(bars_2, start=1):
    fig2.append_trace(traza, 1, columna)

fig['layout'].update(height=400, width=1000, title='Intervenciones')
fig2['layout'].update(height=400, width=1000, title='Wordcount')

py.iplot(go.FigureWidget(fig))
py.iplot(go.FigureWidget(fig2))

## Modelado de temática (topics)

In [None]:
import spacy
#pip install --upgrade gensim
from gensim.models import Phrases
from gensim import corpora
from gensim import models
import gensim
import numpy as np

# Load the `es_core_news_md` spaCy pipeline; `nlp` is used below to lemmatise text.
nlp = spacy.load('es_core_news_md')

In [None]:
# Sample text: the cleaned, concatenated speeches of one speaker.

inter_un = texto_unsplitted["CANO PALOMINO"]

In [None]:
# Lemmatisation: replace every token of the sample text by its spaCy lemma.

inter_un = [token.lemma_ for token in nlp(inter_un)]

In [None]:
#Palabras más repetidas

def showWordCloud(data):
    """Draw a word cloud of the words in *data* with matplotlib.

    data -- iterable of strings; they are joined with spaces and runs of
            whitespace are collapsed before the cloud is generated.
    """
    # Imported locally: at this point of the notebook neither pyplot nor
    # wordcloud has been imported yet.
    from matplotlib import pyplot as plt
    from wordcloud import WordCloud, STOPWORDS

    cleaned_word = " ".join(' '.join(data).split())
    # NOTE(review): wordcloud's STOPWORDS looks like an English list, so it
    # presumably filters little here — confirm whether a Spanish list is wanted.
    wordcloud = WordCloud(stopwords = STOPWORDS,
                          background_color = 'black',
                          width = 2500,
                          height = 2500
                          ).generate(cleaned_word)
    plt.figure(1,figsize = (13,13))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

showWordCloud(inter_un)

In [None]:
# Example input for topic modelling: one speaker's interventions, each one
# already cleaned and split into a list of words.

inter = textos_intervenciones["CANO PALOMINO"]

#inter = [l.lemma_ for l in nlp([i for i in inter])]     

In [None]:
# Keep only interventions with more than 20 words (20 or fewer are dropped).

inter = [palabras for palabras in inter if len(palabras) > 20]

In [None]:
# Find words that commonly appear together in the input and merge each such
# phrase into a single token, e.g. "señora magistrado" -> "señora_magistrado".

#tokens = data['tokens'].tolist()
bigram_model = Phrases(inter)   
# The second pass runs on the bigram-transformed documents.
trigram_model = Phrases(bigram_model[inter], min_count=1) 
tokens = list(trigram_model[bigram_model[inter]])

In [None]:
# Build the gensim dictionary from the tokenised interventions and encode
# every intervention as a bag of words.
dictionary_LDA = corpora.Dictionary(tokens)
#dictionary_LDA.filter_extremes(no_below=3)
corpus = list(map(dictionary_LDA.doc2bow, tokens))

In [None]:
# Seed NumPy's global RNG before training.
np.random.seed(123456)
num_topics = 5
# Train the LDA model (timed with the IPython %time magic); alpha and eta are
# given as explicit lists with one 0.01 entry per topic / per dictionary key.
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

In [None]:
# Print every topic with its 20 top words, followed by a blank line.
for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print("{}: {}".format(i, topic), end="\n\n")

In [None]:
# NOTE(review): the defaults for `corpus` and `texts` are bound to whatever the
# module-level names `corpus` and `data` hold when this definition runs; pass
# all three arguments explicitly.
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Return a table with the dominant topic of every document.

    ldamodel -- trained model; must support ``ldamodel[corpus]``,
                ``per_word_topics`` and ``show_topic``
    corpus   -- documents in the format the model accepts
    texts    -- one entry per document, appended as the last column

    The result has the columns 'Dominant_Topic', 'Perc_Contribution'
    (rounded to 4 decimals) and 'Topic_Keywords', followed by the texts.
    """
    filas = []

    # Get the main topic of each document.
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep row i aligned with text i even when no topic is reported.
            filas.append((float('nan'), float('nan'), ''))
            continue
        # Dominant topic = highest proportion (the first one wins on ties).
        topic_num, prop_topic = max(row, key=lambda x: x[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, prop in wp)
        filas.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # Build the frame once: appending row by row is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(
        filas, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


# Dominant topic of every tokenised intervention.
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, tokens)

# Turn the index into a document-number column and give the columns their
# final names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

In [None]:
# Display setting: show up to 100 characters per column.
pd.options.display.max_colwidth = 100

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For every dominant topic keep the single document with the highest contribution.
mejores_por_tema = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
    for _, grp in sent_topics_outdf_grpd
]
sent_topics_sorteddf_mallet = pd.concat([pd.DataFrame()] + mejores_por_tema, axis=0)

# Reset indices
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Final column names
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

In [None]:
# 1. Word cloud of the top words of each topic
from matplotlib import pyplot as plt
# This import must be active: WordCloud and STOPWORDS are used below.
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# color_func reads the module-level `i` when a cloud is drawn, i.e. the index
# of the topic currently handled by the loop below.
cloud = WordCloud(stopwords=STOPWORDS,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    if i >= len(topics):
        # Fewer topics than axes: leave the spare axes empty.
        ax.axis('off')
        continue
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
from collections import Counter
topics = lda_model.show_topics(formatted=False)
# Frequency of every token over all tokenised interventions.
data_flat = [w for w_list in tokens for w in w_list]
counter = Counter(data_flat)

# One row per (topic, keyword): weight inside the topic and raw frequency.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]

# Deliberately NOT called `df`: that name holds the speaker/party table, which
# later cells still query by `persona`.
df_topic_words = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(2, 2, figsize=(16,10), sharey=True, dpi=160)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    topic_rows = df_topic_words.loc[df_topic_words.topic_id==i, :]
    ax.bar(x='word', height="word_count", data=topic_rows, color=cols[i], width=0.5, alpha=0.3, label='Conteo de palabras')
    # Second y-axis for the keyword weights.
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=cols[i], width=0.2, label='Importancia')
    ax.set_ylabel('Conteo de palabras', color=cols[i])
    ax_twin.set_ylim(0, 0.030); ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Número de palabras e importancia de las palabras clave por tema', fontsize=22, y=1.05)    
plt.show()

In [None]:
#pip install pyldavis

%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

### Palabras más comunes por pares de partido

In [None]:
import nltk
partidos = ['CIUDADANOS', 'VOX', 'PP', 'PSOE', 'ADELANTE', 'AXSI']

# speaker -> party, looked up once (the first row wins if a name is repeated).
partido_de = {}
for nombre, partido in zip(df.persona, df.partido):
    partido_de.setdefault(nombre, partido)

# party -> list of cleaned words of all its speakers. The president of the
# chamber is left out, and speakers missing from the party table are skipped.
split_texto_partido = {partido: [] for partido in partidos}
for persona, palabras in textos.items():
    partido = partido_de.get(persona)
    if persona == 'BOSQUET AZNAR' or partido not in split_texto_partido:
        continue
    split_texto_partido[partido] += palabras

# party -> the same words as one string. Joining with spaces keeps the last
# word of one speaker from fusing with the first word of the next.
texto_partido = {
    partido: " ".join(palabras)
    for partido, palabras in split_texto_partido.items()
}

# party -> word trigrams ordered by frequency. The trigrams are built from
# the word lists: iterating a string would yield character trigrams.
trigrams_dict = {
    partido: Counter(nltk.trigrams(palabras)).most_common()
    for partido, palabras in split_texto_partido.items()
}

In [None]:
#python -m spacy download es_core_news_md

import scattertext as st
import spacy

nlp = spacy.load('es_core_news_md')

# One row per party: the concatenated text, its word list and the text parsed
# by spaCy. Both dictionaries are indexed by the same party key so the columns
# cannot get out of step.
partidos_corpus = list(texto_partido)
corpus_df = pd.DataFrame({
    'partido': partidos_corpus,
    'texto': [texto_partido[partido] for partido in partidos_corpus],
    'parsed': [split_texto_partido[partido] for partido in partidos_corpus],
})
corpus_df['parsed2'] = corpus_df.texto.apply(nlp)

# scattertext corpus with the party as category.
corpus = st.CorpusFromParsedDocuments(corpus_df, category_col='partido', parsed_col='parsed2').build()

In [None]:
# Display the per-party corpus table.
corpus_df

In [None]:
# Display the text stored under index label 1.
corpus_df.texto[1]

In [None]:
from scattertext import CorpusFromPandas, produce_scattertext_explorer

# Interactive scattertext comparison: VOX against all other parties.
html = produce_scattertext_explorer(corpus,
                                    category='VOX',
                                    category_name='Vox',
                                    not_category_name='Otros',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=10,
                                    transform=st.Scalers.scale,
                                    # alternative: transform=st.Scalers.percentile
                                    metadata=corpus_df['partido'])
file_name = 'output/plenoAndaluz.html'
# Context manager so the report is flushed and closed before it is embedded.
with open(file_name, 'wb') as salida:
    salida.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [None]:
# Two-party corpus: VOX vs. ADELANTE. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x.
comparison = pd.concat([corpus_df[corpus_df.partido == 'VOX'],
                        corpus_df[corpus_df.partido == 'ADELANTE']])
corpus = st.CorpusFromParsedDocuments(comparison, category_col='partido', parsed_col='parsed2').build()
html = produce_scattertext_explorer(corpus,
                                    category='VOX',
                                    category_name='Vox',
                                    not_category_name='Adelante',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.percentile)
file_name = 'output/vox-adelante.html'
# Context manager so the report is flushed and closed before it is embedded.
with open(file_name, 'wb') as salida:
    salida.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [None]:
# Preview the first rows of the per-party corpus table.
corpus_df.head()

In [None]:
# Two-party corpus: PSOE vs. ADELANTE. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x.
comparison = pd.concat([corpus_df[corpus_df.partido == 'PSOE'],
                        corpus_df[corpus_df.partido == 'ADELANTE']])
corpus = st.CorpusFromParsedDocuments(comparison, category_col='partido', parsed_col='parsed2').build()
html = produce_scattertext_explorer(corpus,
                                    category='PSOE',
                                    category_name='Psoe',
                                    not_category_name='Adelante',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.percentile)
# Own output file so the VOX/Adelante report is not overwritten.
file_name = 'output/psoe-adelante.html'
with open(file_name, 'wb') as salida:
    salida.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [None]:
# Two-party corpus: PSOE vs. CIUDADANOS. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x.
comparison = pd.concat([corpus_df[corpus_df.partido == 'PSOE'],
                        corpus_df[corpus_df.partido == 'CIUDADANOS']])
corpus = st.CorpusFromParsedDocuments(comparison, category_col='partido', parsed_col='parsed2').build()
html = produce_scattertext_explorer(corpus,
                                    category='PSOE',
                                    category_name='Psoe',
                                    not_category_name='Cs',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.percentile)
# Own output file so the VOX/Adelante report is not overwritten.
file_name = 'output/psoe-cs.html'
with open(file_name, 'wb') as salida:
    salida.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [None]:
# Two-party corpus: CIUDADANOS vs. ADELANTE. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x.
comparison = pd.concat([corpus_df[corpus_df.partido == 'CIUDADANOS'],
                        corpus_df[corpus_df.partido == 'ADELANTE']])
corpus = st.CorpusFromParsedDocuments(comparison, category_col='partido', parsed_col='parsed2').build()
html = produce_scattertext_explorer(corpus,
                                    category='CIUDADANOS',
                                    category_name='Cs',
                                    not_category_name='Adelante',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.percentile)
# Own output file so the VOX/Adelante report is not overwritten.
file_name = 'output/cs-adelante.html'
with open(file_name, 'wb') as salida:
    salida.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [None]:
# Two-party corpus: CIUDADANOS vs. PP. The concatenated frame is assigned
# back to `comparison`; otherwise the PP rows never reach the corpus.
comparison = pd.concat([corpus_df[corpus_df.partido == 'CIUDADANOS'],
                        corpus_df[corpus_df.partido == 'PP']])
corpus = st.CorpusFromParsedDocuments(comparison, category_col='partido', parsed_col='parsed2').build()
html = produce_scattertext_explorer(corpus,
                                    category='CIUDADANOS',
                                    category_name='Cs',
                                    not_category_name='PP',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    max_terms=1000,
                                    transform=st.Scalers.percentile,
                                    # metadata must describe the documents of
                                    # THIS corpus, i.e. the rows of `comparison`
                                    metadata=comparison['partido'])
file_name = 'output/cs-pp.html'
# Context manager so the report is flushed and closed before it is embedded.
with open(file_name, 'wb') as salida:
    salida.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [None]:
from scattertext import word_similarity_explorer
# Word-similarity explorer on the current two-party corpus (built from
# `comparison`). The category must be a value of the 'partido' column, and the
# metadata must come from the frame the corpus was built from.
# TODO(review): 'empleo' is a placeholder target term — choose the word to
# explore and confirm it occurs in the corpus.
# NOTE(review): `nlp` is passed so the Spanish model loaded earlier supplies
# the word vectors — confirm against the scattertext version in use.
html = word_similarity_explorer(corpus,
                                category='CIUDADANOS',
                                category_name='Cs',
                                not_category_name='PP',
                                target_term='empleo',
                                nlp=nlp,
                                minimum_term_frequency=5,
                                pmi_threshold_coefficient=4,
                                width_in_pixels=1000,
                                metadata=comparison['partido'],
                                alpha=0.01,
                                max_p_val=0.05,
                                save_svg_button=True)