# Configuramos variables de ambiente

In [None]:
import dotenv

config = dotenv.dotenv_values(".env")

In [None]:
# Pull the settings this notebook needs out of the values loaded from `.env`.
# `api_key` is handed to `cohere.Client(...)` further down and
# `embeddings_pathname` is the CSV path later passed to `pd.read_csv`.
# NOTE(review): a key missing from `.env` presumably raises KeyError here.
api_key = config['API_KEY']
embeddings_pathname = config['EMBEDDINGS_PATH']

## Experimento: Usar textos propios para generar embeddings

### Generar textos

In [None]:
# https://www.diffen.com/difference/Democrat_vs_Republican

In [None]:
import pandas as pd
comparison = pd.read_csv('comparison_chart.csv')
comparison.head()

In [None]:
def generate_prompts(data):
    """Add one text-generation prompt per party to every row of ``data``.

    ``data`` must expose the columns ``Topics``, ``Democrats`` and
    ``Republicans``. Two columns are added in place, ``democrats-prompts``
    and ``republicans-prompts``, and the same DataFrame is returned.

    Both prompts are built from the same template so that the only
    difference between them is the stance text (the original Republican
    prompt contained a duplicated word, "taking taking").
    """
    request = 'Write a reddit submission of no more than 10k chars about: '
    stance = '. You should be taking the following stance: '
    data['democrats-prompts'] = request + data.Topics + stance + data.Democrats
    data['republicans-prompts'] = request + data.Topics + stance + data.Republicans
    return data

In [None]:
data = generate_prompts(comparison)

In [None]:
import cohere
import time

def generate_text(prompts):
    """Generate one text per prompt with Cohere's ``command`` model.

    ``prompts`` is any iterable of prompt strings. Returns a list with the
    first generation for each prompt, in the same order. An empty input
    returns an empty list without creating a client.

    The function pauses 62 seconds after every request, including the last
    one, so that two consecutive calls to this function stay spaced out as
    well (presumably to respect a per-minute rate limit).
    """
    prompts = list(prompts)
    if not prompts:
        return []

    # One client is enough for the whole batch; no need to rebuild it per prompt.
    co = cohere.Client(api_key)

    generations = []
    for prompt in prompts:
        response = co.generate(
            model='command',
            prompt=f'\'{prompt}\'',  # the prompt is sent wrapped in single quotes
            max_tokens=300,
            temperature=0.9,
            k=0,
            stop_sequences=[],
            return_likelihoods='NONE')

        generations.append(response.generations[0].text)
        time.sleep(62)

    return generations

In [None]:
# Generate one text per prompt for each party. This is slow: generate_text
# pauses after every request.
democrats_generations = generate_text(data['democrats-prompts'])
republicans_generations = generate_text(data['republicans-prompts'])

# Put prompts, generated texts and topic side by side, one row per topic.
df = pd.DataFrame({
    'democrats-prompts': data['democrats-prompts'],
    'republicans-prompts': data['republicans-prompts'],
    'democrats-generations': democrats_generations,
    'republicans-generations': republicans_generations,
    'topic': comparison.Topics,
})
df.head()

In [None]:
def clean_text(df, column):
    """Flatten the text in ``df[column]`` to a single line, in place.

    Every newline and carriage return is replaced by a space and the
    surrounding whitespace is stripped. Nothing is returned; the column of
    ``df`` is overwritten.
    """
    flattened = df[column].str.replace('\n', ' ')
    flattened = flattened.str.replace('\r', ' ')
    df[column] = flattened.str.strip()
clean_text(df,'democrats-generations')     
clean_text(df,'republicans-generations')

In [None]:
df.head()

In [None]:
df.to_csv('generations.csv', index=False)

### Generar embeddings

In [None]:
import cohere
from tqdm import tqdm

def _embed_with_retry(client, texts, wait_seconds=62):
    """Embed ``texts`` with ``client``, retrying once after a pause on failure."""
    import time  # local import: do not depend on another notebook cell having run

    try:
        return client.embed(texts).embeddings
    except Exception:
        # Presumably a rate-limit error: wait and try exactly once more.
        # A second failure propagates to the caller.
        time.sleep(wait_seconds)
        return client.embed(texts).embeddings


def generate_cohere_embedding(data, text_key, group_key):
    """Embed the texts of ``data`` with Cohere and return them as a DataFrame.

    Parameters:
        data: DataFrame with one text per row.
        text_key: name of the column holding the text to embed.
        group_key: name of the column whose value labels each embedding.

    Returns:
        A DataFrame with one row per embedded text, indexed by the
        ``group_key`` value of that row. Columns are numbered from 0 up to
        the embedding dimension returned by the API (not hard-coded).

    Rows whose text is missing or empty are skipped. Texts read back from a
    CSV come as NaN when empty, which is a float and has no ``len()``.
    Requests are sent in batches of 41 texts, and every batch, including the
    final partial one, is retried once on failure.
    """
    co = cohere.Client(api_key)  # put your API key here (loaded from .env)

    embeddings = []
    group_used = []
    batch = []
    for _, row in tqdm(data.iterrows()):
        text = row[text_key]
        if not isinstance(text, str) or not text:
            continue
        batch.append(text)
        group_used.append(row[group_key])
        # Same threshold as before: flush once the batch holds more than 40 texts.
        if len(batch) > 40:
            embeddings.extend(_embed_with_retry(co, batch))
            batch = []

    # Flush whatever is left after the last full batch.
    if batch:
        embeddings.extend(_embed_with_retry(co, batch))

    return pd.DataFrame(embeddings, index=group_used)

In [None]:
data = pd.read_csv('generations.csv')

In [None]:
data.head()

In [None]:
republican_data = pd.DataFrame({'text': data['republicans-generations'], 'topic': data['topic']})
republican_data

In [None]:
# Embed the Republican texts; each resulting row is labelled with its topic.
republicans_embeddings = generate_cohere_embedding(republican_data, 'text', 'topic')
# Label-based slice: keeps the rows from the first one through
# 'Stance on Immigration' (inclusive) and writes them, index included, to CSV.
# NOTE(review): presumably this cutoff is meant to match the rows kept for the
# Democrat texts — confirm.
republicans_embeddings.loc[:'Stance on Immigration'].to_csv('republican-generated-embeddings.csv')

In [None]:
# Pair each Democrat-generated text with its topic.
democrat_data = pd.DataFrame({'text': data['democrats-generations'], 'topic': data['topic']})
# Keep only the first 11 rows.
# NOTE(review): the reason for the cutoff at 11 is not visible here — confirm it
# matches the rows kept for the Republican embeddings.
democrat_data = democrat_data.iloc[:11]

In [None]:
# Embed the Democrat texts; each resulting row is labelled with its topic.
democrats_embeddings = generate_cohere_embedding(democrat_data, 'text', 'topic')
# The index (topic labels) must be written to the file: it is read back later
# as the 'Unnamed: 0' column to build the 'democrats: <topic>' row labels.
democrats_embeddings.to_csv('democrat-generated-embeddings.csv')

### Generar dimensiones

In [None]:
import pandas as pd

embeddings_2018 = pd.read_csv(embeddings_pathname, index_col=0)

In [None]:
from experiments.ranking import arxiv_waller_ranking

In [None]:
# Fetch the reference ranking once instead of once per index label
# (assumes arxiv_waller_ranking() returns the same content on every call).
waller_ranking = arxiv_waller_ranking()
# One boolean per row of embeddings_2018: True when the row label is in the ranking.
filter_list = [idx in waller_ranking for idx in embeddings_2018.index]

In [None]:
pd.Series(filter_list).value_counts()

In [None]:
embeddings_2018 = embeddings_2018[filter_list]

In [None]:
embeddings_2018.head()

In [None]:
# Load the embeddings of the generated texts for both parties.
# No index_col is passed, so the labels saved as the CSV's first column come
# back as a regular column named 'Unnamed: 0', which the following cells use
# to build the new row labels.
democrats_embeddings_generated = pd.read_csv('democrat-generated-embeddings.csv')
republicans_embeddings_generated = pd.read_csv('republican-generated-embeddings.csv')

In [None]:
democrats_embeddings_generated.head()

In [None]:
democrats_embeddings_generated['new_index'] = 'democrats: ' + democrats_embeddings_generated['Unnamed: 0'] 

In [None]:
republicans_embeddings_generated['new_index'] = 'republicans: ' + republicans_embeddings_generated['Unnamed: 0'] 

In [None]:
republicans_embeddings_generated = republicans_embeddings_generated.set_index('new_index')
democrats_embeddings_generated = democrats_embeddings_generated.set_index('new_index')

In [None]:
republicans_embeddings_generated = republicans_embeddings_generated.drop('Unnamed: 0', axis=1)
democrats_embeddings_generated = democrats_embeddings_generated.drop('Unnamed: 0', axis=1)

In [None]:
embeddings = pd.concat([republicans_embeddings_generated, democrats_embeddings_generated, embeddings_2018])

In [None]:
from experiments.experiment import Experiment

In [None]:
experiment = Experiment(None, '', None, None, None)

In [None]:
import pandas as pd
comparison = pd.read_csv('comparison_chart.csv')
comparison.head()

In [None]:
# One (democrat label, republican label) pair per topic. The labels use the
# same 'democrats: <topic>' / 'republicans: <topic>' format given to the rows
# of the generated embeddings.
seeds = [(f'democrats: {topic}', f'republicans: {topic}') for topic in comparison.Topics]

In [None]:
def embeddings_for(embeddings, seed):
    """Return the rows of ``embeddings`` relevant to one seed pair.

    A row is kept when its index label is either in the list returned by
    ``arxiv_waller_ranking()`` or is one of the two labels in ``seed``
    (``seed[0]`` and ``seed[1]``).
    """
    # Build the allow-list once rather than once per row.
    allowed = arxiv_waller_ranking() + [seed[0], seed[1]]
    filter_list = [idx in allowed for idx in embeddings.index]
    return embeddings[filter_list]

In [None]:
scores = [experiment.get_scores(embeddings_for(embeddings, seed), seeds=[seed]) for seed in seeds]

In [None]:
from experiments.ranking import Ranking

In [None]:
# Per-topic ranking metrics, keyed by topic name.
metrics = {}
# NOTE(review): kendall and p_value are initialised but never filled in the
# visible code; kept in case other cells rely on them.
kendall = {}
p_value = {}

# Fetch the reference ranking once instead of once per score entry
# (assumes arxiv_waller_ranking() returns the same content on every call).
waller_ranking = arxiv_waller_ranking()

for i, topic in enumerate(comparison.Topics):
    # scores[i] holds the scores computed with the seed pair of this topic.
    ranking_scores = scores[i].to_dict(orient='dict')['dem_rep']
    # Drop the seed rows: only entries present in the reference ranking are evaluated.
    ranking_scores = {k: v for k, v in ranking_scores.items() if k in waller_ranking}
    ranking = Ranking(ranking_scores)
    metrics[topic] = pd.DataFrame(ranking.evaluate_ranking_metrics())

In [None]:
# Stack the per-topic metric frames and renumber the rows 0..n-1.
df = pd.concat(list(metrics.values())).reset_index(drop=True)
# Assigned by index alignment: row i receives comparison.Topics[i].
# NOTE(review): the labels are only correct if every per-topic frame
# contributes exactly one row — confirm.
df['topic'] = comparison.Topics

In [None]:
df

In [None]:
relevant_topics = df[df['p-value'] < 0.05]
relevant_topics

In [None]:
index_to_topic = relevant_topics['topic'].to_dict()

In [None]:
# Invert index_to_topic so a topic name can be mapped back to its row index.
topic_to_index = {topic: index for index, topic in index_to_topic.items()}
topic_to_index

### Relevant Topics

In [None]:
from experiments.ranking import Ranking

In [None]:
def ranking_with_seeds(topic, scores, topic_to_index):
    """Return the scores of ``topic`` sorted by ascending ``dem_rep``.

    ``topic_to_index`` maps the topic name to its position in ``scores``.
    All rows of that entry are kept, seed rows included.
    """
    return scores[topic_to_index[topic]].sort_values('dem_rep')

In [None]:
def ranking_without_seeds(topic, scores, topic_to_index):
    """Return the scores of ``topic`` without seed rows, sorted by ``dem_rep``.

    ``topic_to_index`` maps the topic name to its position in ``scores``.
    Only rows whose index label appears in ``arxiv_waller_ranking()`` are
    kept; the result is sorted by ascending ``dem_rep``.
    """
    topic_data = scores[topic_to_index[topic]]
    # Fetch the reference ranking once rather than once per row.
    waller_ranking = arxiv_waller_ranking()
    keep = [label in waller_ranking for label in topic_data.index]
    return topic_data[keep].sort_values('dem_rep')

In [None]:
def show_ranking_bump_plot(data):
    """Build a ``Ranking`` from ``data`` and return the result of its bump plot."""
    return Ranking.from_pandas(data).bump_plot()

def show_ranking_violin_plot(data):
    """Build a ``Ranking`` from ``data`` and return the result of its violin plot."""
    return Ranking.from_pandas(data).violin_plot()

In [None]:
def compare_by_topic(topic):
    """Print and plot the rankings obtained for ``topic``.

    Prints the ranking including the seed rows, then the ranking without
    them. The ranking without seeds is drawn as a bump plot and as a violin
    plot, and the violin plot is saved as ``<topic>.png``.

    Reads the notebook-level ``scores`` and ``topic_to_index``.
    """
    with_seeds = ranking_with_seeds(topic, scores, topic_to_index)
    print(with_seeds)

    without_seeds = ranking_without_seeds(topic, scores, topic_to_index)
    print(without_seeds)

    show_ranking_bump_plot(without_seeds)
    violin = show_ranking_violin_plot(without_seeds)
    violin.savefig(f'{topic}.png')

In [None]:
# Compare every topic whose ranking passed the p-value filter. The name
# `remarkable_topics` is never defined in this notebook; the filtered frame is
# `relevant_topics`, which is also the source of `topic_to_index`.
for topic in relevant_topics['topic']:
    compare_by_topic(topic)