### Preprocess

In [1]:
%%capture
!pip install -r requirements.txt --user

In [1]:
# Directory (relative to the working directory) that is added to the import
# path so that the `MyModule` package can be imported below.
path_to_MyModule = '..'

import sys
# Inserted at position 0 so it is searched before every other import location.
sys.path.insert(0, path_to_MyModule) 

import pandas as pd

# NOTE(review): star imports. Names used later in this notebook without an
# explicit import (`re`, `SentenceTransformer`, `Preprocess`, `string_to_tuple`)
# presumably come from these modules -- confirm, and prefer explicit imports.
from MyModule.GeneralFunctions import *
from MyModule.SummarizationFunctions import *
from MyModule.SamplingFunctions import *

In [2]:
# Load the raw data and keep only the three columns used in this notebook.
# The path uses a forward slash: the previous '..\datos.xlsx' contained the
# invalid escape sequence '\d' and only resolved on Windows, while '../' works
# on Windows and POSIX alike.
df = pd.read_excel('../datos.xlsx')[['ID','texto','desafio']]

In [3]:
# Cleaning.
# Written so that the cell can be re-executed without changing the result.

# Drop rows whose text repeats an earlier row (the first occurrence is kept).
# `.copy()` gives an independent frame so the column assignments below are safe.
df = df.drop_duplicates(subset='texto').copy()

# Strip the extra text from column "desafio": keep only the first run of digits.
# `astype(str)` lets numeric cells through as well; rows without any digit are
# reported explicitly instead of failing with an opaque IndexError/TypeError.
desafio_number = df['desafio'].astype(str).str.extract(r'([0-9]+)', expand=False)
if desafio_number.isna().any():
    raise ValueError(
        '{} row(s) have no number in column "desafio"'.format(int(desafio_number.isna().sum()))
    )
df['desafio'] = desafio_number

# Make sure every text is a str.
df['texto'] = df['texto'].astype(str)

# Renumber the rows 0..n-1. On the first run the previous row labels are kept
# in an 'index' column (as before); on later runs that column already exists,
# so the index is simply dropped instead of adding 'level_0' or raising.
df = df.reset_index(drop='index' in df.columns)

### Create target variable: 
True if the pair comes from the same desafio, false otherwise

In [5]:
# Build every unordered pair of document IDs (each pair is a 2-tuple)
import itertools

all_ids = df['ID'].values
id_pairs = list(itertools.combinations(all_ids, 2))

In [6]:
# Create target: 1 if both IDs are from the same desafio, 0 otherwise.
#
# Each ID is mapped to its desafio once, so every pair costs two dictionary
# lookups instead of re-filtering `df` for every desafio. This also guarantees
# exactly one target value per pair, keeping `target` aligned with `id_pairs`.
desafio_by_id = {}
for doc_id, doc_desafio in zip(df['ID'].values, df['desafio'].values):
    # If an ID were repeated, the first row wins.
    # NOTE(review): IDs are assumed to be unique -- confirm.
    desafio_by_id.setdefault(doc_id, doc_desafio)

target = [
    int(desafio_by_id[id1] == desafio_by_id[id2])
    for id1, id2 in id_pairs
]

In [7]:
# One row per pair of IDs with its target.
# Building the frame from a dict of columns avoids creating a 2 x N frame and
# transposing it, keeps `target` as an integer column instead of `object`, and
# raises if the two lists do not have the same length instead of padding.
df_predic = pd.DataFrame({'id_pairs': id_pairs, 'target': target})

# Are these two ideas from the same desafio?

## Sentence embedding feature

Three feature alternatives for sentence embedding:

1. A single column (tuples of (1, 768) dimensions)
2. One column for each of the 768 dimensions
3. Reduce dimension with PCA or Lasso/Ridge

In [8]:
# Identifier of the sentence-embedding model to load
embedding_model_name = 'hiiamsid/sentence_similarity_spanish_es'
model = SentenceTransformer(embedding_model_name)

In [6]:
# Texts of all documents, in the row order of `df`.
original_documents = df['texto'].values

# Preprocessor created with all four flags set to False.
# NOTE(review): `Preprocess` is not imported explicitly in this notebook;
# presumably it comes from one of the MyModule star imports -- confirm.
pp_object = Preprocess(lemma=False, stopwords=False, alphanumeric=False, join=False)

pp_documents = pp_object.preprocess(original_documents)

# Encode the preprocessed documents with the sentence-embedding model.
# Later cells index `emb_docs` and `pp_documents` by row position of `df`.
emb_docs = model.encode(pp_documents)

In [10]:
# Add as features the embedding of each document in the pair.
#
# Embeddings are looked up through a dictionary built once (ID -> embedding)
# instead of filtering `df` with a boolean mask twice per pair. The mapping is
# positional (i-th ID <-> i-th row of `emb_docs`), so it does not depend on the
# index labels of `df` matching row positions.
emb_by_id = {}
for doc_id, doc_emb in zip(df['ID'].values, emb_docs):
    # If an ID were repeated, the first row wins (same choice as `.index[0]`).
    emb_by_id.setdefault(doc_id, doc_emb)

first_doc_emb = []
second_doc_emb = []

for pair1, pair2 in df_predic['id_pairs'].values:
    first_doc_emb.append(emb_by_id[pair1])
    second_doc_emb.append(emb_by_id[pair2])

df_predic['first_doc_emb'] = first_doc_emb
df_predic['second_doc_emb'] = second_doc_emb

## Sentiment Analysis feature

In [4]:
# Reload the pair table from disk.
# NOTE(review): 'embedding_df.xlsx' is not written by any cell visible here;
# a fresh run needs that file to exist already -- confirm where it is saved.
df_predic = pd.read_excel('embedding_df.xlsx')

# Recover the ID pairs as tuples
df_predic['id_pairs'] = df_predic['id_pairs'].apply(string_to_tuple)

In [7]:
# Compute the three sentiment values of every document, keyed by document ID
from MyModule.SentimentAnalysisFunctions import sentiment_analyzer_3d

analyzer = sentiment_analyzer_3d()
all_emotions = {}

doc_ids = df['ID'].values
n_docs = len(doc_ids)

for i, this_id in enumerate(doc_ids):
    # Progress trace: one line per document
    print('Doing {} of {}'.format(i, n_docs))
    # `pp_documents[i]` is the preprocessed text at the same row position as this ID
    all_emotions[this_id] = analyzer.predict_sentiment_3d(pp_documents[i])

loading configuration file config.json from cache at C:\Users\Felipe/.cache\huggingface\hub\models--pysentimiento--robertuito-sentiment-analysis\snapshots\e3be95c8efad7f480ce8aab2221188ecb78e40f3\config.json
Model config RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-sentiment-analysis",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEG",
    "1": "NEU",
    "2": "POS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEG": 0,
    "NEU": 1,
    "POS": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type"

Doing 0 of 505
Doing 1 of 505
Doing 2 of 505
Doing 3 of 505
Doing 4 of 505
Doing 5 of 505
Doing 6 of 505
Doing 7 of 505
Doing 8 of 505
Doing 9 of 505
Doing 10 of 505
Doing 11 of 505
Doing 12 of 505
Doing 13 of 505
Doing 14 of 505
Doing 15 of 505
Doing 16 of 505
Doing 17 of 505
Doing 18 of 505
Doing 19 of 505
Doing 20 of 505
Doing 21 of 505
Doing 22 of 505
Doing 23 of 505
Doing 24 of 505
Doing 25 of 505
Doing 26 of 505
Doing 27 of 505
Doing 28 of 505
Doing 29 of 505
Doing 30 of 505
Doing 31 of 505
Doing 32 of 505
Doing 33 of 505
Doing 34 of 505
Doing 35 of 505
Doing 36 of 505
Doing 37 of 505
Doing 38 of 505
Doing 39 of 505
Doing 40 of 505
Doing 41 of 505
Doing 42 of 505
Doing 43 of 505
Doing 44 of 505
Doing 45 of 505
Doing 46 of 505
Doing 47 of 505
Doing 48 of 505
Doing 49 of 505
Doing 50 of 505
Doing 51 of 505
Doing 52 of 505
Doing 53 of 505
Doing 54 of 505
Doing 55 of 505
Doing 56 of 505
Doing 57 of 505
Doing 58 of 505
Doing 59 of 505
Doing 60 of 505
Doing 61 of 505
Doing 62 of 505
Do

In [11]:
# Add as features the three sentiment values of each document in the pair.
#
# Positions 0 / 1 / 2 of every entry of `all_emotions` are stored as
# pos / neg / neu respectively.
# NOTE(review): the model config printed by the analyzer lists its labels as
# 0=NEG, 1=NEU, 2=POS -- confirm that `predict_sentiment_3d` really returns
# its values in (pos, neg, neu) order.
pair_ids = df_predic['id_pairs'].values

first_doc_pos = [all_emotions[first_id][0] for first_id, second_id in pair_ids]
first_doc_neg = [all_emotions[first_id][1] for first_id, second_id in pair_ids]
first_doc_neu = [all_emotions[first_id][2] for first_id, second_id in pair_ids]

second_doc_pos = [all_emotions[second_id][0] for first_id, second_id in pair_ids]
second_doc_neg = [all_emotions[second_id][1] for first_id, second_id in pair_ids]
second_doc_neu = [all_emotions[second_id][2] for first_id, second_id in pair_ids]

# Columns are added in the same order as before
df_predic['first_doc_pos'] = first_doc_pos
df_predic['first_doc_neg'] = first_doc_neg
df_predic['first_doc_neu'] = first_doc_neu

df_predic['second_doc_pos'] = second_doc_pos
df_predic['second_doc_neg'] = second_doc_neg
df_predic['second_doc_neu'] = second_doc_neu


In [None]:
# Length of each document

# Reload the pair table from disk.
# NOTE(review): 'embedding_sent_df.xlsx' is not written by any cell visible
# here; a fresh run needs that file to exist already -- confirm where it is saved.
df_predic = pd.read_excel('embedding_sent_df.xlsx')

# Recover the ID pairs as tuples
df_predic['id_pairs'] = df_predic['id_pairs'].apply(string_to_tuple)