# 1. Imports

In [4]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import copy

#Preprocessing
import re
import nltk

def check_nltk_packages():
    """Download the NLTK resources used by this notebook if they are missing.

    Each resource is looked up under its own NLTK data category. Looking
    everything up under ``tokenizers/`` (as was done before) only ever finds
    ``punkt``; the corpora were reported missing and re-downloaded on every run.
    """
    # Download identifier -> path that ``nltk.data.find`` must be given.
    resources = {
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
        'omw-1.4': 'corpora/omw-1.4',
        'wordnet': 'corpora/wordnet',
    }

    for package, resource_path in resources.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            # Only fetch what is not already available locally.
            nltk.download(package)


check_nltk_packages()

try:
  import lxml
except ModuleNotFoundError:
  %pip install lxml

try:
  import contractions
except ModuleNotFoundError:
  %pip install contractions
  import contractions

from bs4 import BeautifulSoup

from nltk.tokenize import wordpunct_tokenize, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

#Vectorization
from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases
from gensim.models import TfidfModel
from gensim.matutils import corpus2dense, corpus2csc
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import LineSentence
from sklearn.manifold import TSNE
from scipy.sparse import csr_array, lil_array, save_npz, load_npz

#Transformers
from transformers import RobertaTokenizer, RobertaModel
import torch
from tqdm import tqdm

# NN
from torch import optim, nn

#Tesauros
from nltk.corpus import wordnet



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 2. Data loading

In [None]:
# Locations of the source spreadsheet and of its parquet copy.
path_to_folder = '/data'
excel_file_path = path_to_folder + '/projects.xlsx'
parquet_file_path = path_to_folder + '/projects.parquet'

# Read the Excel file and write it out as parquet.
# The parquet file only needs to be created once.
df_projects = pd.read_excel(excel_file_path)
df_projects.to_parquet(parquet_file_path, engine='pyarrow')

# Once the parquet file exists it can be loaded instead of the Excel file,
# which is faster:
# parquet_file_path = path_to_folder + '/projects.parquet'
# df_projects = pd.read_parquet(parquet_file_path, engine='pyarrow')

# Working DataFrame used by the following cells: one 'raw_text' column
# holding "<title> <summary>" for every project.
df = pd.DataFrame()
df['raw_text'] = df_projects['title'] + ' ' + df_projects['summary']
print(df.head())

# 3. Preprocessing

In [None]:
def prepare_data(text):
    """Clean a raw text and return its list of lower-case lemmas.

    Pipeline: strip HTML markup, remove URLs, expand contractions, tokenize,
    lower-case, keep alphanumeric tokens only, remove English stopwords,
    lemmatize, and remove stopwords once more.

    Args:
        text: Raw document text; may contain HTML tags and URLs.

    Returns:
        list[str]: Lemmatized tokens with stopwords and non-alphanumeric
        tokens removed.
    """
    cleaned = BeautifulSoup(text, "lxml").get_text()  # strip HTML tags
    # Remove URLs. The scheme may be http or https; matching only "https://"
    # left plain-http links in the text to be tokenized into noise.
    cleaned = re.sub(r'https?://\S+|www\.\S+', '', cleaned)
    cleaned = contractions.fix(cleaned)  # expand contractions

    tokens = [token.lower() for token in wordpunct_tokenize(cleaned)]
    tokens = [token for token in tokens if token.isalnum()]  # drop punctuation/symbols

    # A set gives O(1) membership tests instead of scanning a list per token.
    stopwords_en = set(stopwords.words('english'))

    # Stopwords are removed BEFORE lemmatizing: the noun lemmatizer mangles
    # some of them (e.g. "was" -> "wa"), after which they no longer match the
    # stopword list and would leak into the output.
    tokens = [token for token in tokens if token not in stopwords_en]

    wnl = WordNetLemmatizer()
    lemmas = [wnl.lemmatize(token) for token in tokens]

    # Second pass keeps the previous guarantee that no returned lemma is a
    # stopword (a lemma can itself be one).
    return [lemma for lemma in lemmas if lemma not in stopwords_en]


# Apply the preprocessing and store the result in a new column.
df['lemmas'] = df['raw_text'].apply(prepare_data)

# 3. (Extra) Thesaurus

In [None]:
# Si se desea aplicar esta tecnica, se deben ejecutar todas las celdas posteriores de nuevo
# para la columna 'lemmas_thesaurus' del dataframe en lugar de la columna 'lemmas'

def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` in a reproducible order.

    Lemma names are collected from every synset of ``word``, lower-cased and
    de-duplicated. They are returned in first-seen order (synset order, then
    lemma order within the synset) rather than in set order: set iteration
    order for strings can change between interpreter runs, which made any
    "first N synonyms" slice taken by callers non-reproducible.

    Args:
        word: Token to look up in WordNet.

    Returns:
        list[str]: Unique lower-case lemma names; empty if WordNet has no
        synset for ``word``.
    """
    # dict keys act as an insertion-ordered set.
    ordered_names = dict.fromkeys(
        lemma.name().lower()
        for syn in wordnet.synsets(word)
        for lemma in syn.lemmas()
    )
    return list(ordered_names)

def expand_text_with_synonyms(text):
    """Preprocess ``text`` and expand every token with WordNet synonyms.

    The text goes through ``prepare_data`` (the same pipeline used for the
    'lemmas' column) instead of a duplicated copy of it, so both columns are
    always preprocessed identically.

    Each token is kept and followed by at most ``max_synonyms_per_token``
    synonyms different from the token itself. Keeping the token matters:
    otherwise any token without a WordNet entry would disappear from the
    document.

    Args:
        text: Raw document text.

    Returns:
        list[str]: Original tokens interleaved with their synonyms.
    """
    max_synonyms_per_token = 5

    expanded_tokens = []
    for token in prepare_data(text):
        expanded_tokens.append(token)
        # Exclude the token itself so it is not counted against the limit
        # nor duplicated.
        synonyms = [synonym for synonym in get_synonyms(token) if synonym != token]
        expanded_tokens.extend(synonyms[:max_synonyms_per_token])
    return expanded_tokens


# Create a new column with the expanded lemmas.
df['lemmas_thesaurus'] = df.raw_text.apply(expand_text_with_synonyms)

# 4. Vectorization

## 4.1. TF-IDF

In [None]:
class IterableCorpus_fromdf:
    """Re-iterable corpus over the ``lemmas`` column of a DataFrame.

    Every call to ``iter()`` starts a fresh pass over the column and yields
    one document (the cell value) at a time, so the corpus can be traversed
    several times without copying the documents into a new list.
    """

    def __init__(self, df):
        # DataFrame that must expose a 'lemmas' column.
        self.__df = df

    def __iter__(self):
        # Iterating a Series yields its values in row order.
        yield from self.__df.lemmas

# Dictionary pruning thresholds.
no_below = 4    # minimum number of documents a word must appear in to be kept
no_above = .80  # maximum fraction of documents a word may appear in to be kept

# Build the dictionary from the iterable corpus, then prune it.
MyIterCorpus = IterableCorpus_fromdf(df)
D = Dictionary(MyIterCorpus)
D.filter_extremes(no_below=no_below, no_above=no_above)

# Bag-of-words per document, then the TF-IDF representation.
bow = list(map(D.doc2bow, MyIterCorpus))
tfidf = TfidfModel(bow)
reviews_tfidf = tfidf[bow]
df['emb_TFIDF'] = reviews_tfidf

# Sparse matrix used by the regression cells further down.
# The corpus2csc result is transposed before use.
num_docs = len(bow)
n_tokens = len(D)
corpus_tfidf_sparse = corpus2csc(
    reviews_tfidf,
    num_terms=n_tokens,
    num_docs=num_docs,
).T

## 4.2. Word2Vec

In [None]:
#Obtencion de la representacion word2vec

def lemmas_to_line(row):
    """Return the tokens stored under ``row['lemmas']`` as one space-separated string."""
    separator = ' '
    tokens = row['lemmas']
    return separator.join(tokens)

# Word2Vec hyper-parameters.
vector_size = 200
window = 5
min_count = 10
seed = 42
sg = 1

# One space-joined line per document, then split back into token lists
# (the input format passed to Word2Vec below).
iterable_sentences = [lemmas_to_line(row) for _, row in df.iterrows()]
tokenized_sentences = list(map(str.split, iterable_sentences))

model_w2v = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    seed=seed,
    workers=4,
)

# Keyed vectors (word -> embedding) of the trained model.
wv = model_w2v.wv

#Convertir en matriz sparse como promedio de los embeddings de las palabras que forman cada texto
def get_review_vector(model, review):
    """Return the mean embedding of ``review`` as a sparse array.

    Args:
        model: Word-vector lookup supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute.
        review: Iterable of tokens.

    Returns:
        ``csr_array`` holding the element-wise mean of the vectors of the
        tokens known to ``model``; an all-zero array of length
        ``model.vector_size`` when no token is known.
    """
    # Out-of-vocabulary tokens are ignored.
    known_vectors = [model[token] for token in review if token in model]

    if len(known_vectors) == 0:
        return csr_array(np.zeros(model.vector_size))

    mean_vector = np.mean(known_vectors, axis=0)
    return csr_array(mean_vector)

# Document-by-embedding matrix: row i holds the mean Word2Vec vector of
# document i. A LIL array is used while rows are being assigned.
embedding_size = wv.vector_size
corpus_word2vec_sparse = lil_array((len(df), embedding_size), dtype=np.float32)

for i, (_, row) in enumerate(df.iterrows()):
    review_tokens = row['lemmas']
    review_vector_sparse = get_review_vector(wv, review_tokens)
    corpus_word2vec_sparse[i, :] = review_vector_sparse

# Convert to CSR once every row has been filled.
corpus_word2vec_sparse = csr_array(corpus_word2vec_sparse)

## 4.3. Transformers

In [None]:
# Model selection: pretrained RoBERTa tokenizer and encoder.
# NOTE: the name `model` is reassigned to the regression network in section 5.3.
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# One array of embeddings is appended per batch.
review_embeddings_roberta = []

# Encode the raw (non-preprocessed) texts in batches of `batch_size`.
batch_size = 8
for i in tqdm(range(0, len(df['raw_text']), batch_size)):
    batch_reviews = df['raw_text'][i:i+batch_size]
    # Tokenization: texts in the batch are truncated and padded so they can be
    # stacked into PyTorch tensors.
    tokens = tokenizer(batch_reviews.tolist(), return_tensors='pt', truncation=True, padding=True)
    # Move every tokenizer output tensor to the same device as the model.
    tokens = {key: val.to(device) for key, val in tokens.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**tokens)

    # Keep the hidden state at sequence position 0 of every text as its
    # embedding, moved back to the CPU as a NumPy array.
    cls_embedding = output.last_hidden_state[:, 0, :].cpu().numpy()
    review_embeddings_roberta.append(cls_embedding)


# Stack the per-batch arrays into a single NumPy matrix (one row per text).
review_embeddings_roberta = np.vstack(review_embeddings_roberta)

# 5. Regression

## 5.1. Train/test sets definition

In [None]:
from ast import literal_eval

from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

# --- Target: number of publications plus number of patents per project ---
# Missing ids become the string '[]' so that every cell parses to a list.
# The filled column is assigned back explicitly: calling
# `df.col.fillna(..., inplace=True)` acts on an intermediate object and is not
# guaranteed to modify the DataFrame (it does not under pandas copy-on-write).
df_projects['patentID'] = df_projects['patentID'].fillna('[]')
df_projects['publicationID'] = df_projects['publicationID'].fillna('[]')

# Row index=6623 has a typo in publicationID (the closing "']" is missing).
# Repair every cell that does not end with ']' through a single `.loc` write
# instead of chained `df.col[index] = ...` assignment.
unterminated = ~df_projects['publicationID'].astype(str).str.endswith(']')
df_projects.loc[unterminated, 'publicationID'] = (
    df_projects.loc[unterminated, 'publicationID'] + "']"
)

# The cells hold list literals; `literal_eval` parses them without executing
# arbitrary code read from the spreadsheet (unlike `eval`).
y = [
    len(literal_eval(publications)) + len(literal_eval(patents))
    for publications, patents in zip(df_projects['publicationID'], df_projects['patentID'])
]

# --- Features: pick ONE embedding by uncommenting the desired line ---
# Word2Vec is the default; any of the others can be selected instead.
# embeddings = corpus_tfidf_sparse.toarray()
embeddings = corpus_word2vec_sparse.toarray()
# embeddings = review_embeddings_roberta
# The three options below use names that are not defined in the cells above;
# they presumably come from re-running section 4 on 'lemmas_thesaurus' or on
# a dense variant — TODO confirm before enabling.
# embeddings = np.asarray(corpus_word2vec_dense)
# embeddings = corpus_tfidf_sparse_thesaurus.toarray()
# embeddings = corpus_word2vec_sparse_thesaurus.toarray()

# Encode the coordinator country as one integer per project: binarize the
# labels, then read each binary row as a base-2 number.
# NOTE(review): MultiLabelBinarizer iterates over each cell; if the cells are
# plain strings rather than lists of countries, the labels are individual
# characters — confirm the column format.
mlb = MultiLabelBinarizer()
labels_binary = mlb.fit_transform(df_projects['coordinatorCountry'])
numeros_decimales = np.dot(labels_binary, 2 ** np.arange(labels_binary.shape[1])[::-1])
matriz_columna = numeros_decimales.reshape(-1, 1)
df_projects['coordinatorCountry_bin'] = matriz_columna

# --- Standardization ---
scaler = StandardScaler()
y = np.array(y)[:, np.newaxis]

# Uncomment the lines below to add more spreadsheet columns as features.
# The target stays as the LAST column so it can be split off after scaling.
data = np.hstack((embeddings,
                  # np.array(df_projects.totalCost)[:,np.newaxis],
                  # np.array(df_projects.ecMaxContribution)[:,np.newaxis],
                  # np.array(df_projects.rcn)[:,np.newaxis],
                  # np.array(df_projects.coordinatorCountry_bin)[:,np.newaxis],
                  y))
# NOTE(review): the scaler is fitted on ALL rows (features and target) before
# the train/test split, so test-set statistics influence the scaling, and the
# reported losses are in standardized target units.
scaler.fit(data)
data_scaled = scaler.transform(data)

# --- Train and test subsets ---
X = data_scaled[:, :-1]
y = data_scaled[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def sparse_matrix2sparse_tensor(sm):
    """Convert a SciPy sparse matrix into a float32 PyTorch sparse COO tensor.

    Uses ``torch.sparse_coo_tensor`` instead of the deprecated
    ``torch.sparse.FloatTensor`` constructor, and stacks the index arrays in
    NumPy first rather than building a tensor from a Python list of arrays.

    Args:
        sm: SciPy sparse matrix/array (anything providing ``tocoo()``).

    Returns:
        torch.Tensor: Sparse COO tensor with the shape of ``sm`` and dtype
        ``torch.float32``.
    """
    coo = sm.tocoo()
    # Row 0 holds the row indices, row 1 the column indices; int64 is the
    # index dtype used for sparse COO tensors.
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data.astype(np.float32))
    shape = torch.Size(coo.shape)
    return torch.sparse_coo_tensor(indices, values, shape)


# Convert the train/test splits to float32 Torch tensors.
# Sparse alternative, currently disabled in favour of the dense tensors below:
# X_train_torch = sparse_matrix2sparse_tensor(X_train)
# X_test_torch = sparse_matrix2sparse_tensor(X_test)
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
X_test_torch = torch.tensor(X_test, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.float32)
y_test_torch = torch.tensor(y_test, dtype=torch.float32)

# Number of input features, used as the input size of the network in 5.3.
input_dim=X_train.shape[1]

## 5.2. RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Only the first 5000 texts are used to keep the run time short; more of the
# available texts can be used if desired.
# NOTE: this rebinds X_train/X_test/y_train/y_test; the Torch tensors built in
# 5.1 were created from the full-data split and are not affected.
X_train, X_test, y_train, y_test = train_test_split(X[:5000,:], y[:5000], test_size=0.2, random_state=42)


# Regression model.
rf_model = RandomForestRegressor(n_estimators=120, max_features='sqrt', random_state=42, verbose=2, n_jobs=-1)

# Train the model.
rf_model.fit(X_train, y_train)

# Predict on the test subset.
y_pred = rf_model.predict(X_test)

# Cross-validation on the training subset.
# 'accuracy' is a classification metric and cannot score a continuous target,
# so a regression metric is used. scikit-learn reports it negated
# ("higher is better"); the sign is flipped back so each fold's value is an
# MSE, comparable with the test MSE printed below.
cv_scores = -cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Report the model performance.
mse = mean_squared_error(y_test, y_pred)
print(f"Error cuadrático medio: {mse}")
print(f'Puntuación validación cruzada: {cv_scores}')
print(f'Media validación cruzada: {cv_scores.mean()}')
print(f'Desviación estándar: {cv_scores.std()}')

## 5.3. Neural Network

In [None]:
# Fix the PyTorch RNG seed before the layers are created.
torch.manual_seed(42)

# Network definition: a stack of fully connected layers that narrows from
# `input_dim` features down to a single regression output.
# Some dropout layers are commented out because several configurations were
# tried with them.
# NOTE: this rebinds the name `model`, previously the RoBERTa encoder (4.3).
model = nn.Sequential(
    nn.Linear(input_dim, 240),
    nn.BatchNorm1d(240), # Use this layer only in the cases specified in the project report
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(240, 120),
    nn.ReLU(),
    # nn.Dropout(0.5),
    nn.Linear(120, 60),
    nn.ReLU(),
    # nn.Dropout(0.8),
    nn.Linear(60, 30),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(30, 15),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(15, 7),
    nn.ReLU(),
    nn.Linear(7, 1)
)

# Loss function (mean squared error) and optimizer (Adam with weight decay).
loss=nn.MSELoss()
optimizers=optim.Adam(params=model.parameters(), lr = 0.01, weight_decay=1e-5)


# Check whether a GPU is available and move the data, model and loss to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Dispositivo utilizado:", device)

X_train_torch = X_train_torch.to(device)
y_train_torch = y_train_torch.to(device)
X_test_torch = X_test_torch.to(device)
y_test_torch = y_test_torch.to(device)
model = model.to(device)
loss = loss.to(device)

# Training: full-batch gradient descent, one optimizer step per epoch.
num_of_epochs = 500
loss_train = np.zeros(num_of_epochs)
loss_test = np.zeros(num_of_epochs)

# Early-stopping state.
# NOTE(review): the stopping criterion and the "best" weights are selected on
# the test split, so the reported best test loss is optimistic.
best_test_loss = float('inf')
patience = 150              # epochs without improvement tolerated before stopping
early_stop_counter = 0
epoch_early_stop = num_of_epochs
best_epoch = 0
best_weights = None

for epoch in range(num_of_epochs):
    # Forward pass on the training data (dropout/batch-norm in training mode).
    model.train()
    y_train_prediction = model(X_train_torch)
    loss_value = loss(y_train_prediction.squeeze(), y_train_torch.squeeze())
    # Store a plain Python float: the loss tensor tracks gradients and may
    # live on the GPU, so it must not be written into a NumPy array directly.
    loss_train[epoch] = loss_value.item()

    # Back-propagation and weight update.
    optimizers.zero_grad()
    loss_value.backward()
    optimizers.step()

    # Test loss for the current epoch (evaluation mode, no gradients).
    model.eval()
    with torch.no_grad():
        y_test_prediction = model(X_test_torch)
        loss_test[epoch] = loss(y_test_prediction.squeeze(), y_test_torch.squeeze()).item()

    # Early-stopping bookkeeping: snapshot the weights on every improvement.
    if loss_test[epoch] < best_test_loss:
        best_test_loss = loss_test[epoch]
        best_epoch = epoch
        early_stop_counter = 0
        # Deep copy: state_dict() returns references to the live parameters.
        best_weights = copy.deepcopy(model.state_dict())
    else:
        early_stop_counter += 1

    if early_stop_counter >= patience:
        print(f'Early stopping at epoch {epoch}')
        epoch_early_stop = epoch
        break

    # Progress report every 20 epochs.
    if epoch % 20 == 0:
        print(f'[epoch:{epoch}]: Training loss={loss_train[epoch]}, Test loss={loss_test[epoch]}')


# Final evaluation with the best weights found during training.
with torch.no_grad():
    model.load_state_dict(best_weights)  # restore the best snapshot
    model.eval()
    y_test_prediction = model(X_test_torch)
    test_loss = loss(y_test_prediction.squeeze(), y_test_torch)
print(f'Best test loss value : {best_test_loss:.4f} obtainted at epoch {best_epoch}')

# Train/test loss curves up to (and including) the last epoch that was run.
last_epoch = epoch_early_stop + 1
plt.figure(figsize=(14,5))
plt.subplot(1, 2, 1)
plt.plot(loss_train[0:last_epoch], 'b')
plt.plot(loss_test[0:last_epoch], 'r')
plt.legend(['train', 'test'])
plt.title('MSE loss')
plt.show()