# Load Data

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import gensim  # para cargar modelo w2v
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import plot_model

In [None]:
# Read the dataset, keeping only the 'polarity' and 'text' columns.
data = pd.read_csv(
    "deceptive-opinion.csv",
    usecols=['polarity', 'text'],
)
# Preview the first ten rows.
data.head(10)

In [None]:

# Strip punctuation, digits and any other character that is not a-z
def remove_puntuacion(text):
    """Replace every run of characters outside ``a-z`` with a single space.

    Only lowercase ASCII letters survive, so uppercase or accented letters
    are replaced too; lower-case ``text`` before calling this function.
    Leading or trailing non-letters leave a space at that end of the result.
    """
    non_letters = re.compile('[^a-z]+')
    return non_letters.sub(' ', text)

In [None]:
# Lower-case the reviews first, because the cleaning step keeps only a-z.
data['text'] = data['text'].str.lower()
data['text_p'] = data['text'].map(remove_puntuacion)
# Length of each cleaned review, counted in whitespace-separated words.
data['text_len'] = data['text_p'].str.split().map(len)
data.head(10)
     

In [None]:
# Get Vocabulary
def vocabulary(pandas_series):
    """Return the distinct words of ``pandas_series`` ordered by frequency.

    Every element is split on arbitrary whitespace, so leading, trailing or
    repeated spaces never yield an empty-string "word", and an empty input
    gives an empty vocabulary.

    Args:
        pandas_series: Iterable of strings, e.g. a pandas Series of reviews.

    Returns:
        List of unique words, most frequent first. Words with equal counts
        keep the order in which they were first seen.
    """
    word_counts = Counter(
        word for sentence in pandas_series for word in sentence.split()
    )
    return [word for word, _ in word_counts.most_common()]

In [None]:
# Build the vocabulary of the cleaned reviews (column 'text_p') and display it.
vocabulary_words = vocabulary(data['text_p'])
vocabulary_words

In [None]:
# Number of unique words in the vocabulary
len(vocabulary_words)

In [None]:
# Map words to tokens
def get_dict_map(token_or_tag):
    """Build the lookup tables between vocabulary entries and integer indices.

    Args:
        token_or_tag: ``'token'`` to index the words of ``data['text_p']``
            (with ``'PAD_token'`` placed first, i.e. at index 0), or
            ``'tag'`` to index the entries of ``data['Tags']``.

    Returns:
        Tuple ``(tok2idx, idx2tok)``: a dict from entry to index and a dict
        from index to entry.

    Raises:
        ValueError: If ``token_or_tag`` is neither ``'token'`` nor ``'tag'``.
    """
    if token_or_tag == 'token':
        # 'PAD_token' is added to the token vocabulary so that it can be used
        # later to give every sentence the same number of tokens.
        vocab = ['PAD_token'] + [
            token for token in vocabulary(data['text_p']) if token is not None
        ]
    elif token_or_tag == 'tag':
        # NOTE(review): this branch needs a 'Tags' column in `data`; the
        # read_csv call in this file loads only 'polarity' and 'text' --
        # confirm the column is added elsewhere before using it.
        vocab = vocabulary(data['Tags'])
    else:
        # Fail with a clear message instead of an UnboundLocalError on `vocab`.
        raise ValueError(
            f"token_or_tag must be 'token' or 'tag', got {token_or_tag!r}"
        )

    idx2tok = dict(enumerate(vocab))
    tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


# Build the token <-> index maps, then print the index assigned to
# 'PAD_token' and the total number of entries as a quick sanity check.
token2idx, idx2token = get_dict_map('token')
print(token2idx['PAD_token'])
print(len(token2idx))   


In [None]:
# Encode each cleaned review as the list of vocabulary indices of its words;
# a word missing from token2idx is encoded as None.
data['text_idx'] = data['text_p'].apply(
    lambda review: [token2idx.get(word) for word in review.split()]
)

In [None]:
# Histogram of review lengths (column 'text_len'), using 40 bins
data['text_len'].hist(bins=40) 

In [None]:
# Review-length percentiles: 5th, 10th, ..., 100th.
custom_percentiles = list(range(5, 101, 5))
custom_percentile_values = np.percentile(data['text_len'], custom_percentiles)

# Tabulate each percentile next to its value and show the table.
percentile_table = pd.DataFrame({'Percentile': custom_percentiles})
percentile_table['Value'] = custom_percentile_values
print(percentile_table)

In [None]:
# Define the maximum length: the 0.95 quantile of the review lengths,
# truncated to an integer.
max_len = int(data['text_len'].quantile(0.95))
max_len

In [None]:
# Preview the first ten rows of the DataFrame.
data.head(10)

In [None]:

# Show the whole 'text_idx' column
print("Contents of data['text_idx']:")
print(data['text_idx'])

# Scan every row for a missing (None) value, either the cell itself or an
# element of its list, and report each offending row.
print("\nConcise check for None within lists in data['text_idx']:")
has_none = False
for index, text_indices in data['text_idx'].items():
    if text_indices is None:
        # The cell itself is None rather than a list.
        print(f"The entire entry at DataFrame index {index} is None")
        has_none = True
        continue
    if not isinstance(text_indices, list):
        continue
    if not any(value is None for value in text_indices):
        continue
    # At least one element of the list is None: show the cleaned text too.
    print(f"List at DataFrame index {index} contains None values.")
    print(data.loc[index]['text_p'])
    has_none = True

if not has_none:
    print("\nNo None values found within the lists in data['text_idx'].")

In [None]:
# Inspect a single row (index label 403): its text and its encoded indices.
#data.loc[407]['text_idx']
print(data.loc[403]['text'])
print(data.loc[403]['text_idx'])

In [None]:
# Padding: bring the encoded reviews to length max_len, filling with the
# 'PAD_token' index using padding='post'.
# NOTE(review): no `truncating` argument is passed, so reviews longer than
# max_len are cut according to the library default -- confirm which end of
# the review is meant to be dropped.
pad_tokens = pad_sequences(
    data['text_idx'],
    maxlen=max_len,
    padding='post',
    value=token2idx['PAD_token'],
    dtype='int32',
)