In [1]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools
import string

# Configure the root logger: INFO level, "timestamp : level : message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# Load the dataset CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv("../files/reddit_mobile_operators_peru_opinions5.csv")

In [3]:
def limpiar_texto(texto):
    """Clean a raw text and return its lemmatized tokens joined by single spaces.

    Pipeline: lowercase, split on whitespace, strip leading/trailing
    punctuation from each token, drop tokens that contain a digit, drop
    English stop words, drop empty tokens, POS-tag the remainder, lemmatize
    each token with WordNet using the tag mapped by ``get_wordnet_pos``, and
    finally drop lemmas that are a single character long.

    Args:
        texto: The text to clean. ``None`` and float NaN (the missing-value
            marker pandas uses for empty cells) are treated as empty text.

    Returns:
        str: The cleaned text; ``""`` when nothing survives the filters.

    NOTE(review): the stop-word list is the English one and ``pos_tag`` is
    called with its default settings -- confirm this matches the language of
    the corpus being cleaned.
    """
    # Missing values would otherwise fail on `.lower()`.
    if texto is None or (isinstance(texto, float) and texto != texto):
        return ""

    # A set gives O(1) membership tests instead of scanning the list per token.
    stop = set(stopwords.words('english'))

    # `split()` (no argument) splits on any whitespace run, so newlines and
    # tabs do not end up embedded inside tokens as they did with `split(" ")`.
    stripped = (word.strip(string.punctuation) for word in texto.lower().split())
    tokens = [
        token for token in stripped
        if token
        and token not in stop
        and not any(c.isdigit() for c in token)
    ]
    if not tokens:
        # Nothing left to tag; skip the tagger and lemmatizer entirely.
        return ""

    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # Drop one-letter lemmas and join everything back into a single string.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

# Draw a word cloud for the given data
def show_wordcloud(data, title = None):
    """Render a word cloud built from ``str(data)`` and show it with pyplot.

    Args:
        data: Any object; its ``str()`` representation is the text fed to
            the word cloud generator.
        title: Optional figure title. When truthy it is set as the figure's
            suptitle and the subplot ``top`` parameter is adjusted.

    Returns:
        None. The figure is displayed via ``plt.show()``.

    NOTE(review): ``WordCloud`` is not imported in the import cell visible in
    this notebook -- confirm ``from wordcloud import WordCloud`` runs before
    this function is called.
    """
    cloud_settings = {
        'background_color': 'white',
        'max_words': 200,
        'max_font_size': 40,
        'scale': 3,
        'random_state': 42,  # fixed so the layout is repeatable
    }
    cloud = WordCloud(**cloud_settings).generate(str(data))

    figure = plt.figure(1, figsize = (20, 20))
    plt.axis('off')

    if title:
        figure.suptitle(title, fontsize = 20)
        figure.subplots_adjust(top = 2.3)

    plt.imshow(cloud)
    plt.show()
    

# Tagging of nouns, verbs, adjectives or adverbs
def get_wordnet_pos(pos_tag):
    """Map a part-of-speech tag string to the matching WordNet POS constant.

    The tag is matched by its prefix: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag falls back to noun.

    Args:
        pos_tag: POS tag string (only ``startswith`` is called on it).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
# Train/validation/test split helper (to test the models and improve them)
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=101):
    """Shuffle the rows of ``df`` and split them into train/validation/test.

    Rows are selected by position, so every row of ``df`` lands in exactly
    one of the three outputs even when the index contains repeated labels
    (label-based selection would return each duplicated label's rows several
    times).

    Args:
        df: DataFrame to split.
        train_percent: Fraction of rows for the training set.
        validate_percent: Fraction of rows for the validation set. The test
            set receives whatever rows remain.
        seed: Seed passed to ``np.random.seed`` before shuffling.

    Returns:
        tuple: ``(train, validate, test)`` DataFrames.

    Raises:
        ValueError: If either fraction is negative or the two fractions add
            up to more than 1.
    """
    # Negative fractions would turn into from-the-end slices, and a sum above
    # 1 would silently truncate the validation set and empty the test set.
    if train_percent < 0 or validate_percent < 0:
        raise ValueError("train_percent and validate_percent must be non-negative")
    if train_percent + validate_percent > 1 + 1e-9:
        raise ValueError("train_percent + validate_percent must not exceed 1")

    # Note: this reseeds NumPy's global random state (kept from the original
    # implementation, so code that runs afterwards sees the same RNG state).
    np.random.seed(seed)
    m = len(df.index)
    perm = np.random.permutation(m)  # shuffled row positions, not labels
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [4]:
# df.shape is (rows, columns); reversed so the column count fills the first placeholder.
print("El df tiene un total de {} columnas y un total de {} registros".format(*reversed(df.shape)))

El df tiene un total de 15 columnas y un total de 3718 registros


In [6]:
# Display the column labels of `df`.
df.columns

Index(['post_id', 'post_author', 'comment_id', 'comment_author', 'post_title',
       'post_created_utc', 'link_flair_text', 'selftext', 'subreddit',
       'upvote_ratio', 'comment', 'comment_score', 'comment_created_utc',
       'operadora', 'mobile_operator_sentiment'],
      dtype='object')