In [23]:
import pandas as pd
from unicodedata import normalize
import re
import os
import csv
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.stem.snowball import SnowballStemmer
import pickle
import itertools
import lda

# The comment-cleaning step folds every comment to plain ASCII (accents are
# stripped), so the stop-word list has to be folded the same way. Otherwise
# accented entries (e.g. "más") can never equal a cleaned token and those
# words leak into the vocabulary. Folding can make two entries collide
# (e.g. "él" -> "el"), hence the set; sorting keeps the list deterministic.
stopwords = sorted({
    normalize('NFKD', word).encode('ASCII', 'ignore').decode('ASCII')
    for word in nltk.corpus.stopwords.words('spanish')
})
# Shared Spanish stemmer used by the custom vectorizer analyzers.
stemmer = SnowballStemmer("spanish")

In [24]:
# Load the municipality table, discarding the final row of the CSV, and pull
# out the column whose values are used as Facebook page identifiers.
munis = pd.read_csv('./data/municipios.csv').iloc[:-1]
page_ids = munis.loc[:, 'pagina de facebook']

In [25]:
# Gather the per-page comment dumps into one frame.
#
# The frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies everything accumulated so far on every
# iteration, which is quadratic in the number of pages.
# The empty seed frame is kept as the first element so the result still has
# the expected columns (and concat does not raise) when no page file exists.
page_frames = [pd.DataFrame({'comentario': [], 'creado': [], 'muni': []})]
for page_id in page_ids:
    page_csv = './data/%s.csv' % page_id
    if not os.path.isfile(page_csv):
        # Pages without a downloaded dump are skipped silently.
        continue
    page_df = pd.read_csv(page_csv, encoding='utf-8')
    page_df['muni'] = page_id  # tag every comment with the page it came from
    page_frames.append(page_df)

fullDf = pd.concat(page_frames)
del page_frames  # release the per-page frames; only fullDf is needed below

In [26]:
# Discard every row that has a missing value in any column.
fullDf = fullDf.dropna(axis=0, how='any')

In [27]:
def _clean_comment(text):
    """Normalize one raw comment into lowercase ASCII letters and spaces.

    Steps, in order:
      1. drop tokens starting with ``http``, ``www`` or ``facebook``;
      2. fold accents (NFKD) and discard anything that is not ASCII;
      3. lowercase;
      4. replace every run of non-letter, non-space characters with a space;
      5. collapse any character repeated three or more times down to two.

    Returns a text string. The ASCII bytes are decoded back to text so the
    regular expressions below also work on Python 3, where applying a str
    pattern to bytes raises TypeError.
    """
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    text = re.sub(r'facebook\S+', '', text)
    text = normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII').lower()
    # Raw string: the original '\ ' was an invalid escape sequence that only
    # worked by accident; a plain space inside the class is equivalent.
    text = re.sub(r'[^A-Za-z ]+', ' ', text)
    return re.sub(r'(.)\1+', r'\1\1', text)


# Single pass over the column instead of six chained .apply calls.
fullDf['comentario'] = fullDf['comentario'].apply(_clean_comment)

# dropna returns a new frame; the result has to be assigned to take effect.
fullDf = fullDf.dropna(how='any')

# Keep comments whose cleaned length is strictly between 20 and 400
# characters, then shuffle the rows once (the shuffle is unseeded).
comment_lengths = fullDf['comentario'].map(len)
fullDf = fullDf[(comment_lengths > 20) & (comment_lengths < 400)].sample(frac=1)

fullDf.to_csv('./data/full.csv', index=False, encoding='utf-8')
print(len(fullDf))

446659


In [28]:
# Number of comments sampled from fullDf for vectorization; the value is
# also embedded in the names of the CSV and pickle files written below.
muestra = 400000

In [29]:
# Draw `muestra` rows at random (without replacement), save the sampled rows
# to disk, and keep only the comment text for the vectorizers.
sample_csv = './data/20-400-%s.csv' % muestra
sampled_rows = fullDf.sample(n=muestra)
sampled_rows.to_csv(sample_csv, index=False, encoding='utf-8')
comentarios = sampled_rows['comentario']

In [30]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every word of every term.

    The stock analyzer is run first; each term it yields is then re-tokenized
    with ``nltk.word_tokenize``, each word is stemmed with the module-level
    ``stemmer``, and the stems are joined back with single spaces.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed terms."""
        # super() must name *this* class. Naming the parent (TfidfVectorizer)
        # starts the lookup after it in the MRO and would silently bypass any
        # build_analyzer the parent defines.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()

        def stem(doc):
            return [
                " ".join(stemmer.stem(word) for word in nltk.word_tokenize(term))
                for term in analyzer(doc)
            ]

        return stem
    
# TF-IDF features over stemmed unigrams and bigrams.
tfvectorizer = StemmedTfidfVectorizer(
    max_df=0.9,
    min_df=5,
    max_features=10000,
    stop_words=stopwords,
    ngram_range=(1, 2),
)
tvz = tfvectorizer.fit_transform(comentarios)

# Persist the fitted vectorizer and the resulting matrix. The files are opened
# in `with` blocks so the handles are flushed and closed deterministically
# instead of being left to the garbage collector.
# NOTE: StemmedTfidfVectorizer is defined in this notebook, so the class must
# be defined again (under the same name) before the vectorizer pickle can be
# loaded elsewhere.
with open("./data/tvz-20-400-%s.pickle" % muestra, "wb") as fh:
    pickle.dump(tfvectorizer, fh)
with open("./data/tvzm-20-400-%s.pickle" % muestra, "wb") as fh:
    pickle.dump(tvz, fh)

In [31]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every word of every term.

    The stock analyzer is run first; each term it yields is then re-tokenized
    with ``nltk.word_tokenize``, each word is stemmed with the module-level
    ``stemmer``, and the stems are joined back with single spaces.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed terms."""
        # super() must name *this* class. Naming the parent (CountVectorizer)
        # starts the lookup after it in the MRO and would silently bypass any
        # build_analyzer the parent defines.
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stem(doc):
            return [
                " ".join(stemmer.stem(word) for word in nltk.word_tokenize(term))
                for term in analyzer(doc)
            ]

        return stem
    
# Raw term counts over stemmed unigrams and bigrams.
cvectorizer = StemmedCountVectorizer(
    max_df=0.9,
    min_df=5,
    max_features=10000,
    stop_words=stopwords,
    ngram_range=(1, 2),
)
cvz = cvectorizer.fit_transform(comentarios)

# Persist the fitted vectorizer and the resulting matrix. The files are opened
# in `with` blocks so the handles are flushed and closed deterministically
# instead of being left to the garbage collector.
# NOTE: StemmedCountVectorizer is defined in this notebook, so the class must
# be defined again (under the same name) before the vectorizer pickle can be
# loaded elsewhere.
with open("./data/cvz-20-400-%s.pickle" % muestra, "wb") as fh:
    pickle.dump(cvectorizer, fh)
with open("./data/cvzm-20-400-%s.pickle" % muestra, "wb") as fh:
    pickle.dump(cvz, fh)