In [None]:
import pandas as pd
import feather
from sklearn.svm import OneClassSVM

Load secop data

In [None]:
# Read the pickled SECOP table, discard the `urlproceso` column and renumber
# the rows from 0.
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
secop = (
    pd.read_pickle('../data/secop_union_all.pickle')
    .drop(columns=['urlproceso'])
    .reset_index(drop=True)
)

Basic text preprocessing

In [None]:
import gensim.parsing.preprocessing as gsp
from toolz import pipe
from nltk.corpus import stopwords

In [None]:
# Spanish stopword list from NLTK, kept under its original name and type (list).
stopset = stopwords.words('spanish')

# Hashed copy used for membership tests: scanning the list for every token costs
# O(len(stopset)) per word. It is built once here, so rebuild it if `stopset`
# is modified afterwards.
_stopset_lookup = frozenset(stopset)


def remove_stopwords_spanish(s):
    """Return `s` without the whitespace-separated tokens that appear in `stopset`.

    Tokens are compared exactly (case-sensitive); the surviving tokens are
    re-joined with single spaces.
    """
    # NOTE(review): nothing in this pipeline lowercases the text, so a
    # capitalized stopword survives if the stopword list is lowercase - confirm
    # that this is intended.
    return ' '.join(w for w in s.split() if w not in _stopset_lookup)


# Filters applied left to right by `preproc_function`:
#   1. strip non-alphanumeric characters,
#   2. drop tokens shorter than 4 characters,
#   3. collapse runs of whitespace,
#   4. remove Spanish stopwords.
preproc_filters = [
    gsp.strip_non_alphanum,
    lambda s: gsp.strip_short(s, minsize=4),
    gsp.strip_multiple_whitespaces,
    remove_stopwords_spanish,
]


def preproc_function(s):
    """Run `s` through every filter in `preproc_filters`, in order, and return the result."""
    return pipe(s, *preproc_filters)

In [None]:
# Smoke test of the preprocessing pipeline on a made-up string that mixes short
# tokens, digits and punctuation; the cell displays the cleaned result.
test = 'asdgasgasg 45 63 gahfdh 43623642.asdhgasdh.fdh.sdfh. sdfgh.dfsh. dsf. dfh.;;l'
preproc_function(test)

In [None]:
# Clean every process description with the preprocessing pipeline and store the
# result in a new `clean_description` column.
raw_descriptions = secop['descripcion_del_proceso']
secop['clean_description'] = raw_descriptions.apply(preproc_function)
del raw_descriptions  # temporary name only; keep the notebook namespace unchanged

## Word Analysis

Build dictionary and corpus with gensim

In [None]:
from gensim.corpora import Dictionary

In [None]:
# Tokenize each cleaned description on whitespace.
documents = secop['clean_description'].str.split()

# Build the token <-> id mapping, then prune it in place:
#   no_below=5   -> keep only tokens that appear in at least 5 documents
#   no_above=0.7 -> drop tokens that appear in more than 70% of the documents
# NOTE(review): filter_extremes also applies its default `keep_n` cap
# (100000 most frequent tokens in gensim's documented signature) - confirm
# that this cap is intended.
dictionary = Dictionary(documents)
dictionary.filter_extremes(no_below=5, no_above=0.7)

# Bag-of-words representation of every description over the pruned dictionary.
corpus = list(map(dictionary.doc2bow, documents))

vocab = list(dictionary.values())  # terms kept in the dictionary, as a list
vocab_set = set(vocab)             # the same terms as a set, for fast membership tests

In [None]:
# Number of terms left in the dictionary after filter_extremes.
len(vocab)

In [None]:
from collections import Counter
import itertools

# Shorthand: flatten(iterable_of_iterables) lazily yields the inner items one after another.
flatten = itertools.chain.from_iterable

In [None]:
# Re-tokenize the cleaned descriptions and keep only the tokens that survived
# the dictionary pruning (membership in `vocab_set`).
documents = (
    secop['clean_description']
    .str.split()
    .apply(lambda tokens: [tok for tok in tokens if tok in vocab_set])
)

# Total number of occurrences of each vocabulary word across all descriptions.
word_counter = Counter(flatten(documents))

In [None]:
# One row per vocabulary word with its total number of occurrences.
# The frame is built directly from the (word, count) pairs, so an empty counter
# yields an empty two-column frame instead of a column-length mismatch error.
# `df` is kept as a second name bound to the same frame, as before.
word_counts = df = pd.DataFrame(list(word_counter.items()), columns=['word', 'n'])

In [None]:
# Display the ten rows with the highest counts.
word_counts.sort_values(by='n', ascending=False).iloc[:10]

In [None]:
# Compare how many vocabulary words occur exactly once (single=True) with how
# many occur more often (single=False).
word_counts.assign(single=word_counts['n'].eq(1)).groupby('single').count()

## Stemming
spaCy tests. Maybe this will be useful for lemmatization.

In [None]:
import spacy
# Load the `es_core_news_md` spaCy model with the tagger, parser and NER
# components disabled.
# NOTE(review): the cells below read token.lemma_; whether lemmas are still
# produced with these components switched off depends on the installed
# spaCy/model version - confirm.
nlp = spacy.load("es_core_news_md", disable=['tagger', 'parser', 'ner'])

In [None]:
# Run the pipeline on a short sample sentence to inspect its output.
doc = nlp('esto puede ser un texto para Maria')

In [None]:
# Show the sample sentence with every token replaced by its lemma.
' '.join(token.lemma_ for token in doc)

Process the descriptions with stemming (the code below joins spaCy token lemmas, i.e. it lemmatizes rather than stems).

In [None]:
# Stream every cleaned description through the spaCy pipeline and collect the
# resulting documents in a Series. The Series gets a fresh 0..n-1 index, so it
# lines up with `secop` only while `secop` keeps its reset index.
clean_docs = pd.Series(nlp.pipe(secop['clean_description']))

In [None]:
# Rebuild each description as the space-joined lemmas of its tokens.
stemmed_descriptions = clean_docs.apply(
    lambda parsed: ' '.join(token.lemma_ for token in parsed)
)

In [None]:
# Attach the lemma-joined descriptions to the main table (assignment aligns on the index).
secop['stemmed_descriptions'] = stemmed_descriptions

Save processed text