## Implementação da solução

### Carregando os dados...

In [21]:
"""
Imports
"""
import codecs
from langdetect import detect as detect_language
import pandas as pd
import regex
from sklearn.model_selection import StratifiedShuffleSplit
import spacy

In [2]:
# Global variables
# spaCy English pipeline; used below for POS tagging, stop-word flags and
# entity detection.
nlp = spacy.load('en')
# NOTE(review): not referenced anywhere in the visible part of this file.
vowels_list = [u'a', u'e', u'i', u'o', u'u']
# Negation words; get_word2postag() clears their stop-word flag so they are
# not discarded together with the regular stopwords.
neg_words_list = [u'cannot', u'neither', u'no', u'nobody', u'none',
                  u'noone', u'nor', u'not', u'nothing', u'nowhere',
                  u'off']
# Contractions whose expansion is not simply "<stem> not".
irregular_not_contractions = {u"won't": u'will not', u"ain't": u'is not'}
# One or more ASCII letters followed by "n't" (e.g. "doesn't").
REX_NOT_CONTRACTIONS_FULL_WORD = regex.compile(ur"[a-zA-Z]+n\'t")
# Just the "n't" suffix.
REX_NOT_CONTRACTIONS = regex.compile(ur"n\'t")
# "<", up to 10 arbitrary characters, then ">".
REX_HTML_TAG = regex.compile(ur'<.{0,10}>')
# Any character that is not an ASCII letter, a digit or whitespace.
REX_NON_ALPHANUM_SP = regex.compile(ur'[^a-zA-Z0-9\s]')
# A run of one or more whitespace characters.
REX_EXTRA_SPACE = regex.compile(ur'\s+')

In [3]:
def read_data_to_df():
    """Load the two polarity review files into a single dataframe.

    Reads ``data/rt-polarity.neg`` and ``data/rt-polarity.pos`` as UTF-8,
    one review per line. Every run of whitespace inside a line (including
    the trailing line break) is collapsed into a single space.

    Returns:
        pandas.DataFrame with the columns ``review`` (text) and ``label``
        (1 for rows coming from the ``.pos`` file, 0 otherwise). The index
        is a unique 0..n-1 range over all rows, so label-based selection
        never matches one row per input file.
    """
    filenames = ['data/rt-polarity.neg', 'data/rt-polarity.pos']
    frames = []
    for filename in filenames:
        with codecs.open(filename, 'r', 'utf-8') as f:
            lines = f.readlines()
        # split()/join() already drops leading/trailing whitespace and the
        # line break, so no extra strip() is needed
        lines = [u' '.join(line.split()) for line in lines]
        frame = pd.DataFrame({'review': lines})
        frame['label'] = int(u'.pos' in filename)
        frames.append(frame)
    # concatenate once, renumbering rows so index labels are not duplicated
    return pd.concat(frames, ignore_index=True)

In [4]:
# Load both polarity files into `df` and preview the first rows.
df = read_data_to_df()
df.head()

Unnamed: 0,review,label
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


### Pré processamento do texto...

* expansão de contrações como "doesn't" para "does not"
* remoção de pontuação e caracteres especiais
* remoção de tags html

In [5]:
def expand_not_contractions(text):
    """Expand "n't" contractions into "<verb> not".

    Examples:
        doesn't -> does not
        didn't  -> did not
        can't   -> can not
        won't   -> will not  (taken from `irregular_not_contractions`)

    Every match of `REX_NOT_CONTRACTIONS_FULL_WORD` is rewritten where it
    occurs, so a contraction glued to punctuation ("don't,") is expanded
    too. Irregular forms are looked up case-insensitively and replaced by
    their lowercase expansion. Runs of whitespace in the result are
    collapsed to single spaces and the ends are trimmed.
    """
    def _expand(match):
        # Build the expansion for one matched contraction.
        word = match.group(0)
        irregular = irregular_not_contractions.get(word.lower())
        if irregular is not None:
            return irregular
        stem = word[:-3]  # drop the trailing "n't"
        # "can't" leaves the stem "ca": give back the "n" the contraction
        # swallowed so the result is "can not"
        if stem[-1].lower() == u'a':
            stem += u'n'
        return stem + u' not'

    expanded = REX_NOT_CONTRACTIONS_FULL_WORD.sub(_expand, text)
    return u' '.join(expanded.split())


def remove_punct_html_tag(text):
    """ Strip html tags and punctuation from `text`.

    Applies, in this order, the html-tag, non-alphanumeric and
    extra-whitespace patterns, each match being replaced by one space, and
    returns the result without leading/trailing whitespace.
    """
    for pattern in (REX_HTML_TAG, REX_NON_ALPHANUM_SP, REX_EXTRA_SPACE):
        text = pattern.sub(u' ', text)
    return text.strip()


def normalize_text(text):
    """ Normalize a review: expand "n't" contractions first, then remove
    html tags and punctuation.
    """
    return remove_punct_html_tag(expand_not_contractions(text))

In [6]:
# Store the normalized version of every review in `norm_review`.
df['norm_review'] = df['review'].apply(normalize_text)
df.head()

Unnamed: 0,review,label,norm_review
0,"simplistic , silly and tedious .",0,simplistic silly and tedious
1,"it's so laddish and juvenile , only teenage bo...",0,it s so laddish and juvenile only teenage boys...
2,exploitative and largely devoid of the depth o...,0,exploitative and largely devoid of the depth o...
3,[garbus] discards the potential for pathologic...,0,garbus discards the potential for pathological...
4,a visually flashy but narratively opaque and e...,0,a visually flashy but narratively opaque and e...


### Filtrando somente as reviews em inglês...

**Obs:** Inicialmente, pensei em usar somente o `langdetect` para filtrar as reviews por língua, mas essa abordagem deixa de fora algumas reviews em inglês que são classificadas erroneamente, possivelmente por serem muito curtas. Para contornar a situação, e como um dos objetivos posteriores é fazer *pos tagging*, criei um critério alternativo para manter a review no dataset quando ela é curta: o *pos tagging* com o modelo de língua inglesa do `spacy` precisa funcionar no texto, isto é, nenhum token pode receber a tag `X`.

In [7]:
def english_pos_tag_works(text):
    """ Tell whether the English model can pos tag `text`.

    Returns False as soon as one token gets the pos tag 'X', and True
    otherwise (also when the text yields no tokens).
    """
    return all(token.pos_ != u'X' for token in nlp(text))

In [8]:
# calculate fields `language` and `english_pos_tag_works`
# NOTE(review): langdetect is reported to be non-deterministic unless
# DetectorFactory.seed is set, and to raise on text without detectable
# features (e.g. an empty string) -- confirm against the langdetect docs.
df['language'] = df['norm_review'].apply(detect_language)
df['english_pos_tag_works'] = df['norm_review'].apply(english_pos_tag_works)

In [9]:
def should_keep(row):
    """ Decide whether a review row stays in the dataset.

    The rule depends on the length of `row['norm_review']`:
        * fewer than 10 characters: always dropped;
        * 10 to 40 characters: kept when `row['language']` is 'en' or
          `row['english_pos_tag_works']` is True;
        * more than 40 characters: kept only when `row['language']` is 'en'
          (langdetect alone decides for long texts).
    """
    n_chars = len(row['norm_review'])
    if n_chars < 10:
        return False
    is_english = (row['language'] == 'en')
    tagger_ok = row['english_pos_tag_works']
    if n_chars <= 40:
        return (is_english or tagger_ok == True)
    return is_english

In [10]:
# calculate `should_keep` row by row (axis=1 passes each row to the function)
df['keep'] = df.apply(should_keep, axis=1)
df.head()

Unnamed: 0,review,label,norm_review,language,english_pos_tag_works,keep
0,"simplistic , silly and tedious .",0,simplistic silly and tedious,en,True,True
1,"it's so laddish and juvenile , only teenage bo...",0,it s so laddish and juvenile only teenage boys...,en,True,True
2,exploitative and largely devoid of the depth o...,0,exploitative and largely devoid of the depth o...,en,True,True
3,[garbus] discards the potential for pathologic...,0,garbus discards the potential for pathological...,en,True,True
4,a visually flashy but narratively opaque and e...,0,a visually flashy but narratively opaque and e...,en,True,True


In [11]:
# remove lines with `keep = False` and useless cols
# (the helper columns `language`, `english_pos_tag_works` and `keep` are
# dropped; only the raw text, the normalized text and the label remain)
df = df[df['keep']==True]
useful_cols = ['review', 'norm_review', 'label']
df = df[useful_cols]
df.head()

Unnamed: 0,review,norm_review,label
0,"simplistic , silly and tedious .",simplistic silly and tedious,0
1,"it's so laddish and juvenile , only teenage bo...",it s so laddish and juvenile only teenage boys...,0
2,exploitative and largely devoid of the depth o...,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,garbus discards the potential for pathological...,0
4,a visually flashy but narratively opaque and e...,a visually flashy but narratively opaque and e...,0


### Mais etapas de processamento do texto...

* remoção das stopwords (com exceção de expressões que indicam negação)
* remoção de nomes de entidades (como nomes de pessoas, nomes de cidades, datas, eventos etc.)
* colunas com as combinações:
    * adjetivos e advérbios (`adj_adv`);
    * adjetivos e verbos (`adj_verb`);
    * adjetivos, advérbios e verbos (`adj_adv_verb`)

In [12]:
def get_word2postag(df, text_col):
    """ Map every word of column `text_col` of `df` to its pos tag,
        leaving out stopwords and tokens that belong to an entity.

        Side effect: the stop-word flag of every word in `neg_words_list`
        is cleared in the shared `nlp` vocabulary, so negation words are
        kept in the mapping.
    """
    # vocabulary: unique whitespace-separated words over all rows
    all_text = u' '.join(df[text_col].tolist())
    vocabulary = list(set(all_text.split()))

    # negation words must not be treated as stopwords
    for neg_word in neg_words_list:
        nlp.vocab[neg_word].is_stop = False

    # run the English model once over the whole vocabulary; it provides the
    # pos tag, the stop-word flag and the entity type of each token
    doc = nlp(u' '.join(vocabulary))
    return {
        token.text.strip(): token.pos_
        for token in doc
        if not token.is_stop and token.ent_type_ == u''
    }


def filter_out_words(text, words_to_keep):
    """ Drop from `text` every whitespace-separated word that is not in
    `words_to_keep`; the remaining words are joined by single spaces.
    """
    return u' '.join(
        word for word in text.split() if word in words_to_keep
    )


def filter_words_with_postags(text, allowed_postags, word2postag):
    """ Keep only the words of `text` whose pos tag, looked up in
    `word2postag`, is one of `allowed_postags`.

    Every word of `text` must be a key of `word2postag`; a missing word
    raises KeyError.
    """
    kept = [
        word for word in text.split()
        if word2postag[word] in allowed_postags
    ]
    return u' '.join(kept)

In [13]:
# Remove stopwords and entity names: only the words that received a pos tag
# in `get_word2postag` are kept in `filtered_review`.
word2postag = get_word2postag(df, u'norm_review')
# a set gives a constant-time membership test for every token of every
# review (a plain list of keys would be scanned once per token)
words_to_keep = set(word2postag)
df['filtered_review'] = df['norm_review'].apply(
    filter_out_words,
    args=(words_to_keep,)
)
# One derived column per combination of pos tags.
# NOTE(review): u'ADP' is the adposition tag, yet it is part of every
# combination although the columns are described as adjectives, adverbs and
# verbs only -- confirm whether it is intended.
colname2postags = {
    'adj_adv':[u'ADJ', u'ADP', u'ADV'],
    'adj_verb':[u'ADJ', u'ADP', u'VERB'],
    'adj_adv_verb': [u'ADJ', u'ADP', u'ADV', u'VERB']
}
for colname, allowed_postags in colname2postags.items():
    df[colname] = df['filtered_review'].apply(
        filter_words_with_postags,
        args=(allowed_postags, word2postag)
    )
df.head()

Unnamed: 0,review,norm_review,label,filtered_review,adj_adv,adj_adv_verb,adj_verb
0,"simplistic , silly and tedious .",simplistic silly and tedious,0,simplistic silly tedious,simplistic silly tedious,simplistic silly tedious,simplistic silly tedious
1,"it's so laddish and juvenile , only teenage bo...",it s so laddish and juvenile only teenage boys...,0,s laddish juvenile teenage boys possibly find ...,laddish juvenile teenage possibly funny,laddish juvenile teenage possibly find funny,laddish juvenile teenage find funny
2,exploitative and largely devoid of the depth o...,exploitative and largely devoid of the depth o...,0,exploitative largely devoid depth sophisticati...,exploitative largely devoid graphic bearable,exploitative largely devoid watching graphic b...,exploitative devoid watching graphic bearable
3,[garbus] discards the potential for pathologic...,garbus discards the potential for pathological...,0,garbus discards potential pathological study e...,potential pathological instead circumstantial,discards potential pathological exhuming inste...,discards potential pathological exhuming skewe...
4,a visually flashy but narratively opaque and e...,a visually flashy but narratively opaque and e...,0,visually flashy narratively opaque emotionally...,visually flashy narratively opaque emotionally...,visually flashy narratively opaque emotionally...,flashy opaque vapid mystification


#### Contagem de cada coluna

In [15]:
def not_empty(series):
    """ Count the distinct non-empty values of a series.

    Values are converted to text first and the empty string is ignored.
    NOTE(review): duplicates are counted once, so the result is the number
    of *distinct* non-empty texts, not the number of non-empty rows --
    confirm this is the meaning wanted for the count table.
    """
    # `unicode` on Python 2, `str` on Python 3
    text_type = type(u'')
    values = series.astype(text_type).tolist()
    return len({value for value in values if value != u''})

# exhibits table with lengths of dataset for each text column
# (each column is reduced with `not_empty`; the result is then reshaped
# into a two-column table: column name and count)
count_table = df.aggregate(
    {
        'filtered_review': not_empty,
        'adj_adv':not_empty,
        'adj_adv_verb':not_empty,
        'adj_verb': not_empty
    }
)
count_table = count_table.reset_index()
count_table.columns = ['col', 'count']
count_table

Unnamed: 0,col,count
0,adj_adv,9255
1,adj_adv_verb,10277
2,adj_verb,10025
3,filtered_review,10553


### Prepara inputs do fasttext...

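* gera 10 divisões estratificadas de treino/teste do conjunto de dados (`StratifiedShuffleSplit`, com 25% dos dados para teste em cada divisão)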

* cria arquivo texto com a formatação de input para treinamento do modelo supervisionado:
    * cada linha deve ter a `target_label` seguida do `texto`: 
        "\__label\__<`target_label`> <`texto`>"

In [35]:
def prepare_data_fasttext(df, text_col, target_col, index_list, suffix=u''):
    """ Write the selected rows of `df` to a text file in the layout
    expected by fasttext, one "__label__<target> <text>" entry per line.

        Only rows whose index label is in `index_list` are written, in the
        order they appear in `df`. The file is `data/input_<suffix>.txt`,
        encoded as UTF-8, with no line break after the last entry.
        Return filename
    """
    subset = df[df.index.isin(index_list)]
    texts = subset[text_col].tolist()
    targets = subset[target_col].tolist()
    content = '\n'.join(
        u'__label__{target} {text}'.format(target=target, text=text)
        for target, text in zip(targets, texts)
    )
    filename = 'data/input_{}.txt'.format(suffix)
    with codecs.open(filename, 'w', 'utf-8') as f:
        f.write(content)
    return filename


def prepare_data_fasttext_kfolds(df, text_col, target_col, k, test_size=0.25,
                                 random_state=None):
    """ Prepare fasttext input files for `k` stratified train/test splits.

    Rows whose `text_col` is the empty string are dropped, then a
    StratifiedShuffleSplit draws `k` independent splits stratified on
    `target_col`; each split writes one train file and one test file
    through `prepare_data_fasttext`.

    Args:
        df: dataframe holding the text and target columns.
        text_col: name of the text column.
        target_col: name of the label column.
        k: number of splits to generate.
        test_size: share of the rows put in each test set (default 25%).
        random_state: forwarded to StratifiedShuffleSplit; pass an int to
            obtain the same splits on every run (default None keeps the
            previous, non-reproducible behaviour).

    Returns:
        dictionary with list of train and test filenames
        `{'train_files': [], 'test_files': []}`
    """
    sss = StratifiedShuffleSplit(
        n_splits=k,
        test_size=test_size,
        random_state=random_state
    )
    # drop rows with no text
    df = df[df[text_col] != u'']
    # the splitter yields positions; after the reset they coincide with the
    # index labels that `prepare_data_fasttext` selects on
    df = df.reset_index(drop=True)
    targets = df[target_col].tolist()
    X = df.index.tolist()

    train_files = []
    test_files = []
    for count, (train_index, test_index) in enumerate(sss.split(X, targets)):
        train_suffix = u'train_set_{text_col}_{num}'.format(
            text_col=text_col, num=count)
        train_files.append(prepare_data_fasttext(
            df, text_col, target_col, train_index, suffix=train_suffix))
        test_suffix = u'test_set_{text_col}_{num}'.format(
            text_col=text_col, num=count)
        test_files.append(prepare_data_fasttext(
            df, text_col, target_col, test_index, suffix=test_suffix))

    return {'train_files': train_files, 'test_files': test_files}

In [36]:
# set the number of splits k; the Stratified ShuffleSplit cross-validator
# divides the data into k splits, each with `test_size` of the dataset
# held out for testing
k = 10
test_size = 0.25
# set text_col and target_col to be used
text_col = u'filtered_review'
target_col = u'label'
# pass the `test_size` variable (not a repeated literal) so changing it
# above actually changes the splits
train_test_filenames = prepare_data_fasttext_kfolds(
    df, text_col, target_col, k, test_size=test_size
)

### Treinamento do modelo de análise de sentimentos...

### Validação do conjunto teste...