## Implementação da solução

### Carregando os dados...

In [30]:
"""
Imports
"""
import codecs
from langdetect import detect as detect_language
import pandas as pd
import regex
import spacy

In [19]:
# Global variables
# Shared spaCy pipeline, loaded once here and reused by `english_pos_tag_works`.
# NOTE(review): 'en' is presumably a shortcut/link name for an English model --
# confirm it resolves with the installed spaCy version.
nlp = spacy.load('en')

In [2]:
def read_data_to_df(filenames=None):
    """ Read the polarity review files into a single dataframe.

    Each line of a file becomes one row. Runs of whitespace inside a line
    (including the trailing line break) are collapsed to single spaces.

    Args:
        filenames: optional iterable of utf-8 text file paths. Defaults to
            the two rt-polarity files under `data/`. A file is labelled
            positive when its path contains `.pos`.

    Returns:
        A dataframe with columns `review` (cleaned line) and `label`
        (1 for positive files, 0 otherwise). The index is not reset, so row
        labels restart at 0 for every file. An empty dataframe is returned
        when `filenames` is empty.
    """
    if filenames is None:
        filenames = ['data/rt-polarity.neg', 'data/rt-polarity.pos']

    frames = []
    for filename in filenames:
        with codecs.open(filename, 'r', 'utf-8') as f:
            lines = f.readlines()
        # split/join drops the line break and squeezes repeated whitespace
        lines = [u' '.join(line.split()) for line in lines]
        df_aux = pd.DataFrame()
        df_aux['review'] = lines
        df_aux['label'] = int(u'.pos' in filename)
        frames.append(df_aux)

    # pd.concat raises on an empty list, so keep the "no files" case explicit
    if not frames:
        return pd.DataFrame()
    # concatenate once instead of re-copying the accumulated rows per file
    return pd.concat(frames)

In [27]:
# Load the polarity files into one dataframe and preview its first rows.
df = read_data_to_df()
df.head()

Unnamed: 0,review,label
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


### Processando o texto...

In [None]:
def remove_punct_html_tag(text):
    """ Remove short html-like tags and punctuation from `text`.

    Three passes are applied in order:
        1. `<...>` spans with at most 10 characters between the brackets
        are replaced by a space (longer spans are left to step 2)
        2. every character that is not an ASCII letter, a digit or
        whitespace is replaced by a space
        3. runs of whitespace are collapsed to a single space

    Args:
        text: unicode string to clean.

    Returns:
        The cleaned string, stripped of leading/trailing whitespace.
    """
    # Patterns are plain `u''` literals with escaped backslashes: the `ur''`
    # prefix is a SyntaxError on Python 3, while this spelling yields the
    # same pattern text on both Python 2 and Python 3.
    text = regex.sub(u'<.{0,10}>', u' ', text)
    text = regex.sub(u'[^a-zA-Z0-9\\s]', u' ', text)
    text = regex.sub(u'\\s+', u' ', text)
    return text.strip()

In [None]:
# Add a cleaned copy of every review, produced by `remove_punct_html_tag`.
df['review_no_punct'] = df['review'].apply(remove_punct_html_tag)

### Filtrando somente as reviews em inglês...

**Obs:** Inicialmente, pensei em usar apenas o `langdetect` para essa função, mas, aparentemente, há muitos exemplos mal classificados. Além disso, como um dos objetivos posteriores é fazer *pos tag*, uso o modelo `en` do `spacy` para checar se o *pos tag* funcionaria no texto (isto é, se nenhum token recebe a tag `X`).

In [31]:
def english_pos_tag_works(text):
    """ Tell whether the global `nlp` pipeline tags every token of `text`.

    Returns False as soon as one token carries the coarse tag `X`
    (presumably spaCy's "other/unknown" category -- confirm), True
    otherwise, including when `text` yields no tokens at all.
    """
    return all(token.pos_ != u'X' for token in nlp(text))

In [32]:
# Compute the two per-review signals read later by `should_keep`, both taken
# from the cleaned text in `review_no_punct`:
#   `language`: value returned by langdetect's `detect`
#   `english_pos_tag_works`: result of the spaCy-based check defined above
# NOTE(review): langdetect reportedly raises on text with no detectable
# features (e.g. an empty string) and is non-deterministic unless seeded --
# confirm against its documentation.
df['language'] = df['review_no_punct'].apply(detect_language)
df['english_pos_tag_works'] = df['review_no_punct'].apply(english_pos_tag_works)

In [57]:
def should_keep(row, min_length=10, long_text_length=40):
    """ Whether to keep a row or not. The rule depends on the length of the
    cleaned text:
        1. length(text) < `min_length`: never kept
        2. `min_length` <= length(text) <= `long_text_length`: kept if the
        language was identified as english or if pos tag would work in the
        text
        3. length(text) > `long_text_length`: kept only if the language was
        identified as english

    Args:
        row: mapping (e.g. a dataframe row) with the keys `review_no_punct`,
            `language` and `english_pos_tag_works`.
        min_length: texts shorter than this are always dropped.
        long_text_length: texts longer than this rely on `language` alone.

    Returns:
        True when the row should be kept, False otherwise.
    """
    text = row['review_no_punct']
    if len(text) < min_length:
        return False
    is_english = (row['language'] == 'en')
    # for long sentences, trust the language id from langdetect alone
    if len(text) > long_text_length:
        return is_english
    # short sentences also pass when the pos tag check succeeded
    return is_english or bool(row['english_pos_tag_works'])

In [58]:
# Evaluate `should_keep` on every row (axis=1) and store the verdict in the
# `keep` column, then preview the first rows.
df['keep'] = df.apply(should_keep, axis=1)
df.head()

Unnamed: 0,review,label,review_no_punct,language,english_pos_tag_works,keep
0,"simplistic , silly and tedious .",0,simplistic silly and tedious,en,True,True
1,"it's so laddish and juvenile , only teenage bo...",0,it s so laddish and juvenile only teenage boys...,en,True,True
2,exploitative and largely devoid of the depth o...,0,exploitative and largely devoid of the depth o...,en,True,True
3,[garbus] discards the potential for pathologic...,0,garbus discards the potential for pathologica...,en,True,True
4,a visually flashy but narratively opaque and e...,0,a visually flashy but narratively opaque and e...,en,True,True


In [59]:
# keep only the rows flagged in `keep`, restricted to the columns still needed
useful_cols = ['review', 'review_no_punct', 'label']
df = df[df['keep']==True][useful_cols]
df.head()

Unnamed: 0,review,review_no_punct,label
0,"simplistic , silly and tedious .",simplistic silly and tedious,0
1,"it's so laddish and juvenile , only teenage bo...",it s so laddish and juvenile only teenage boys...,0
2,exploitative and largely devoid of the depth o...,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,garbus discards the potential for pathologica...,0
4,a visually flashy but narratively opaque and e...,a visually flashy but narratively opaque and e...,0
