# Data pre-processing

imports

In [46]:
# DataFrame
import pandas as pd

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer,WordNetLemmatizer

# Utility
import re
import pytz
import dateutil.parser
import urllib.request
import zipfile


Constants

In [47]:
# DATASET
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING_ISO = "ISO-8859-1"
DATASET_ENCODING_UTF = "UTF-8"

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"

# TIMEZONE
TZINFOS = { 'PDT': pytz.timezone('US/Pacific')}

# FILE
URL = 'https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'
TEST_FILE_NAME = "testdata.manual.2009.06.14.csv"
TRAIN_FILE_NAME = "training.1600000.processed.noemoticon.csv"

Load file

In [48]:
filehandle, _ = urllib.request.urlretrieve(URL)
zip_file_object = zipfile.ZipFile(filehandle, 'r')
train_file = zip_file_object.open(TRAIN_FILE_NAME)
test_file = zip_file_object.open(TEST_FILE_NAME)
f = open(TRAIN_FILE_NAME, "wb")
f.write(train_file.read())
f = open(TEST_FILE_NAME, "wb")
f.write(test_file.read())

74326

load Dataframe

In [49]:
df = pd.read_csv(TRAIN_FILE_NAME, names=DATASET_COLUMNS, encoding=DATASET_ENCODING_ISO, header=None, converters={'date': lambda date: dateutil.parser.parse(date, tzinfos=TZINFOS)})
df_test = pd.read_csv(TEST_FILE_NAME, names=DATASET_COLUMNS, encoding=DATASET_ENCODING_ISO, header=None, converters={'date': lambda date: dateutil.parser.parse(date, tzinfos=TZINFOS)})

print infos

In [50]:
print(df.head(5))
print(df.dtypes)
print(df.info())    

   sentiment         ids                      date      flag             user  \
0          0  1467810369 2009-04-06 23:12:45-07:00  NO_QUERY  _TheSpecialOne_   
1          0  1467810672 2009-04-06 23:12:49-07:00  NO_QUERY    scotthamilton   
2          0  1467810917 2009-04-06 23:12:53-07:00  NO_QUERY         mattycus   
3          0  1467811184 2009-04-06 23:12:57-07:00  NO_QUERY          ElleCTF   
4          0  1467811193 2009-04-06 23:12:57-07:00  NO_QUERY           Karoli   

                                                text  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  
sentiment                         int64
ids                               int64
date         datetime64[ns, US/Pacific]
flag                             object
user                   

load stopwords

In [51]:
print(df_test.head(5))
print(df_test.dtypes)
print(df_test.info())

   sentiment  ids                      date     flag      user  \
0          4    3 2009-05-11 03:17:40+00:00  kindle2    tpryan   
1          4    4 2009-05-11 03:18:03+00:00  kindle2    vcu451   
2          4    5 2009-05-11 03:18:54+00:00  kindle2    chadfu   
3          4    6 2009-05-11 03:19:04+00:00  kindle2     SIX15   
4          4    7 2009-05-11 03:21:41+00:00  kindle2  yamarama   

                                                text  
0  @stellargirl I loooooooovvvvvveee my Kindle2. ...  
1  Reading my kindle2...  Love it... Lee childs i...  
2  Ok, first assesment of the #kindle2 ...it fuck...  
3  @kenburbary You'll love your Kindle2. I've had...  
4  @mikefish  Fair enough. But i have the Kindle2...  
sentiment                      int64
ids                            int64
date         datetime64[ns, tzutc()]
flag                          object
user                          object
text                          object
dtype: object
<class 'pandas.core.frame.DataFrame'>

In [52]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Felix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Felix\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

map sentiment to constants

In [53]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

stemming and removing stopwords and letters which occur more than 2 times consecutive

In [55]:
def tokenize_text(text):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = nltk.word_tokenize(text)
    reduced_tokens = []
    wrapper = [False]
    def replace(hit):
        if not wrapper[0]:
            wrapper[0] = True
            return hit[0]
        else:
            return ""
    for word in tokens:
        if word not in stop_words:
            reduced_tokens.append(re.sub(r"(.)(?=\1+)", replace, word))
        wrapper[0] = False
    return reduced_tokens

In [64]:
def stem_text(tokens):
    return ' '.join([stemmer.stem(word) for word in tokens])

In [65]:
def lemmatize_text(tokens):
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens])

preprocessing

In [66]:
df['mapped_sentiment'] = df.sentiment.apply(decode_sentiment)
df['tokenized_text'] = df.text.apply(tokenize_text)
df['stemmed_text'] = df.tokenized_text.apply(stem_text)
df['lemmatized_text'] = df.tokenized_text.apply(lemmatize_text)


In [67]:
df_test['mapped_sentiment'] = df_test.sentiment.apply(decode_sentiment)
df_test['tokenized_text'] = df_test.text.apply(tokenize_text)
df_test['stemmed_text'] = df_test.tokenized_text.apply(stem_text)
df_test['lemmatized_text'] = df_test.tokenized_text.apply(lemmatize_text)

In [68]:
print(df.head(5))
df.info()

   sentiment         ids                      date      flag             user  \
0          0  1467810369 2009-04-06 23:12:45-07:00  NO_QUERY  _TheSpecialOne_   
1          0  1467810672 2009-04-06 23:12:49-07:00  NO_QUERY    scotthamilton   
2          0  1467810917 2009-04-06 23:12:53-07:00  NO_QUERY         mattycus   
3          0  1467811184 2009-04-06 23:12:57-07:00  NO_QUERY          ElleCTF   
4          0  1467811193 2009-04-06 23:12:57-07:00  NO_QUERY           Karoli   

                                                text mapped_sentiment  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...         NEGATIVE   
1  is upset that he can't update his Facebook by ...         NEGATIVE   
2  @Kenichan I dived many times for the ball. Man...         NEGATIVE   
3    my whole body feels itchy and like its on fire          NEGATIVE   
4  @nationwideclass no, it's not behaving at all....         NEGATIVE   

                                      tokenized_text  \
0  [aww, bummer, s

write data to file

In [69]:
print(df_test.head(5))
df_test.info()

   sentiment  ids                      date     flag      user  \
0          4    3 2009-05-11 03:17:40+00:00  kindle2    tpryan   
1          4    4 2009-05-11 03:18:03+00:00  kindle2    vcu451   
2          4    5 2009-05-11 03:18:54+00:00  kindle2    chadfu   
3          4    6 2009-05-11 03:19:04+00:00  kindle2     SIX15   
4          4    7 2009-05-11 03:21:41+00:00  kindle2  yamarama   

                                                text mapped_sentiment  \
0  @stellargirl I loooooooovvvvvveee my Kindle2. ...         POSITIVE   
1  Reading my kindle2...  Love it... Lee childs i...         POSITIVE   
2  Ok, first assesment of the #kindle2 ...it fuck...         POSITIVE   
3  @kenburbary You'll love your Kindle2. I've had...         POSITIVE   
4  @mikefish  Fair enough. But i have the Kindle2...         POSITIVE   

                                      tokenized_text  \
0    [loove, kindle2, dx, cool, 2, fantastic, right]   
1  [reading, kindle2, love, lee, childs, good, read]

In [77]:
df.to_csv("train_data_prepared.csv", encoding=DATASET_ENCODING_UTF, index=False)

In [78]:
df_test.to_csv("test_data_prepared.csv", encoding=DATASET_ENCODING_UTF, index=False)