# Data pre-processing

imports

In [1]:
# DataFrame
import pandas as pd

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer,WordNetLemmatizer

# Utility
import re
import pytz
import dateutil.parser
import urllib.request
import zipfile


Constants

In [2]:
# DATASET
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"

# TIMEZONE
TZINFOS = { 'PDT': pytz.timezone('US/Pacific')}

# FILE
URL = 'https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'
TEST_FILE_NAME = "testdata.manual.2009.06.14.csv"
TRAIN_FILE_NAME = "training.1600000.processed.noemoticon.csv"

Load file

In [3]:
filehandle, _ = urllib.request.urlretrieve(URL)
zip_file_object = zipfile.ZipFile(filehandle, 'r')
train_file = zip_file_object.open(TRAIN_FILE_NAME)
f = open(TRAIN_FILE_NAME, "wb")
f.write(train_file.read())

238803811

load Dataframe

In [4]:
df = pd.read_csv(TRAIN_FILE_NAME, names=DATASET_COLUMNS, encoding=DATASET_ENCODING, header=None, converters={'date': lambda date: dateutil.parser.parse(date, tzinfos=TZINFOS)})

print infos

In [5]:
print(df.head(5))
print(df.dtypes)
print(df.info())    

   sentiment         ids                      date      flag             user  \
0          0  1467810369 2009-04-06 23:12:45-07:00  NO_QUERY  _TheSpecialOne_   
1          0  1467810672 2009-04-06 23:12:49-07:00  NO_QUERY    scotthamilton   
2          0  1467810917 2009-04-06 23:12:53-07:00  NO_QUERY         mattycus   
3          0  1467811184 2009-04-06 23:12:57-07:00  NO_QUERY          ElleCTF   
4          0  1467811193 2009-04-06 23:12:57-07:00  NO_QUERY           Karoli   

                                                text  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  
sentiment                         int64
ids                               int64
date         datetime64[ns, US/Pacific]
flag                             object
user                   

load stopwords

In [6]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Felix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Felix\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

map sentiment to constants

In [7]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

stemming and removing stopwords and letters which occur more than 2 times consecutive

In [41]:
def tokenize_text(text):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = nltk.word_tokenize(text)
    reduced_tokens = []
    wrapper = [False]
    def replace(hit):
        if not wrapper[0]:
            wrapper[0] = True
            return hit[0]
        else:
            return ""
    for word in tokens:
        if word not in stop_words:
            reduced_tokens.append(re.sub(r"(.)(?=\1+)", replace, word))
        wrapper[0] = False
    return reduced_tokens

In [9]:
def stem_text(tokens):
    return [stemmer.stem(word) for word in tokens]

In [10]:
def lemmatize_text(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

preprocessing

In [42]:
df['mapped_sentiment'] = df.sentiment.apply(decode_sentiment)
df['tokenized_text'] = df.text.apply(tokenize_text)
df['stemmed_text'] = df.tokenized_text.apply(stem_text)
df['lemmatized_text'] = df.tokenized_text.apply(lemmatize_text)


In [43]:
print(df.head(5))
df.info()

   sentiment         ids                      date      flag             user  \
0          0  1467810369 2009-04-06 23:12:45-07:00  NO_QUERY  _TheSpecialOne_   
1          0  1467810672 2009-04-06 23:12:49-07:00  NO_QUERY    scotthamilton   
2          0  1467810917 2009-04-06 23:12:53-07:00  NO_QUERY         mattycus   
3          0  1467811184 2009-04-06 23:12:57-07:00  NO_QUERY          ElleCTF   
4          0  1467811193 2009-04-06 23:12:57-07:00  NO_QUERY           Karoli   

                                                text mapped_sentiment  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...         NEGATIVE   
1  is upset that he can't update his Facebook by ...         NEGATIVE   
2  @Kenichan I dived many times for the ball. Man...         NEGATIVE   
3    my whole body feels itchy and like its on fire          NEGATIVE   
4  @nationwideclass no, it's not behaving at all....         NEGATIVE   

                                      tokenized_text  \
0  [aww, bummer, s

write data to file

In [45]:
df[['mapped_sentiment', 'lemmatized_text']].to_csv("train_data_prepared.csv", encoding="UTF-8", index=False)