# Data pre-processing

imports

In [3]:
# DataFrame
import pandas as pd

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Utility
import re
import pytz
import dateutil.parser
import urllib.request
import zipfile


Constants

In [4]:
# DATASET
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"

# TIMEZONE
TZINFOS = { 'PDT': pytz.timezone('US/Pacific')}

# FILE
URL = 'https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'
TEST_FILE_NAME = "testdata.manual.2009.06.14.csv"
TRAIN_FILE_NAME = "training.1600000.processed.noemoticon.csv"

Load file

In [5]:
filehandle, _ = urllib.request.urlretrieve(URL)
zip_file_object = zipfile.ZipFile(filehandle, 'r')
train_file = zip_file_object.open(TRAIN_FILE_NAME)
f = open(TRAIN_FILE_NAME, "wb")
f.write(train_file.read())

238803811

load Dataframe

In [6]:
df = pd.read_csv(TRAIN_FILE_NAME, names=DATASET_COLUMNS, encoding=DATASET_ENCODING, header=None)

print infos

In [7]:
print(df.head(5))
print(df.dtypes)
print(df.info())    

   sentiment         ids                          date      flag  \
0          0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1          0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2          0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3          0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4          0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  
sentiment     int64
ids           int64
date         object
flag         object
user         object
text         object
dtype: object
<class 'pandas.core.frame

load stopwords

In [8]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

map sentiment to constants

In [9]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

map date

In [10]:
def convert_date(date):
    return dateutil.parser.parse(date, tzinfos= TZINFOS)

stemming and removing stopwords

In [11]:
def preprocess_text(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

preprocessing

In [18]:
df_preprocessed = df.copy()
df_preprocessed.sentiment = df_preprocessed.sentiment.apply(decode_sentiment)
df_preprocessed.date = df_preprocessed.date.apply(convert_date)
df_preprocessed.text = df_preprocessed.text.apply(lambda t: preprocess_text(t, True))

In [19]:
df_preprocessed.head(5)

Unnamed: 0,sentiment,ids,date,flag,user,text
0,NEGATIVE,1467810369,2009-04-06 23:12:45-07:00,NO_QUERY,_TheSpecialOne_,awww bummer shoulda got david carr third day
1,NEGATIVE,1467810672,2009-04-06 23:12:49-07:00,NO_QUERY,scotthamilton,upset updat facebook text might cri result sch...
2,NEGATIVE,1467810917,2009-04-06 23:12:53-07:00,NO_QUERY,mattycus,dive mani time ball manag save 50 rest go bound
3,NEGATIVE,1467811184,2009-04-06 23:12:57-07:00,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
4,NEGATIVE,1467811193,2009-04-06 23:12:57-07:00,NO_QUERY,Karoli,behav mad see
