In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer

In [8]:
df_raw_data = pd.read_csv('data/sentiment140.csv', encoding='latin', header=None)
df_raw_data.columns = ['target', 'id', 'date', 'flag', 'username', 'tweet']
df_raw_data.head()

Unnamed: 0,target,id,date,flag,username,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
print("Min date: " + str(df_raw_data.date.min()) + ". Max date: " + str(df_raw_data.date.max()))
print("Unique user: " + str(df_raw_data.username.nunique()))
print("Number of data on each class: ")
print(df_raw_data.target.value_counts())

Min date: Fri Apr 17 20:30:31 PDT 2009. Max date: Wed May 27 07:27:38 PDT 2009
Unique user: 659775
Number of data on each class: 
4    800000
0    800000
Name: target, dtype: int64


In [10]:
df_raw_data = df_raw_data[['target', 'tweet']]
df_raw_data.head()

Unnamed: 0,target,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [11]:
df_raw_data[df_raw_data.target==0].head()

Unnamed: 0,target,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [12]:
df_raw_data[df_raw_data.target==4].head()

Unnamed: 0,target,tweet
800000,4,I LOVE @Health4UandPets u guys r the best!!
800001,4,im meeting up with one of my besties tonight! ...
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,4,Being sick can be really cheap when it hurts t...
800004,4,@LovesBrooklyn2 he has that effect on everyone


In [13]:
tok = WordPunctTokenizer()
username_pat = r'@[A-Za-z0-9_]+|https?://[^ ]+'
web_pat = r'www\.[^ ]+'
negation_dic = {("isnt", "isn't"):"is not", ("arent", "aren't"):"are not", ("wasnt", "wasn't"):"was not", 
                ("werent","weren't"): "were not", ("havent","haven't"): "have not", ("hasnt", "hasn't"): "has not", 
                ("hadnt", "hadn't"): "had not", ("wont", "won't"): "will not", ("wouldnt","wouldn't"): "would not", 
                ("dont", "don't"):"do not", ("doesnt", "doesn't"): "does not", ("didnt", "didn't") : "did not", 
                ("cant", "can't"):"can not", ("couldnt", "couldn't"):"could not", ("shouldnt","shouldn't"):"should not",
               ("mightnt", "mightn't"): "might not", ("musnt", "musn't"): "must not"}
negation_dic = {k:v for kl,v in negation_dic.items() for k in kl}
negation_pat = re.compile(r'\b('+'|'.join(negation_dic.keys()) + r')\b')

def data_cleansing(tweet):
    soup = BeautifulSoup(tweet, 'html5lib')
    tweet = soup.get_text()
    
    try:
        tweet = tweet.decode("utf-8-sig").replace(u"\ufffd","?")
    except:
        tweet = tweet
    
    tweet = re.sub(username_pat, '', tweet)
    tweet = re.sub(web_pat, '', tweet)
    tweet = tweet.lower()
    tweet = negation_pat.sub(lambda x: negation_dic[x.group()], tweet)
    tweet = re.sub("[^A-Za-z]", " ", tweet)
    
    tweet = [word for word in tok.tokenize(tweet) if len(word) > 1]
    
    tweet = (" ".join(tweet)).strip()
    
    return tweet

In [14]:
df_cleaned_tweets = []
for i in range(0, 1600000):
    if (i+1)%100000 == 0:
        print('{} of {} have been cleaned'.format(i+1, 1600000))
    df_cleaned_tweets.append(data_cleansing(df_raw_data.tweet[i]))

100000 of 1600000 have been cleaned
200000 of 1600000 have been cleaned
300000 of 1600000 have been cleaned
400000 of 1600000 have been cleaned
500000 of 1600000 have been cleaned
600000 of 1600000 have been cleaned
700000 of 1600000 have been cleaned


  ' Beautiful Soup.' % markup)


800000 of 1600000 have been cleaned
900000 of 1600000 have been cleaned
1000000 of 1600000 have been cleaned
1100000 of 1600000 have been cleaned
1200000 of 1600000 have been cleaned


  ' Beautiful Soup.' % markup)


1300000 of 1600000 have been cleaned
1400000 of 1600000 have been cleaned
1500000 of 1600000 have been cleaned
1600000 of 1600000 have been cleaned


In [15]:
df_cleaned_tweets = pd.DataFrame(df_cleaned_tweets, columns=['tweet'])
df_cleaned_tweets['target'] = df_raw_data.target
df_cleaned_tweets = df_cleaned_tweets.replace('', np.nan, regex=True)
df_cleaned_tweets.dropna(inplace=True)
df_cleaned_tweets.reset_index(drop=True,inplace=True)
df_cleaned_tweets.loc[df_cleaned_tweets.target == 4, 'target'] = 1
df_cleaned_tweets.head()

Unnamed: 0,tweet,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [29]:
df_cleaned_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1596170 entries, 0 to 1596169
Data columns (total 2 columns):
tweet     1596170 non-null object
target    1596170 non-null int64
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [18]:
df_cleaned_tweets.to_csv('data/cleaned_tweets.csv', index=False)

In [3]:
df_cleaned_tweets = pd.read_csv('data/cleaned_tweets.csv')

In [4]:
test_neg = df_cleaned_tweets[df_cleaned_tweets.target==0].sample(500000)
test_pos = df_cleaned_tweets[df_cleaned_tweets.target==1].sample(500000)
train_neg = df_cleaned_tweets[(df_cleaned_tweets.target==0) & (~df_cleaned_tweets.tweet.isin(test_neg.tweet))]
train_pos = df_cleaned_tweets[(df_cleaned_tweets.target==1) & (~df_cleaned_tweets.tweet.isin(test_pos.tweet))]

test_neg.reset_index(drop=True, inplace=True)
test_pos.reset_index(drop=True, inplace=True)
train_neg.reset_index(drop=True, inplace=True)
train_pos.reset_index(drop=True, inplace=True)

In [5]:
test = pd.concat([test_neg, test_pos], ignore_index=True)
train = pd.concat([train_neg, train_pos], ignore_index=True)

In [6]:
test.to_csv('data/cleaned_test.csv', index=False)
train.to_csv('data/cleaned_train.csv', index=False)