In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
#nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')


pd.options.display.max_colwidth = None


DATA_NAME = 'sentiment140/training.1600000.processed.noemoticon.csv'
ENCODING = 'latin-1'
COLUMN_NAMES = ['sentiment', 'id', 'date', 'flag', 'user', 'tweet']
NROWS = 1600000
stop_words = stopwords.words('english')


df = pd.read_csv(DATA_NAME,
                 encoding=ENCODING,
                 header=None,
                 names=COLUMN_NAMES,
                 nrows=NROWS,
                 parse_dates=['date'])

df.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Anne\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anne\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  df = pd.read_csv(DATA_NAME,


Unnamed: 0,sentiment,id,date,flag,user,tweet
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [33]:
#sorted(stop_words, key=str.casefold)
#stop_words

# Preprocessing methods

In [2]:
def shorten_multiple_letters(tweet):
  reduced_tokens = []
  wrapper = [False]

  def replace(hit):
      if not wrapper[0]:
        wrapper[0] = True
        return hit[0]
      else:
        return ""

  for word in tweet.split():
      reduced_tokens.append(re.sub(r"(.)(?=\1+)", replace, word))
      wrapper[0] = False

  return ' '.join(reduced_tokens)

In [3]:

sentiment_recoding_map = {
        0: 0,
        4: 1
    }

def sentiment_recoding(sentiment):
  return sentiment_recoding_map[int(sentiment)]

In [4]:
lemmatizer = WordNetLemmatizer()

# csrt with apostrophe
def lem_tweets_with_apostrophe(tweet):
  # clean tweets from mentions, hashtags, URLs, special characters except "'"
  tweet = re.sub(r'@\w+|#\w+|http\S+|[^A-Za-z0-9\']+', ' ', tweet).lower().strip()

  # shorten multiple letters
  tweet = shorten_multiple_letters(tweet)

  # remove stopwords
  tweet = ' '.join([word for word in tweet.split() if word not in stop_words])

  # tokenize
  tweet = nltk.word_tokenize(tweet)

  # lemmatize
  tweet = [lemmatizer.lemmatize(word) for word in tweet]

  return ' '.join(tweet)


# cstr no apostrophe
def lem_tweets_no_apostrophe(tweet):
  # clean tweets from mentions, hashtags, URLs, special characters except "'"
  tweet = re.sub(r'@\w+|#\w+|http\S+|[^A-Za-z0-9]+', ' ', tweet).lower().strip()

  # shorten multiple letters
  tweet = shorten_multiple_letters(tweet)

  # tokenize
  tweet = nltk.word_tokenize(tweet)

  # remove stopwords
  tweet = [word for word in tweet if word not in stop_words]

  # lemmatize
  tweet = [lemmatizer.lemmatize(word) for word in tweet]

  return ' '.join(tweet)

In [5]:
stemmer = PorterStemmer()

# csrt with apostrophe
def stem_tweets_with_apostrophe(tweet):
  # clean tweets from mentions, hashtags, URLs, special characters except "'"
  tweet = re.sub(r'@\w+|#\w+|http\S+|[^A-Za-z0-9\']+', ' ', tweet).lower().strip()

  # shorten multiple letters
  tweet = shorten_multiple_letters(tweet)

  # remove stopwords
  tweet = ' '.join([word for word in tweet.split() if word not in stop_words])

  # tokenize
  tweet = nltk.word_tokenize(tweet)

  # lemmatize
  tweet = [stemmer.stem(word) for word in tweet]

  return ' '.join(tweet)


# cstr no apostrophe
def stem_tweets_no_apostrophe(tweet):
  # clean tweets from mentions, hashtags, URLs, special characters except "'"
  tweet = re.sub(r'@\w+|#\w+|http\S+|[^A-Za-z0-9]+', ' ', tweet).lower().strip()

  # shorten multiple letters
  tweet = shorten_multiple_letters(tweet)

  # tokenize
  tweet = nltk.word_tokenize(tweet)

  # remove stopwords
  tweet = [word for word in tweet if word not in stop_words]

  # lemmatize
  tweet = [stemmer.stem(word) for word in tweet]

  return ' '.join(tweet)

# Preprocessing

In [6]:
# remove unnecessary columns
df.drop(columns=['id', 'date', 'flag', 'user'], inplace=True)

# sentement recoding
df.sentiment = df.sentiment.apply(lambda sentiment: sentiment_recoding(sentiment))

### Creating different preprocessed datasets by applying preprocessing steps
#### Lemmatized datasets

In [7]:
df_with_apostrophe_lem = df.copy()
df_no_apostrophe_lem = df.copy()


# 1) preprocessing with lemmatizing and with apostrophe
df_with_apostrophe_lem.tweet = df_with_apostrophe_lem.tweet.apply(lambda tweet: lem_tweets_with_apostrophe(tweet))

# removing empty rows
df_with_apostrophe_lem.drop(df_with_apostrophe_lem[df_with_apostrophe_lem.tweet == ''].index, inplace=True)
#df_with_apostrophe_lem.tweet = df_with_apostrophe_lem.tweet.apply(lambda tweet: tweet.split())

print(df_with_apostrophe_lem.head(5))
print('with ', df_with_apostrophe_lem.shape)


# 2) preprocessing with lemmatizing and no apostrophe
df_no_apostrophe_lem.tweet = df_no_apostrophe_lem.tweet.apply(lambda tweet: lem_tweets_no_apostrophe(tweet))

# removing empty rows
df_no_apostrophe_lem.drop(df_no_apostrophe_lem[df_no_apostrophe_lem.tweet == ''].index, inplace=True)
#df_no_apostrophe_lem.tweet = df_no_apostrophe_lem.tweet.apply(lambda tweet: tweet.split())

print(df_no_apostrophe_lem.head(5))
print('no ', df_no_apostrophe_lem.shape)

   sentiment  \
0          0   
1          0   
2          0   
3          0   
4          0   

                                                                          tweet  
0                           aww that 's bummer shoulda got david carr third day  
1  upset ca n't update facebook texting might cry result school today also blah  
2                            dived many time ball managed save 50 rest go bound  
3                                               whole body feel itchy like fire  
4                                                  behaving i 'm mad ca n't see  
with  (1592606, 2)
   sentiment  \
0          0   
1          0   
2          0   
3          0   
4          0   

                                                                   tweet  
0                            aww bummer shoulda got david carr third day  
1  upset update facebook texting might cry result school today also blah  
2                     dived many time ball managed save 50 rest go bou

In [9]:
# save to csv
df_with_apostrophe_lem.to_csv('sentiment140/sentiment140_lem_all_with_apostrophe.csv', index=False, header=False)

df_no_apostrophe_lem.to_csv('sentiment140/sentiment140_lem_all_no_apostrophe.csv', index=False, header=False)

#### Stemmed datasets

In [10]:
df_with_apostrophe_stem = df.copy()
df_no_apostrophe_stem = df.copy()


# 1) preprocessing with stemming and with apostrophe
df_with_apostrophe_stem.tweet = df_with_apostrophe_stem.tweet.apply(lambda tweet: stem_tweets_with_apostrophe(tweet))

# removing empty rows
df_with_apostrophe_stem.drop(df_with_apostrophe_stem[df_with_apostrophe_stem.tweet == ''].index, inplace=True)
#df_with_apostrophe_stem.tweet = df_with_apostrophe_stem.tweet.apply(lambda tweet: tweet.split())

print(df_with_apostrophe_stem.head(5))
print('with ', df_with_apostrophe_stem.shape)


# 2) preprocessing with stemming and no apostrophe
df_no_apostrophe_stem.tweet = df_no_apostrophe_stem.tweet.apply(lambda tweet: stem_tweets_no_apostrophe(tweet))

# removing empty rows
df_no_apostrophe_stem.drop(df_no_apostrophe_stem[df_no_apostrophe_stem.tweet == ''].index, inplace=True)
#df_no_apostrophe_stem.tweet = df_no_apostrophe_stem.tweet.apply(lambda tweet: tweet.split())

print(df_no_apostrophe_stem.head(5))
print('no ', df_no_apostrophe_stem.shape)

   sentiment  \
0          0   
1          0   
2          0   
3          0   
4          0   

                                                                      tweet  
0                       aww that 's bummer shoulda got david carr third day  
1  upset ca n't updat facebook text might cri result school today also blah  
2                           dive mani time ball manag save 50 rest go bound  
3                                           whole bodi feel itchi like fire  
4                                                 behav i 'm mad ca n't see  
with  (1592606, 2)
   sentiment  \
0          0   
1          0   
2          0   
3          0   
4          0   

                                                               tweet  
0                        aww bummer shoulda got david carr third day  
1  upset updat facebook text might cri result school today also blah  
2                    dive mani time ball manag save 50 rest go bound  
3                                  

In [11]:
# save to csv
df_with_apostrophe_stem.to_csv('sentiment140/sentiment140_stem_all_with_apostrophe.csv', index=False, header=False)

df_no_apostrophe_stem.to_csv('sentiment140/sentiment140_stem_all_no_apostrophe.csv', index=False, header=False)