## This notebook walks you through using the NLTK library to preprocess Twitter data for sentiment analysis

In [2]:
import re
import string
import nltk
import random
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

## download data

In [6]:
# Fetch the "twitter_samples" corpus package with NLTK's downloader; the
# twitter_samples.strings(...) calls later in this notebook read from it.
nltk.download("twitter_samples")

In [8]:
# Load the tweet texts from the two labelled files of the twitter_samples corpus.
# Each result is indexed like a list of strings further down (e.g. positive_samples[2277]).
positive_samples = twitter_samples.strings("positive_tweets.json")
negative_samples = twitter_samples.strings("negative_tweets.json")

## download stop words

In [9]:
# Fetch the "stopwords" corpus package into the local nltk_data directory;
# stopwords.words("english") below needs it.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/deepak/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Remove hyperlinks, hashtag signs (#) and the retweet marker "RT", all of which occur commonly in tweets

In [13]:
# Pick one example tweet to walk through the cleaning steps.
tweet = positive_samples[2277]

# Patterns are applied in this order; every match is replaced by the empty string.
cleanup_patterns = (
    r'^RT[\s]+',             # old-style retweet text "RT" at the very start of the tweet
    r'https?://[^\s\n\r]+',  # hyperlinks: http:// or https:// up to the next whitespace
    r'#',                    # only the hash sign itself; the hashtag word is kept
)

tweet2 = tweet
for pattern in cleanup_patterns:
    tweet2 = re.sub(pattern, '', tweet2)

## tokenize the tweets into tokens or words

In [15]:
# Configure the tweet-aware tokenizer.
tokenizer = TweetTokenizer(
    preserve_case=False,  # lower-case the tokens
    strip_handles=True,   # drop @username handles
    reduce_len=True,      # shorten long runs of a repeated character
)
tweet_tokens = tokenizer.tokenize(tweet2)

# Show the cleaned text next to the tokens produced from it.
print(tweet2)
print(tweet_tokens)

My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… 
['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']


## remove stop words

In [18]:
stop_words_english = stopwords.words("english")

# Keep a token only when it is neither an English stop word nor found in
# string.punctuation (ASCII punctuation only, so a token like '…' stays).
tweet_clean = [
    word
    for word in tweet_tokens
    if not (word in stop_words_english or word in string.punctuation)
]

print("removed words")
print(tweet_clean)

removed words
['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']


## Stemming: use the Porter stemmer to replace each word with its stem, so that related word forms share a common token

In [20]:
stemmer = PorterStemmer()

# Map every remaining token to its Porter stem, preserving token order.
tweets_stem = [stemmer.stem(word) for word in tweet_clean]

print("stemmed words:")
print(tweets_stem)

stemmed words:
['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']


## I made a `process_tweet` function that you can use to preprocess tweets

In [21]:
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps, in order:
      1. delete stock-ticker style tokens (``$`` followed by word characters),
      2. delete an old-style retweet marker ``RT`` at the start of the text,
      3. delete ``http``/``https`` hyperlinks,
      4. delete ``#`` signs (the hashtag word itself is kept),
      5. tokenize with ``TweetTokenizer`` (``preserve_case=False``,
         ``strip_handles=True``, ``reduce_len=True``),
      6. drop English stop words and punctuation tokens,
      7. stem every remaining token with the Porter stemmer.

    Args:
        tweet (str): raw text of one tweet.

    Returns:
        list[str]: stemmed tokens that survived filtering, in original order.
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the corpus returns a list,
    # which would otherwise be scanned linearly for every token.
    stopwords_english = set(stopwords.words('english'))

    tweet = re.sub(r'\$\w*', '', tweet)                # stock tickers such as $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet)             # old-style retweet text "RT"
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)                    # only the hash sign

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # NOTE: string.punctuation is a str, so `word in string.punctuation` is a
    # substring test against the ASCII punctuation characters. Non-ASCII marks
    # such as '…' are therefore kept.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]