# Data Preparation

In [3]:
import nltk
from nltk.corpus import twitter_samples  # Samples twitter dataset for doing pre-processing

In [5]:
# Fetch the `twitter_samples` corpus into the local NLTK data directory;
# the call evaluates to True, which the notebook shows as the cell result.
nltk.download('twitter_samples') # Download the data

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\gopal\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [8]:
# Load the raw tweet texts for each sentiment class.
# Each variable is a sequence of strings, one entry per tweet.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [9]:
# Report how many tweets each class contains, to check the classes are balanced
print(f'Total number of positive tweets : {len(all_positive_tweets)}')
print(f'Total number of negative tweets : {len(all_negative_tweets)}')

Total number of positive tweets : 5000
Total number of negative tweets : 5000


In [13]:
# Show the first tweet of each class to get a feel for the raw text
print(all_positive_tweets[0]) # Positive sample
print(all_negative_tweets[0]) # Negative sample

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
hopeless for tmr :(


Four important pre-processing steps to follow:
* Tokenizing the string
* Lowercasing
* Removing stop words and punctuation
* Stemming

In [31]:
# Pick one tweet that exercises every pre-processing step: it contains
# hashtags, an emoticon and a hyperlink (see the printed text).
tweet = all_positive_tweets[2277]
print(tweet)

My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i


In [32]:
# Download the NLTK stop-word lists (needed later by `stopwords.words`)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gopal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
import re

# Strip Twitter-specific markup from the sample tweet, printing the text
# after every step so each substitution can be inspected on its own.
print("Original tweet : ",tweet)
# Remove hyperlinks. The pattern matches only the URL itself (everything up
# to the next whitespace). A greedy `.*` here would also delete any text
# that follows the link on the same line.
tweet2 = re.sub(r'https?://\S+','',tweet)
print("Without hyperlinks : ",tweet2)
# Remove hashtags: only the '#' sign is dropped, the tagged word is kept
tweet2 = re.sub(r'#','',tweet2)
print("Without hashtags : ",tweet2)
# Remove the retweet marker: a leading "RT" followed by whitespace
tweet2 = re.sub(r'^RT[\s]+','',tweet2)
print("Without retweets tag : ",tweet2)

Original tweet :  My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i
Without hyperlinks :  My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… 
Without hashtags :  My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… 
Without retweets tag :  My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… 


In [35]:
from nltk import TweetTokenizer # Tokenizer for tweets
from nltk.stem import PorterStemmer # Used for stemming

# Tokenize the string

In [37]:
# Tweet-aware tokenizer shared by the cells below.
# preserve_case=False yields lowercased tokens (visible in the tokenized
# output); handles such as @user are kept because strip_handles=False.
tokenizer = TweetTokenizer(
    preserve_case=False,
    reduce_len=True,
    strip_handles=False,
)

In [50]:
# Split the cleaned tweet text into a list of tokens and show the text
# before and after tokenization for comparison.
print(tweet2)
tweet_tokens = tokenizer.tokenize(tweet2)
print(tweet_tokens)

My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… 
['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']


# Remove stopwords and punctuation

In [54]:
import string
from nltk.corpus import stopwords

# English stop words from the NLTK corpus (a list of lowercase strings)
stopwords_english = stopwords.words('english')
# Show both filters that the next cell applies to the tokens:
# the ASCII punctuation string and the stop-word list.
print(string.punctuation)
print(stopwords_english)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only

In [55]:
# Keep only the tokens that are neither English stop words nor contained in
# string.punctuation. The punctuation check is a substring test against the
# ASCII punctuation string, so non-ASCII marks such as '…' are kept.
cleaned_tokens = [
    token
    for token in tweet_tokens
    if not (token in stopwords_english or token in string.punctuation)
]

In [56]:
# Inspect the tokens that survived stop-word and punctuation removal
print(cleaned_tokens)

['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']


# Stem the tokens

In [58]:
# Reduce every cleaned token to its Porter stem, preserving token order
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in cleaned_tokens]

In [59]:
# Inspect the final stemmed tokens produced by the pre-processing pipeline
print(stemmed_tokens)

['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']


That's it for the pre-processing