In [1]:
# importing all necessary lib 
# using the twitter tweets samples from nltk package

#Resources :

#nltk.download('twitter_samples')
#The punkt module is a pre-trained model that helps you tokenize words and sentences
#nltk.download('punkt')
#wordnet is a lexical database for the English language that helps the script determine the base word. You need the averaged_perceptron_tagger resource to determine the context of a word in a sentence.
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
# find out which are the most common words using the FreqDist class of NLTK

import re,string,random
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples ,stopwords
from nltk.tokenize import word_tokenize

In [2]:
#Next, create variables for positive_tweets, negative_tweets
#The strings() method of twitter_samples returns all of the tweets within a dataset as a list of strings
# Stored as a set: remove_noise() tests every token for membership, and a set
# lookup is constant-time whereas scanning the original list is linear.
stop_words = set(stopwords.words('english'))
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

# The first step in making sense of the data is tokenization: splitting strings into
# smaller parts called tokens. A token is a sequence of characters in text that serves as a unit.
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [3]:
# Step 3 — Normalizing the Data
# Words have different forms — for instance, “ran”, “runs”, and “running” are various forms of the same verb, “run”.
# Normalization in NLP is the process of converting a word to its canonical form.

# Two common normalization techniques: stemming and lemmatization.
# Print every token of the first positive tweet paired with its part-of-speech tag.
print(pos_tag(tweet_tokens[0]))

def lemmatizer_sentence(tokens):
    """Return the lemma of every token in ``tokens``, in order.

    Each token is part-of-speech tagged with ``pos_tag``; the tag decides which
    WordNet category is handed to the lemmatizer:

    * tags starting with ``NN`` -> ``'n'`` (noun)
    * tags starting with ``VB`` -> ``'v'`` (verb)
    * everything else           -> ``'a'`` (adjective)

    Parameters
    ----------
    tokens : list of str
        The tokens of one sentence / tweet.

    Returns
    -------
    list of str
        One lemmatized token per input token.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    def wordnet_category(treebank_tag):
        # Collapse the tagger's fine-grained tags into the three categories used here.
        if treebank_tag.startswith('NN'):
            return 'n'
        if treebank_tag.startswith('VB'):
            return 'v'
        return 'a'

    return [
        wordnet_lemmatizer.lemmatize(word, wordnet_category(tag))
        for word, tag in pos_tag(tokens)
    ]

# Lemmatize the first positive tweet; as the last expression in the cell, the result is displayed.
lemmatizer_sentence(tweet_tokens[0])
# To look up the meaning of the tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

# In general, if a tag starts with NN, the word is a noun, and if it starts with VB, the word is a verb.


[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'be',
 'top',
 'engage',
 'member',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

In [4]:
# Step 4 — Removing Noise from the Data

#Noise is any part of the text that does not add meaning or information to data.

'''
Hyperlinks - All hyperlinks in Twitter are converted to the URL shortener t.co. Therefore, keeping them in the text processing would not add any value to the analysis.

Twitter handles in replies - These Twitter usernames are preceded by a @ symbol, which does not convey any meaning.

Punctuation and special characters - While these often provide context to textual data, this context is often difficult to process. For simplicity, you will remove all punctuation and special characters from tweets.
'''

def remove_noise(tweet_tokens, stop_words=()):
    """Clean and normalize the tokens of a single tweet.

    For every token, in order:

    1. strip hyperlinks and ``@handle`` mentions,
    2. lemmatize using the token's part-of-speech tag
       (``NN*`` -> noun, ``VB*`` -> verb, anything else -> adjective),
    3. drop the token if it is empty, is found in ``string.punctuation``,
       or its lowercase form is in ``stop_words``,
    4. otherwise keep its lowercase form.

    Parameters
    ----------
    tweet_tokens : list of str
        The tokens of one tweet.
    stop_words : container of str, optional
        Lowercase words to discard; any object supporting ``in`` works.
        Defaults to an empty tuple (nothing is filtered as a stop word).

    Returns
    -------
    list of str
        The cleaned, lowercased tokens.
    """
    # Raw strings: the pattern contains ``\(`` / ``\)``, which are invalid escape
    # sequences in a normal string literal and trigger warnings on newer Pythons.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    handle_pattern = re.compile(r"(@[A-Za-z0-9_]+)")

    # Loop-invariant: one lemmatizer serves the whole tweet.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub('', token)
        token = handle_pattern.sub('', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # ``token not in string.punctuation`` is a substring test on a str, so it
        # rejects single punctuation characters as well as any run of characters
        # that appears contiguously in ``string.punctuation``.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

# Sanity check on the first positive tweet: @handles and stop words should be absent from the result.
print(remove_noise(tweet_tokens[0],stop_words))


['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [5]:
# Pre-tokenized tweets: one list of tokens per tweet, for each sentiment class.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Run every tweet through remove_noise(), keeping one cleaned token list per tweet.
positive_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in negative_tweet_tokens
]

In [6]:
#Step 5 — Determining Word Density
#The most basic form of analysis on textual data is to take out the word frequency.

def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order.

    Parameters
    ----------
    cleaned_tokens_list : iterable of iterables
        One inner iterable of tokens per tweet.

    Yields
    ------
    Each token of each inner iterable, flattened into a single stream.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet
            
# Count how often each cleaned token occurs, separately per sentiment class.
# Each word generator is consumed by the FreqDist that is built from it.
all_positive_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_positive_words)

all_negative_words = get_all_words(negative_cleaned_tokens_list)
freq_dist_neg = FreqDist(all_negative_words)

# Show the ten most frequently occurring tokens of each class.
print("top positive words : ",freq_dist_pos.most_common(10))
print("\ntop negative words : ",freq_dist_neg.most_common(10))

top positive words :  [(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]

top negative words :  [(':(', 4585), (':-(', 501), ("i'm", 343), ('...', 332), ('get', 325), ('miss', 291), ('go', 275), ('please', 275), ('want', 246), ('like', 218)]


In [7]:
#Step 6 — Preparing Data for the Model

'''
Converting Tokens to a Dictionary

First, you will prepare the data to be fed into the model. 
You will use the Naive Bayes classifier in NLTK to perform the modeling exercise. 
Notice that the model requires not just a list of words in a tweet, but a Python dictionary with words as keys and True as values. 
The following generator function converts the cleaned data into that format.'''

def get_tweets_for_model(cleaned_tokens_list):
    """Lazily turn each token list into a ``{token: True}`` feature dict.

    Parameters
    ----------
    cleaned_tokens_list : iterable of iterables
        One inner iterable of (hashable) tokens per tweet.

    Yields
    ------
    dict
        For each tweet, a dict whose keys are the tweet's distinct tokens and
        whose values are all ``True``.
    """
    for tokens in cleaned_tokens_list:
        yield dict.fromkeys(tokens, True)
    
# Materialize the generators as lists so the feature dicts can be iterated more
# than once. A bare generator is exhausted after its first pass, which would make
# any later cell that consumes it produce an empty result when re-run on its own.
positive_tokens_for_model = list(get_tweets_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_tweets_for_model(negative_cleaned_tokens_list))

In [8]:
#Splitting the Dataset for Training and Testing the Model

# Fixed seed so the shuffle, and with it the train/test split and the reported
# accuracy, is the same on every run.
SPLIT_SEED = 42
# Number of shuffled examples used for training; the remainder is held out for testing.
TRAIN_SIZE = 7000

# Attach the class label to every feature dict.
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# A dedicated Random instance keeps the shuffle deterministic without altering
# the state of the module-level random generator.
random.Random(SPLIT_SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

In [9]:
#Step 7 — Building and Testing the Model

'''Finally, you can use the NaiveBayesClassifier class to build the model. 
Use the .train() method to train the model and the .accuracy() method to test the model on the testing data.'''

# Fit the Naive Bayes model on the training split.
classifier = NaiveBayesClassifier.train(train_data)

# Evaluate on the held-out split.
print("Accuracy is : ",classify.accuracy(classifier,test_data))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping the call in print() would only add a stray "None" line.
classifier.show_most_informative_features(10)

Accuracy is :  0.996
Most Informative Features
                      :( = True           Negati : Positi =   2066.0 : 1.0
                followed = True           Negati : Positi =     22.9 : 1.0
                follower = True           Positi : Negati =     21.9 : 1.0
                     sad = True           Negati : Positi =     19.8 : 1.0
                    glad = True           Positi : Negati =     19.1 : 1.0
                    luck = True           Positi : Negati =     15.1 : 1.0
                     x15 = True           Negati : Positi =     14.9 : 1.0
              appreciate = True           Positi : Negati =     14.4 : 1.0
                  arrive = True           Positi : Negati =     13.7 : 1.0
                    miss = True           Negati : Positi =     12.3 : 1.0
None


In [10]:
#Next, you can check how the model performs on random tweets from Twitter

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

# Clean the tweet the same way the training tweets were cleaned, including
# stop-word removal, so the features match what the classifier was trained on.
# NOTE(review): the training tweets were tokenized via twitter_samples.tokenized(),
# whereas this uses word_tokenize(); the two may split emoticons and hashtags
# differently — confirm whether that matters for your inputs.
custom_token = remove_noise(word_tokenize(custom_tweet), stop_words)

# The classifier expects a {token: True} feature dict.
print(classifier.classify({token: True for token in custom_token}))

Negative


In [11]:
custom_tweet = "Congrats #SportStar on your 7th best goal from last season winning goal of the year :) #Baller #Topbin #oneofmanyworldies"

# Clean the tweet the same way the training tweets were cleaned, including
# stop-word removal, so the features match what the classifier was trained on.
# NOTE(review): the training tweets were tokenized via twitter_samples.tokenized(),
# whereas this uses word_tokenize(); the two may split emoticons and hashtags
# differently — confirm whether that matters for your inputs.
custom_token = remove_noise(word_tokenize(custom_tweet), stop_words)

# The classifier expects a {token: True} feature dict.
print(classifier.classify({token: True for token in custom_token}))

Positive
