# Environment setup

In [1]:
# import required modules
import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples

import re
import pickle
import matplotlib
import random

from nltk import classify
from nltk import NaiveBayesClassifier


In [2]:
# Download module packages
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('twitter_samples')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielkalemi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielkalemi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danielkalemi/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/danielkalemi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/danielkalemi/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

# Required functions

In [3]:
# functions that cleans and applies lemmatization
# input: list of tokens
# Output: list of tokens 
def normalize_tweet(tweet_tokens):
    clean_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub('(@[A-Za-z0-9_]+)','', token)
        token = re.sub('(#[A-Za-z0-9_]+)','', token)

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        
        clean_tokens.append(token)

    return clean_tokens


In [4]:
# function that removes stopwords and normalizes
# input: list of tokenized tweets (list of tokens) i.e. list of lists , list of stopwords
# output: list of clean tokens

def clean_sample_tokens(tokens, stpwrds):
    clean=[]
    for t in tokens:
        # get list of salient words
        salient_words = [word for word in t if (word not in stpwrds)]

        cc = normalize_tweet(salient_words)
        sw = [w for w in cc if len(w)>1]

        clean.append(sw)
    return clean

In [5]:
# generator function that yields a dictionary as an input for NaiveBayesClassifier
# input: list of tokens
# output: dictionary of [token, bool]
def generate_token_dict (tokens_list):
    for tokens in tokens_list:
        yield dict([token, True] for token in tokens)
        

# Clean Training and Testing Datasets

In [6]:
# tokenize positive and negative samples
# nltk gives us a function to do just twitter sample tokenizing
positive_tweets = twitter_samples.tokenized('positive_tweets.json')
negative_tweets = twitter_samples.tokenized('negative_tweets.json')


In [7]:
positive_tweets[:2]

[['#FollowFriday',
  '@France_Inte',
  '@PKuchly57',
  '@Milipol_Paris',
  'for',
  'being',
  'top',
  'engaged',
  'members',
  'in',
  'my',
  'community',
  'this',
  'week',
  ':)'],
 ['@Lamb2ja',
  'Hey',
  'James',
  '!',
  'How',
  'odd',
  ':/',
  'Please',
  'call',
  'our',
  'Contact',
  'Centre',
  'on',
  '02392441234',
  'and',
  'we',
  'will',
  'be',
  'able',
  'to',
  'assist',
  'you',
  ':)',
  'Many',
  'thanks',
  '!']]

In [8]:
# import stopwords into a list 
stpwrds = stopwords.words('english')

In [9]:
# Clean positive and Negative tweets by calling clean_sample_tokens(tokens, stpwrds)
clean_pos = clean_sample_tokens(positive_tweets, stpwrds)
clean_neg = clean_sample_tokens(negative_tweets, stpwrds)


In [10]:
clean_pos[:2]

[['top', 'engage', 'member', 'community', 'week', ':)'],
 ['Hey',
  'James',
  'How',
  'odd',
  ':/',
  'Please',
  'call',
  'Contact',
  'Centre',
  '02392441234',
  'able',
  'assist',
  ':)',
  'Many',
  'thanks']]

# Prepare/Transform Training and Testing Datasets for ML

In [11]:
# prepare positive and negative dictionaries for model
pos_dict = generate_token_dict(clean_pos)
neg_dict = generate_token_dict(clean_neg)


In [12]:
# Label the positive and negative datasets accordingly
pos_dataset = [(pos,'Positive') for pos in pos_dict]
neg_dataset = [(neg,'Negative') for neg in neg_dict]


In [13]:
# Build the final ML dataset
fullset = pos_dataset + neg_dataset

# dataset is ordered --> positives and then negatives; hence will shuffle to remove order
random.shuffle(fullset)

In [14]:
# Split dataset into training and test sets based on a 70:30 ratio
cutoff = round(len(fullset)*0.7)
trainset = fullset[:cutoff]
testset = fullset[cutoff:]


# Build Naive Bayes Classifier Model

In [15]:
# Use NaiveBayesClassifier class
# Use .train() to train the training dataset 
# Use .accuracy() to test our model on the test set

classifier = NaiveBayesClassifier.train(trainset)

# check model accuracy by applying it to the test set
print('Accuracy: ', classify.accuracy(classifier, testset))

Accuracy:  0.997


# Prep our Tweets for ML

In [16]:
# unpickle our tweets file
with open('tweets.pickle','rb') as f:
    tweets = pickle.load(f)


In [17]:
# tokenize each tweet
tweet_tokens=[]
for tweet in tweets:
    tweet_tokens.append(nltk.word_tokenize(tweet))

In [18]:
# clean our tweets
clean_tweets = clean_sample_tokens(tweet_tokens, stpwrds)

# Apply the model to our list of clean tweet tokens

In [19]:
# create the required dictionary
tweet_dict = generate_token_dict(clean_tweets)

# score board
scores = {'positives':0, 'negatives':0}

# classify each tweet and update the score board
for td in tweet_dict:
    sentiment = classifier.classify(td)
    if sentiment == 'Positive':
        scores['positives']+=1
    elif sentiment == 'Negative':
        scores['negatives']+=1
        


In [20]:
pos = float(scores['positives'])
neg = float(scores['negatives'])

tot = pos+neg
print(f'{round(pos/tot,2)*100}% positives and {round(neg/tot,2)*100}% negative')

67.0% positives and 33.0% negative
