# Natural Language Processing

In [1]:
import nltk

In [2]:
from nltk.corpus import twitter_samples

In [3]:
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [4]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')

In [5]:
len(positive_tweets)

5000

In [6]:
len(negative_tweets)

5000

In [7]:
negative_tweets[0]

'hopeless for tmr :('

In [9]:
from nltk.tokenize import TweetTokenizer

In [12]:
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

In [14]:
tweet_tokenizer.tokenize(positive_tweets[0])

['#followfriday',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

In [15]:
from nltk.tag import pos_tag

In [16]:
positive_tokens = [tweet_tokenizer.tokenize(x) for x in positive_tweets]

In [18]:
negative_tokens = [tweet_tokenizer.tokenize(x) for x in negative_tweets]

In [21]:
#pos_tag(positive_tokens[3857])

In [20]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [22]:
def lemmatize(token_list: list) -> list:
    lemmatized_tokens = []
    for token, tag in pos_tag(token_list):
        if tag.startswith('VB'):
            pos = 'v'
        elif tag.startswith('NN'):
            pos = 'n'
        else:
            pos = 'a'
        lemmatized_tokens.append(lemmatizer.lemmatize(token, pos))
    
    return lemmatized_tokens   

In [23]:
positive_tokens[0]

['#followfriday',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

In [29]:
test = lemmatize(positive_tokens[0])

In [26]:
from nltk.corpus import stopwords

In [27]:
stop_words = stopwords.words('english')

In [28]:
def clean_data(token_list: list, stop_words: list) -> list:
    trim_characters = ['!', '.', ',', '*', '&', '%', '$', '?', '""']
    remove_characters = ['@', '#', '//']
    cleaned_tokens = []
    
    for token in token_list:
        for character in trim_characters:
            if character in token:
                token = token.strip(character)
        for character in remove_characters:
            if character in token:
                token = ''
                
        if len(token) > 0 and token.lower() not in stop_words:
            cleaned_tokens.append(token)
    
    return cleaned_tokens

In [30]:
clean_data(test, stop_words)

['top', 'engage', 'member', 'community', 'week', ':)']

In [31]:
positive_tweets[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [32]:
def prepare_data(all_tokens: list, stop_words: list):
    cleaned_data = []
    for token_list in all_tokens:
        cleaned_tokens = clean_data(token_list, stop_words)
        lemmatized_tokens = lemmatize(cleaned_tokens)
        cleaned_data.append(cleaned_tokens)
    
    return cleaned_data

In [36]:
positive_cleaned = prepare_data(positive_tokens, stop_words)
negative_cleaned = prepare_data(negative_tokens, stop_words)

In [37]:
def convert_to_dict(all_tokens: list) -> dict:
    for token_list in all_tokens:
        yield dict([token, True] for token in token_list)

In [40]:
def prepare_data_for_model(positive_tokens: list, negative_tokens: list) -> list:
    positive_tweets = [(positive_dict, "Positive") for positive_dict in convert_to_dict(positive_tokens)]
    negative_tweets = [(negative_dict, "Negative") for negative_dict in convert_to_dict(negative_tokens)]
    
    return positive_tweets + negative_tweets

In [41]:
data = prepare_data_for_model(positive_cleaned, negative_cleaned)

In [43]:
import random

In [44]:
random.shuffle(data)

In [54]:
train = data[:round(len(data) * 0.3)]
test = data[round(len(data) * 0.3):]

In [46]:
from nltk import classify
from nltk import NaiveBayesClassifier

In [55]:
nbc = NaiveBayesClassifier.train(train)

In [56]:
classify.accuracy(nbc, test)

0.997

In [58]:
nbc.show_most_informative_features(10)

Most Informative Features
                      :( = True           Negati : Positi =    885.9 : 1.0
                      :) = True           Positi : Negati =    420.5 : 1.0
                     via = True           Positi : Negati =     14.9 : 1.0
                    miss = True           Negati : Positi =     14.3 : 1.0
                   enjoy = True           Positi : Negati =     12.3 : 1.0
                 arrived = True           Positi : Negati =     11.6 : 1.0
                   thank = True           Positi : Negati =     11.1 : 1.0
                     pls = True           Negati : Positi =     11.0 : 1.0
                    sick = True           Negati : Positi =     11.0 : 1.0
                   happy = True           Positi : Negati =      9.1 : 1.0


In [66]:
test_tweet = "my cute, non-menacing son"

In [67]:
test_tokens = tweet_tokenizer.tokenize(test_tweet)

In [68]:
trim_characters = ['#', '!', '.', ',', '*', '&', '%', '$', '?','"']
remove_characters = ['@', '#', '//']
cleaned_tokens = []
    
for token in test_tokens:
    for character in trim_characters:
        if character in token:
            token = token.strip(character)
    for character in remove_characters:
        if character in token:
            token = ''
    if len(token) > 0 and token.lower() not in stop_words:
        cleaned_tokens.append(token)

In [69]:
nbc.classify(dict([token, True] for token in cleaned_tokens))

'Positive'