# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('universal_tagset')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/garrethlee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/garrethlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/garrethlee/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/garrethlee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrethlee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/garrethlee/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

# Initial Data Loading

In [2]:
columns=['sentiment', 'id', 'date', 'flag', 'user', 'tweet']
data = pd.read_csv('data/tweets.csv', encoding='latin-1', names=columns)

data.head()

Unnamed: 0,sentiment,id,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Basic Data Cleaning

We'll only need the actual tweet, other columns such as usernames, flags, dates, and id are irrelevant at this level

In [3]:
#Drop unecessary columns
data = data.drop(columns=['id','date','flag','user'])

# Original dataset has a scale of (4 - positive, 0 - negative), we will replace this for clarity
data['sentiment'] = data['sentiment'].replace({4:1}) 

# An overview
data.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Then, we can clean up the tweets using regular expressions to remove formatting errors and tags

In [4]:
def remove_unecessary(tweet):
    """Removes whitespace and non-essential characters from tokens"""
    tweet = re.sub(r"((www.)?https?:\/\/)?[^\s]*\.([\w]{2,3})(\/\w*)*", "", tweet) #removes links
    tweet = re.sub(r"(RT )?@\w+:?", "", tweet) #removes RT and @
    tweet = re.sub(r"[^(a-zA-Z|\')]", " ", tweet)
    tweet = ' '.join(tweet.split()) #removes whitespace from text
    return tweet

data['tweet'] = data['tweet'].apply(remove_unecessary)

In [5]:
data

Unnamed: 0,sentiment,tweet
0,0,Awww that's a bummer You shoulda got David Car...
1,0,is upset that he can't update his Facebook by ...
2,0,I dived many times for the ball Managed to sav...
3,0,my whole body feels itchy and like its on fire
4,0,no it's not behaving at all i'm mad why am i h...
...,...,...
1599995,1,Just woke up Having no school is the best feel...
1599996,1,Very cool to hear old Walt interviews bmta
1599997,1,Are you ready for your MoJo Makeover Ask me fo...
1599998,1,Happy th Birthday to my boo of alll time Tupac...


# Tokenizing the data

Tokenizing is splitting strings to smaller parts called `tokens`, which will help when categorizing certain words to categories (nouns, adjectives, same words with different tenses, etc.)

In [78]:
from nltk.tokenize import word_tokenize

# Split dataset into positive tweets and negative tweets
positive_tweets = list(data[data['sentiment'] == 1]['tweet'])
negative_tweets = list(data[data['sentiment'] == 0]['tweet'])

# Tokenize each tweet (aka split each tweet into smaller subsets)
positive_tokens = list(map(lambda tweet: word_tokenize(tweet), positive_tweets[:300000]))
negative_tokens = list(map(lambda tweet: word_tokenize(tweet), negative_tweets[:300000]))

# Normalization

Per [Wikipedia](en.wikipedia.org/wiki/Text_normalization), the act of reducing a word to its simplest form. Words like catch, catching and caught are reduced to its bare bones, in this case 'catch'.

**Stemming**, on the other hand, is removing affixes from words.

**Lemmatization** is grouping several words to be analyzed under one group.

These are all popular techniques in NLP, but picking between one or the other is down to your preference for speed or accuracy.

___

For this dataset, we will first get each tweet's word tags using nltk's `pos_tag` function, then group words with similar word tags (nouns, verbs, and adjectives) as one group (*lemmatization*)

In [79]:
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

def lemmatize_tweet(t:list) -> list:
    """Strips down words in a tweet into its simplest grammatical form"""
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(t):
        if tag[:2] in ("NN", "PRP"):
            pos = "n"
        elif tag[0] == "V":
            pos = "v"
        else:
            pos = "a"
        lemmatized_sentence.append(lemmatizer.lemmatize(word.lower(), pos))
    return lemmatized_sentence

lemmatized_positive_tokens = list(map(lemmatize_tweet, positive_tokens))
lemmatized_negative_tokens = list(map(lemmatize_tweet, negative_tokens))

# Removing Noise

We remove 'stopwords' to further simplify the tokens

In [80]:
stop_words = [word for word in stopwords.words('english') if word != "not"]

def clean_token(token:list) -> list:
    """Removes stopwords from the given token"""
    return [word.lower() for word in token if word.lower() not in stop_words]

In [81]:
cleaned_positive_tokens = list(map(clean_token,lemmatized_positive_tokens))
cleaned_negative_tokens = list(map(clean_token,lemmatized_negative_tokens))

# General Workflow

We have a tweet in text form, we first:

1. **Tokenize** the tweet (split into subparts)
2. **Lemmatize** the tweet (boil down to simplest word)
3. **Clean** the tweet (remove stopwords and other noise)

In [148]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield {token:True for token in tweet_tokens}

positive_tokens_for_model = get_tweets_for_model(cleaned_positive_tokens)
negative_tokens_for_model = get_tweets_for_model(cleaned_negative_tokens)

In [150]:
import random

# Format the dataset to have its according label
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset

# Shuffle the dataset to maintain model objectivity
random.shuffle(dataset)

# Set 90:10 as training:testing ratio
train_data = dataset[:550000]
test_data = dataset[550000:]

In [138]:
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))

Accuracy is: 0.75726
Most Informative Features
                  asthma = True           Negati : Positi =     32.3 : 1.0
               depressed = True           Negati : Positi =     31.8 : 1.0
                coughing = True           Negati : Positi =     31.7 : 1.0
                  boohoo = True           Negati : Positi =     28.2 : 1.0
                  bummed = True           Negati : Positi =     28.1 : 1.0
                 electro = True           Positi : Negati =     27.0 : 1.0
               heartburn = True           Negati : Positi =     27.0 : 1.0
                  unwell = True           Negati : Positi =     27.0 : 1.0
                hayfever = True           Negati : Positi =     26.5 : 1.0
                  booooo = True           Negati : Positi =     26.4 : 1.0
None


In [153]:
import pickle

with open("naivebayes_model.h5", "wb") as f:
    pickle.dump(classifier, f)

In [129]:
def tweet_pipeline(tweet):
    token = word_tokenize(tweet)
    lemmatized_token = lemmatize_tweet(token)
    cleaned_token = clean_token(lemmatized_token)
    return dict((t.lower(),True) for t in cleaned_token)

In [89]:
from collections import Counter
import requests
import pandas as pd
import json
import ast
import yaml
import re

def create_url(user):
    """Creates request URL for the specified username"""
    username = user
    url = f"https://api.twitter.com/2/tweets/search/recent?query=from:{username}&max_results=100"
    return url

def get_token():
    with open("config.yaml") as f:
        data = yaml.safe_load(f)
        return data['twitter_api']['bearer_token']

def clean_tweets(tweets):
    """Removes whitespace and non-essential characters from tweets"""
    for i in range(len(tweets)):
        tweet = tweets[i]
        tweet = re.sub(r"((www.)?https?:\/\/)?[^\s]*\.([\w]{2,3})(\/\w*)*", "", tweet) #removes links
        tweet = re.sub(r"(RT )?@\w+:?", "", tweet) #removes RT and @
        tweet = re.sub(r"[^(a-zA-Z|\')]", " ", tweet)
        tweet = ' '.join(tweet.split()) #removes whitespace from text
        tweets[i] = tweet
    final_tweets = list(filter(lambda tweet: tweet != "", tweets))
    return final_tweets


def run(user="G2Jankos"):
    """Requests from Twitter API"""
    url = create_url(user)
    bearer_token = get_token()
    headers = {"Authorization": f"Bearer {bearer_token}"}
    response = requests.get(url, headers=headers)
    d = json.loads(response.text)
    try:
        tweets = [entry['text'] for entry in d['data']]
        cleaned_tweets = clean_tweets(tweets)
        return cleaned_tweets
    except KeyError as e:
        print('Username not found!')

In [130]:
def get_sentiments(user=None):
    tweets = run(user)
    total_pos = 0
    for tweet in tweets[:10]:
        token = tweet_pipeline(tweet)
        sentiment = classifier.prob_classify(token)
        pos = sentiment.prob('Positive')
        total_pos += pos
        print(f"Tweet: {tweet}\nNegative: {sentiment.prob('Negative')}\nPositive:{pos}\n")
    print(total_pos / 10)

Tweet: Ayyyyyeeeee That s cause is a BEAST Congrats QB
Negative: 0.3678654890484291
Positive:0.6321345109515714

Tweet: FACTS FACTS FACTS It literally makes ABSOLUTELY ZERO SENSE They say if common sense was common then we d all have it Ain t that the truth FreeKyrie
Negative: 0.2822793578434904
Positive:0.7177206421565137

Tweet: Coach POP CONGRATULATIONS ALL TIME WINS
Negative: 0.05164349473645165
Positive:0.9483565052635501

Tweet: I love that the refs let KD and JE talk that talk to each other and didn t TECH them up That s DOPE Understanding the assignment
Negative: 0.18738887806514876
Positive:0.8126111219348497

0.3110822780306485
