# Business Understanding

# Data Engineering

## Import Libraries

In [58]:
# import libraries required to load, transform, analyze and plot data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(context='paper', style='darkgrid', 
        rc={'figure.facecolor':'white'}, font_scale=1.2)

import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
import en_core_web_sm
from spacy.tokenizer import _get_regex_pattern
from sklearn.model_selection import train_test_split
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D

In [22]:
# remove scientific notation and restrictions on df rows/columns display
# NOTE: every option is addressed by its full 'display.*' key. Abbreviated
# keys such as 'max_rows' are resolved by pattern matching and raise
# OptionError as soon as they match more than one registered option.
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 150)

## Dataframe Basics

### Load File

In [23]:
# Read the labelled-tweet CSV into a DataFrame. header=0 discards the file's
# own header row and names= substitutes the shorter column names below.
# NOTE(review): the 'unicode_escape' codec decodes the bytes as Latin-1 and
# expands backslash escapes; it does not strip non-ASCII characters.
col_names = ['tweet_text', 'directed_at', 'emotion_label']
tweets = pd.read_csv(
    'data/judge-1377884607_tweet_product_company.csv',
    names=col_names,
    header=0,
    encoding='unicode_escape',
)
tweets.head()


Unnamed: 0,tweet_text,directed_at,emotion_label
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion


### Data Values

In [24]:
# Summarise column dtypes and non-null counts to spot missing values
# before any cleaning is applied
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tweet_text     9092 non-null   object
 1   directed_at    3291 non-null   object
 2   emotion_label  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [25]:
# Drop rows whose tweet_text is missing. This modifies `tweets` in place and
# does not reset the index, so the labels of dropped rows leave gaps.
tweets.dropna(subset = ['tweet_text'], inplace=True)
# display (rows, columns) to confirm how many rows remain
tweets.shape

(9092, 3)

In [26]:
# Share of tweets per `directed_at` value; dropna=False keeps tweets with no
# recorded target as their own NaN bucket
print(tweets['directed_at'].value_counts(normalize=True, dropna=False))

NaN                               0.638
iPad                              0.104
Apple                             0.073
iPad or iPhone App                0.052
Google                            0.047
iPhone                            0.033
Other Google product or service   0.032
Android App                       0.009
Android                           0.009
Other Apple product or service    0.004
Name: directed_at, dtype: float64


In [27]:
# Create a coarse `brand` feature (Apple / Google / None) from the
# product-level `directed_at` column.
# The filled column is assigned back instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form
# acts on a temporary object under pandas Copy-on-Write and would leave
# `tweets` unchanged.
tweets['directed_at'] = tweets['directed_at'].fillna('None')
brand_map = {
    'iPad': 'Apple',
    'Apple': 'Apple',
    'iPad or iPhone App': 'Apple',
    'Google': 'Google',
    'iPhone': 'Apple',
    'Other Google product or service': 'Google',
    'Android App': 'Google',
    'Android': 'Google',
    'Other Apple product or service': 'Apple',
    'None': 'None',
}
# any directed_at value that is not a key of brand_map becomes NaN
tweets['brand'] = tweets['directed_at'].map(brand_map, na_action='ignore')

In [28]:
# Shorten the emotion labels to Neutral / Positive / Negative / Unknown.
# The result is assigned back to the column: replace(inplace=True) on a
# selected column is a chained in-place call that does not update `tweets`
# under pandas Copy-on-Write.
label_map = {'No emotion toward brand or product': 'Neutral',
             'Positive emotion': 'Positive',
             'Negative emotion': 'Negative',
             'I can\'t tell': 'Unknown'}
tweets['emotion_label'] = tweets['emotion_label'].replace(label_map)

# share of tweets per cleaned emotion label
print(tweets['emotion_label'].value_counts(normalize=True, dropna=False))

Neutral    0.593
Positive   0.328
Negative   0.063
Unknown    0.017
Name: emotion_label, dtype: float64


In [29]:
# Cross-tabulate brand against emotion label: number of tweets in each
# (brand, emotion_label) pair
tweets.groupby(by=['brand', 'emotion_label'])['tweet_text'].count()

brand   emotion_label
Apple   Negative          388
        Neutral            65
        Positive         1949
        Unknown             7
Google  Negative          131
        Neutral            26
        Positive          723
        Unknown             2
None    Negative           51
        Neutral          5297
        Positive          306
        Unknown           147
Name: tweet_text, dtype: int64

## Tweet Text Clean

In [30]:
# Build the list of punctuation characters to strip from tweets, leaving
# out '#' so that hashtags survive cleaning
punctuation = [char for char in string.punctuation if char != '#']

# show how many punctuation marks are kept and the first five of them
print(f'Punctuation Count: {len(punctuation)} Sample 5: {punctuation[0:5]}')

Punctuation Count: 31 Sample 5: ['!', '"', '$', '%', '&']


In [31]:
def remove_punctuation(x, punct=None):
    """
    Strip @mentions, URLs, stand-alone numbers and punctuation from a tweet.

    x: text to clean; any non-string value (e.g. NaN) is returned unchanged
    punct: optional iterable of characters to delete; when omitted, the
        notebook-level `punctuation` list is used
    returns: the cleaned string (whitespace left behind by the removed
        pieces is not collapsed)
    """
    # Explicit type guard instead of a bare `except: pass`: non-text cells
    # still pass through untouched, but genuine errors are no longer hidden.
    if not isinstance(x, str):
        return x
    drop_chars = set(punctuation if punct is None else punct)
    # handles may contain '_'; without it "@some_user" would leave "user"
    x = re.sub(r'@[A-Za-z0-9_]+', '', x) # remove @mention users
    x = re.sub(r'http\S+', '', x) # remove URL references
    x = re.sub(r'\b[0-9]+\b', '', x) # remove stand-alone numbers
    return ''.join(ch for ch in x if ch not in drop_chars) # remove punc

In [32]:
# function to clean text
def clean_text(df, text_field, new_text_field):
    """
    Add a cleaned version of a text column to df.

    The values of df[text_field] are lower-cased and passed through
    remove_punctuation; the result is stored under new_text_field.
    df is modified in place and the same object is returned.
    """
    lowered = df[text_field].str.lower()
    df[new_text_field] = lowered.apply(remove_punctuation)
    return df

In [33]:
# Add the cleaned text as `tweet_text_clean`. clean_text returns the frame it
# was given, so `tweets_clean` and `tweets` refer to the same DataFrame.
tweets_clean = clean_text(tweets, 'tweet_text', 'tweet_text_clean')
tweets_clean.head()

Unnamed: 0,tweet_text,directed_at,emotion_label,brand,tweet_text_clean
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative,Apple,i have a 3g iphone after hrs tweeting at #riseaustin it was dead i need to upgrade plugin stations at #sxsw
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive,Apple,know about awesome ipadiphone app that youll likely appreciate for its design also theyre giving free ts at #sxsw
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive,Apple,can not wait for #ipad also they should sale them down at #sxsw
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative,Apple,i hope this years festival isnt as crashy as this years iphone app #sxsw
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive,Google,great stuff on fri #sxsw marissa mayer google tim oreilly tech booksconferences amp matt mullenweg wordpress


In [39]:
# Load the small English spaCy pipeline and list its components
nlp = en_core_web_sm.load()
print(nlp.pipe_names)

# get default pattern for tokens that don't get split
# (presumably None when the language defaults define no token_match; the
# join below then skips it instead of embedding the text "None" -- TODO confirm)
default_token_match = _get_regex_pattern(nlp.Defaults.token_match)
# add your patterns (here: hashtags and in-word hyphens); raw strings keep
# '\w' as a regex escape rather than an invalid string escape
token_patterns = [default_token_match, r'#\w+', r'\w+-\w+']
re_token_match = f"({'|'.join(p for p in token_patterns if p)})"

# overwrite token_match function of the tokenizer
nlp.tokenizer.token_match = re.compile(re_token_match).match

['tagger', 'parser', 'ner']


In [40]:
# spaCy's default English stop-word collection, printed for reference only
# (clean_token filters on each token's is_stop flag, not on this variable)
stops_sp = nlp.Defaults.stop_words
print(f'spaCy Stopword Count: {len(stops_sp)}')

def clean_token(doc):
    """
    Return the lemmas of the tokens in doc that are worth keeping.

    Stop words, punctuation, digit-only tokens and whitespace tokens are
    skipped; the remaining lemmas are returned in document order.
    """
    lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct or token.is_digit or token.is_space:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Lemmatise every cleaned tweet. nlp.pipe streams the texts through the
# pipeline in batches and yields Docs in input order, avoiding a separate
# nlp() call per row.
tweets['tokens_sp'] = [clean_token(doc)
                       for doc in nlp.pipe(tweets['tweet_text_clean'].apply(str))]

tweets.head()

spaCy Stopword Count: 326


Unnamed: 0,tweet_text,directed_at,emotion_label,brand,tweet_text_clean,tokens_sp
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative,Apple,i have a 3g iphone after hrs tweeting at #riseaustin it was dead i need to upgrade plugin stations at #sxsw,"[g, iphone, hrs, tweet, #riseaustin, dead, need, upgrade, plugin, station, #sxsw]"
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive,Apple,know about awesome ipadiphone app that youll likely appreciate for its design also theyre giving free ts at #sxsw,"[know, awesome, ipadiphone, app, will, likely, appreciate, design, give, free, ts, #sxsw]"
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive,Apple,can not wait for #ipad also they should sale them down at #sxsw,"[wait, #ipad, sale, #sxsw]"
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative,Apple,i hope this years festival isnt as crashy as this years iphone app #sxsw,"[hope, year, festival, not, crashy, year, iphone, app, #sxsw]"
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive,Google,great stuff on fri #sxsw marissa mayer google tim oreilly tech booksconferences amp matt mullenweg wordpress,"[great, stuff, fri, #sxsw, marissa, mayer, google, tim, oreilly, tech, booksconference, amp, matt, mullenweg, wordpress]"


In [41]:
# Count how often each lemma occurs across all tweets
word_dict = {}

# Iterate over the token lists directly: Series.iteritems() was removed in
# pandas 2.0, and the index label it yielded was never used here
for tokens in tweets['tokens_sp']:
    for word in tokens:
        word_dict[word] = word_dict.get(word, 0) + 1

# most frequent first; sorted() is stable, so ties keep first-seen order
word_counts = sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)
print(f'Total words: {len(word_counts)}')
word_counts[:25]

Total words: 9533


[('#sxsw', 8947),
 ('link', 4300),
 ('rt', 2953),
 ('ipad', 2245),
 ('google', 2102),
 ('apple', 1826),
 ('store', 1501),
 ('iphone', 1258),
 ('new', 1093),
 ('app', 992),
 ('austin', 848),
 ('launch', 819),
 ('amp', 724),
 ('social', 623),
 ('popup', 600),
 ('today', 573),
 ('open', 514),
 ('not', 496),
 ('sxsw', 481),
 ('network', 472),
 ('line', 446),
 ('circle', 444),
 ('android', 435),
 ('#apple', 409),
 ('party', 380)]

# Data Analysis

# Predictive Models

In [42]:
# filter tweets for identifiable emotions only (drop unknown)
sentiments = ['Positive', 'Negative', 'Neutral']
tweets_f = tweets[tweets['emotion_label'].isin(sentiments)]

# create X and y (one-hot encoded for 3 classes)
# dtype is pinned so the labels stay 0/1 integers on every pandas version:
# pandas >= 2.0 otherwise returns boolean dummy columns
X = tweets_f['tokens_sp']
y = pd.get_dummies(tweets_f['emotion_label'], dtype=np.uint8)
print(X.iloc[:3], y.iloc[:3])

0            [g, iphone, hrs, tweet, #riseaustin, dead, need, upgrade, plugin, station, #sxsw]
1    [know, awesome, ipadiphone, app, will, likely, appreciate, design, give, free, ts, #sxsw]
2                                                                   [wait, #ipad, sale, #sxsw]
Name: tokens_sp, dtype: object    Negative  Neutral  Positive
0         1        0         0
1         0        0         1
2         0        0         1


In [43]:
# Split into training and test sets (80% / 20%)
# SEED fixes the shuffle so the split is repeatable; stratify=y is passed
# the one-hot label frame so the split is stratified on the class labels
SEED = 19
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=SEED)
# report the resulting shapes as a sanity check
print(f'X_train: {X_train.shape} X_test: {X_test.shape} ' 
      f'y_train: {y_train.shape} y_test: {y_test.shape}')

X_train: (7148,) X_test: (1788,) y_train: (7148, 3) y_test: (1788, 3)


In [57]:
# Map each token to an integer id using a vocabulary fitted on the training
# tweets only, capped at num_words=20000 by token frequency
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)
tokenized_tweets = tokenizer.texts_to_sequences(X_train)
# pad/truncate every tweet to 25 ids so all rows have the same length
X_train_seq = sequence.pad_sequences(tokenized_tweets, maxlen=25) # longest 23
# NOTE(review): X_test is not converted in this cell; it would need the same
# tokenizer and maxlen before it can be scored

In [63]:
# Sentiment classifier: token ids -> embeddings -> LSTM -> max over time
# steps -> dense head ending in a 3-way softmax
embedding_size = 25
model = Sequential([
    Embedding(20000, embedding_size),   # 20000 matches the tokenizer's num_words
    LSTM(25, return_sequences=True),    # keep per-step outputs for the pooling layer
    GlobalMaxPool1D(),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])

In [64]:
# Configure training: categorical cross-entropy pairs with the one-hot
# labels and softmax output; accuracy is tracked as the reported metric
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [65]:
# print the layer-by-layer output shapes and parameter counts
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 25)          500000    
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 25)          5100      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 25)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                1300      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                

In [67]:
# Train for 3 epochs in batches of 32, holding out 15% of the training rows
# for validation.
# NOTE(review): the returned History object is not assigned, so per-epoch
# metrics are only available from the cell output
model.fit(X_train_seq, y_train, epochs=3, batch_size=32, validation_split=0.15)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6075 samples, validate on 1073 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f93b24ec910>