# Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import re
import string

from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer

from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Lasso
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report, plot_confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline as ImPipeline
from imblearn.over_sampling import SMOTE

In [2]:
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

In [3]:
from tensorflow.keras.utils import to_categorical

# Dataframe Initialization and Observations

In [4]:
# load data and initialize dataframe
df = pd.read_csv('./data/judge-1377884607_tweet_product_company.csv',
                 encoding="ISO-8859-1")
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [5]:
# check for nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [6]:
# count nulls
df.isnull().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

In [7]:
# view nulls vs subcategories
df['emotion_in_tweet_is_directed_at'].value_counts(dropna=False)

NaN                                5802
iPad                                946
Apple                               661
iPad or iPhone App                  470
Google                              430
iPhone                              297
Other Google product or service     293
Android App                          81
Android                              78
Other Apple product or service       35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [8]:
df['unprocessed_text'] = df['tweet_text'].str.split()
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,unprocessed_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[.@wesley83, I, have, a, 3G, iPhone., After, 3..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,"[@jessedee, Know, about, @fludapp, ?, Awesome,..."
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[@swonderlin, Can, not, wait, for, #iPad, 2, a..."
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[@sxsw, I, hope, this, year's, festival, isn't..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[@sxtxstate, great, stuff, on, Fri, #SXSW:, Ma..."
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,"[Ipad, everywhere., #SXSW, {link}]"
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,"[Wave,, buzz..., RT, @mention, We, interrupt, ..."
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,"[Google's, Zeiger,, a, physician, never, repor..."
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,"[Some, Verizon, iPhone, customers, complained,..."


### Summary

We have a large number of nulls in the data set. We will need to figure out how to handle these nulls in data cleaning in addition to the standard NLP data cleaning procedures such as RegEx, stopword removal, lemmatization,  and tokenization. 

# Data Cleanup

In [9]:
def process_string(text):
    """This function returns a processed list of words from the given text
    
    This function removes html elements and urls using regular expression, then
    converts string to list of workds, them find the stem of words in the list of words and
    finally removes stopwords and punctuation marks from list of words.
    
    Args:
        text(string): The text from which html elements, urls, stopwords, punctuation are removed and lemmatized
        
    Returns:
        clean_text(string): A text formed after text preprocessing.
    """
    # Remove twitter user handle from the text
    text = re.sub('@[^\s]+',
                  '',
                  str(text))
    
    # Remove any urls from the text
    text = re.sub(r'https:\/\/.*[\r\n]*',
                  "",
                  str(text))
    
    # Remove any urls starting from www. in the text
    text = re.sub(r'www\.\w*\.\w\w\w',
                  "",
                  str(text))
    
    # Remove any html elements from the text
    text = re.sub(r"<[\w]*[\s]*/>",
                  "",
                  str(text))
    
    # Remove prediods  marks
    text = re.sub(r"[\.]*",
                  "",
                  str(text))
    
    # Initialize RegexpTokenizer
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokenizer = RegexpTokenizer(pattern)

    # Tokenize text
    text_tokens = tokenizer.tokenize(text.lower())
    
    # Instantiate lemmatizer
    lemmatizer  = WordNetLemmatizer()
    
    # Get english stopwords
    english_stopwords = stopwords.words("english")
    new_list = ["mention", "sxsw", 'link', 'rt', 'quot', 'g']
    english_stopwords.extend(new_list)
    
    cleaned_text_tokens = [] # A list to hold cleaned text tokens
    
    for word in text_tokens:
        if((word not in english_stopwords) and # Remove stopwords
            (word not in string.punctuation)): # Remove punctuation marks
                
                lemmas = lemmatizer.lemmatize(word) # Get lemma of the current word
                cleaned_text_tokens.append(lemmas) # Appened lemma word to list of cleaned list
    
    # Combine list into single string
    clean_text = " ".join(cleaned_text_tokens)
    
    return clean_text

In [10]:
# apply cleanup function to tweet_text column
df['tweet_text'] = df['tweet_text'].apply(process_string)

In [11]:
df['preprocessed_text'] = df['tweet_text'].str.split()

In [12]:
# categorize Tweets by 'Manufacturer' based on if Tweet contains certain keywords words
# instantiate list of keywords
is_apple = ['ipad', 
            "ipad's", 
            'iphone', 
            'iphones', 
            "iphone's", 
            'apple', 
            "apple's", 
            'mac', 
            'macos' 
            'ios', 
            'os' 
            'macbook', 
            'macbook pro', 
            'm1', 
            'macbook air',
            'air',
            'airpod', 
            'airpods',
            'airtag'
            'watch'
            'monterey',
            'big sur',
            'catalina',
            'mojave',
            'high sierra',
            'sierra',
            'el capitan',
            'yosemite',
            'icloud']

is_google = ['windows', 
             'google', 
             "google's", 
             'googles' 
             'pixel', 
             "pixel's", 
             'pixels', 
             'android', 
             "android's", 
             "androids", 
             'nest']

In [13]:
# defining dictionary map and mapping
driver_operator_map = {
    'iPad': 'Apple',
    'Apple': 'Apple',
    'iPad or iPhone App': 'Apple',
    'iPhone': 'Apple',
    'Other Apple product or service1': 'Apple',
    
    'Google': 'Google',
    'Other Google product or service': 'Google',
    'Android App': 'Google',
    'Android': 'Google'}

df['Manufacturer'] = df['emotion_in_tweet_is_directed_at'].map(driver_operator_map)
df['Manufacturer'].value_counts(dropna=False)

NaN       5837
Apple     2374
Google     882
Name: Manufacturer, dtype: int64

In [14]:
# define functions that loop through keyword lists and assign a category in a new column in the df
def apple_sorter(x):
    for i in is_apple:
        if i.lower() in x.lower():
            return 'Apple'
        else:
            continue
        
def google_sorter(x):
    for i in is_google:
        if i.lower() in x.lower():
            return 'Google'
        else:
            continue

In [15]:
# apply apple and google sorters to tweet_text
df['Manufacturer'] = df['tweet_text'].apply(apple_sorter)
df['Google'] = df['tweet_text'].apply(google_sorter)

In [16]:
# merge created columns into 1 master column, 'Manufacturer'
df['Manufacturer'] = df['Manufacturer'].combine_first(df['Google'])

In [17]:
# drop extraneous columns
df.drop('Google', axis=1, inplace=True)
df.drop('emotion_in_tweet_is_directed_at', axis=1, inplace=True)

In [18]:
# recheck null counts
df['Manufacturer'].value_counts(dropna=False)

Apple     5577
Google    2749
NaN        767
Name: Manufacturer, dtype: int64

In [19]:
df.isnull().sum()

tweet_text                                              0
is_there_an_emotion_directed_at_a_brand_or_product      0
unprocessed_text                                        1
preprocessed_text                                       0
Manufacturer                                          767
dtype: int64

In [20]:
# # drop remaining NaN - does not contain major keywords related to Apple and Google
# df.dropna(inplace=True)

In [21]:
# # recheck null counts
# df.isnull().sum()

In [22]:
# # recheck null counts
# df['Manufacturer'].value_counts(dropna=False)

In [23]:
# check cleaned data
df.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,unprocessed_text,preprocessed_text,Manufacturer
0,iphone hr tweeting rise austin dead need upgra...,Negative emotion,"[.@wesley83, I, have, a, 3G, iPhone., After, 3...","[iphone, hr, tweeting, rise, austin, dead, nee...",Apple
1,know awesome ipad iphone app likely appreciate...,Positive emotion,"[@jessedee, Know, about, @fludapp, ?, Awesome,...","[know, awesome, ipad, iphone, app, likely, app...",Apple
2,wait ipad also sale,Positive emotion,"[@swonderlin, Can, not, wait, for, #iPad, 2, a...","[wait, ipad, also, sale]",Apple
3,hope year's festival crashy year's iphone app,Negative emotion,"[@sxsw, I, hope, this, year's, festival, isn't...","[hope, year's, festival, crashy, year's, iphon...",Apple
4,great stuff fri marissa mayer google tim o'rei...,Positive emotion,"[@sxtxstate, great, stuff, on, Fri, #SXSW:, Ma...","[great, stuff, fri, marissa, mayer, google, ti...",Google


In [24]:
# rename columns
df.rename(columns={'tweet_text':'Text', 'is_there_an_emotion_directed_at_a_brand_or_product' : 'Sentiment'}, inplace=True)

In [25]:
# check value counts for Sentiment column
df['Sentiment'].value_counts(dropna=False)

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: Sentiment, dtype: int64

In [26]:
# drop 'I can't tell' as this will not be relevant for modeling
df = df[df['Sentiment'] != "I can't tell"]

In [27]:
# final check
df['Sentiment'].value_counts(normalize=True)

No emotion toward brand or product    0.602999
Positive emotion                      0.333221
Negative emotion                      0.063780
Name: Sentiment, dtype: float64

In [28]:
df.drop(index=df.index[-1],
        axis=0,
        inplace=True)

df.tail()

Unnamed: 0,Text,Sentiment,unprocessed_text,preprocessed_text,Manufacturer
9087,yup third app yet i'm android suggestion cc,No emotion toward brand or product,"[@mention, Yup,, but, I, don't, have, a, third...","[yup, third, app, yet, i'm, android, suggestio...",Google
9088,ipad everywhere,Positive emotion,"[Ipad, everywhere., #SXSW, {link}]","[ipad, everywhere]",Apple
9089,wave buzz interrupt regularly scheduled geek p...,No emotion toward brand or product,"[Wave,, buzz..., RT, @mention, We, interrupt, ...","[wave, buzz, interrupt, regularly, scheduled, ...",Google
9090,google's zeiger physician never reported poten...,No emotion toward brand or product,"[Google's, Zeiger,, a, physician, never, repor...","[google's, zeiger, physician, never, reported,...",Google
9091,verizon iphone customer complained time fell b...,No emotion toward brand or product,"[Some, Verizon, iPhone, customers, complained,...","[verizon, iphone, customer, complained, time, ...",Apple


In [None]:
# np.unique(df['label'], return_counts=True)

In [None]:
df = df.reset_index()

# create a new data frame with "id" and "comment" fields
df_subset = df[['index', 'Sentiment']].copy()

# data clean-up
# remove all non-aphabet characters
df_subset['Sentiment'] = df_subset['Sentiment'].str.replace("[^a-zA-Z#]", " ")

# covert to lower-case
df_subset['Sentiment'] = df_subset['Sentiment'].str.casefold()
# print (df_subset.head(10))

# set up empty dataframe for staging output
df1 = pd.DataFrame()
df1['index'] = ['99999999999']
df1['sentiment_type']='NA999NA'
df1['sentiment_score']=0

print('Processing sentiment analysis...')
sid = SentimentIntensityAnalyzer()
t_df = df1

for index, row in df_subset.iterrows():
    scores = sid.polarity_scores(row[1])

    for key, value in scores.items():
        temp = [key, value, row[0]]
        df1['index'] = row[0]
        df1['sentiment_type'] = key
        df1['sentiment_score'] = value
        t_df = t_df.append(df1)

# remove dummy row with row_id = 99999999999
t_df_cleaned = t_df[t_df.index != '99999999999']

# remove duplicates if any exist
t_df_cleaned = t_df_cleaned.drop_duplicates()

# only keep rows where sentiment_type = compound
t_df_cleaned = t_df[t_df.sentiment_type == 'compound']

# print(t_df_cleaned.head(10))

# merge dataframes
df = pd.merge(df, t_df_cleaned, on='index', how='inner')

df.head()

In [None]:
df.drop(columns='sentiment_type', inplace=True)

In [None]:
df['length_tweet'] = df['Text'].str.len()

In [None]:
label_encoder = preprocessing.LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Sentiment'])
df.head()

In [None]:
# Set up figure and axes
fig, axes = plt.subplots(nrows=2, figsize=(14, 14))

# Empty dict to hold words that have already been plotted and their colors
plotted_words_and_colors = {}

# Establish color palette to pull from
# (If you get an error message about popping from an empty list, increase this #)
color_palette = sns.color_palette('crest', n_colors=44)

# Creating a plot for each unique genre
data_by_manufacturer = [y for _, y in df.groupby('Manufacturer', as_index=False)]
for index, manufacturer_df in enumerate(data_by_manufacturer):
    
    # Find top 10 words in this genre
    all_words_in_manufacturer = manufacturer_df.preprocessed_text.explode()
    top_10 = all_words_in_manufacturer.value_counts()[:10]
    
    # Select appropriate colors, reusing colors if words repeat
    colors = []
    for word in top_10.index:
        if word not in plotted_words_and_colors:
            new_color = color_palette.pop(0)
            plotted_words_and_colors[word] = new_color
        colors.append(plotted_words_and_colors[word])
    
    # Select axes, plot data, set title
    ax = axes[index]
    ax.bar(top_10.index, top_10.values, color=colors)
    ax.set_title(manufacturer_df.iloc[0].Manufacturer.title())
    
fig.tight_layout()

plt.savefig('images/apple_google_top_words.png');

In [None]:
twitter_mask = np.array(Image.open("./images/twitter_mask.png"))
twitter_mask

In [None]:
# creating a word cloud image with a twitter mask
english_stopwords = stopwords.words("english")
new_list = ["mention", "sxsw", 'link', 'rt', 'quot' 'g']
english_stopwords.extend(new_list)

In [None]:
# creating a corpus for positive tweets only 
apple = df[df.Manufacturer == "Apple"]
apple_corpus = apple.Text.to_list()
apple_corpus = ",".join(apple_corpus)

In [None]:
# word tokenize and lowercase all words 
apple_tokens = word_tokenize(apple_corpus)
apple_stopped = [token.lower() for token in apple_tokens if token.lower() not in english_stopwords]

In [None]:
# generating a wordcloud with a twitter mask for positive words only 
wordcloud = WordCloud(stopwords=english_stopwords,
                      collocations=False, 
                      mask=twitter_mask, 
                      background_color='white', 
                      width=1800,
                      height=1400, 
                      contour_color='green', 
                      contour_width=2)

wordcloud.generate(','.join(apple_stopped))

plt.figure(figsize=(14, 14), 
           facecolor=None)

plt.imshow(wordcloud, 
           interpolation='bilinear')

plt.title('Apple Tweet Cloud', 
          size=20)

plt.axis("off")

plt.savefig('images/apple_tweet_cloud');

In [None]:
# creating a corpus for positive tweets only 
google = df[df.Manufacturer == "Google"]
google_corpus = google.Text.to_list()
google_corpus = ",".join(google_corpus)

In [None]:
# word tokenize and lowercase all words 
google_tokens = word_tokenize(google_corpus)
google_stopped = [token.lower() for token in google_tokens if token.lower() not in english_stopwords]

In [None]:
# generating a wordcloud with a twitter mask for positive words only 
wordcloud = WordCloud(stopwords=english_stopwords,
                      collocations=False, 
                      mask=twitter_mask, 
                      background_color='white', 
                      width=1800,
                      height=1400, 
                      contour_color='green', 
                      contour_width=2)

wordcloud.generate(','.join(google_stopped))

plt.figure(figsize=(14, 14), 
           facecolor=None)

plt.imshow(wordcloud, 
           interpolation='bilinear')

plt.title('Google Tweet Cloud', 
          size=20)

plt.axis("off")

plt.savefig('images/google_tweet_cloud');

# Train/Test Split, Label Encoding, Vectorization

In [63]:
# train test split with test size = 10% and random_state for result interpretability
X = df.Text
y = df.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.10, 
                                                    random_state=42, 
                                                    stratify=y)

In [69]:
# TFIDF vectorize X_train and X_test
tf_idf = TfidfVectorizer(ngram_range=(1, 2))

X_train = tf_idf.fit_transform(X_train.tolist())
X_test = tf_idf.transform(X_test.tolist())

In [70]:
# check shapes
print(X_train.shape, X_test.shape)

(8042, 41396) (894, 41396)


In [71]:
# LabelEncode
label_encoder = preprocessing.LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [72]:
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

# Neural Networks

In [36]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import ModelCheckpoint

In [37]:
target = df.Sentiment
y_train = pd.get_dummies(target).values

In [38]:
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(df['Text']))
list_tokenized_headlines = tokenizer.texts_to_sequences(df['Text'])
X_train = sequence.pad_sequences(list_tokenized_headlines, maxlen=100)

In [44]:
model = Sequential()
embedding_size = 128
model.add(Embedding(20000, 
                    embedding_size))
model.add(LSTM(25, 
               return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(50, 
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, 
                activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])
checkpoint = ModelCheckpoint("best_model_lstm.hdf5", 
                             monitor='val_accuracy', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='auto', 
                             period=1,
                             save_weights_only=False)

print(model.summary())
history = model.fit(X_train,
                     y_train, 
                     epochs=25, 
                     validation_split=.1, 
                     callbacks=[checkpoint])

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 128)         2560000   
                                                                 
 lstm_2 (LSTM)               (None, None, 25)          15400     
                                                                 
 global_max_pooling1d_2 (Glo  (None, 25)               0         
 balMaxPooling1D)                                                
                                                                 
 dropout (Dropout)           (None, 25)                0         
                                                                 
 dense_4 (Dense)             (None, 50)                1300      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                      

Epoch 24/25
Epoch 25/25


In [None]:
model = Sequential()
embedding_size = 128
model.add(Embedding(20000, embedding_size))
model.add(layers.SimpleRNN(15, return_sequences=True))
model.add(layers.SimpleRNN(15))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])
checkpoint = ModelCheckpoint("best_model_rnn.hdf5", 
                             monitor='val_accuracy', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='auto', 
                             period=1,
                             save_weights_only=False)
print(model.summary())
history = model.fit(X_train,
                     y_train, 
                     epochs=25, 
                     validation_split=.1, 
                     callbacks=[checkpoint])

In [39]:
model = Sequential()
embedding_size = 128
model.add(Embedding(20000, embedding_size))
model.add(layers.Conv1D(20, 
                        6, 
                        activation='relu',
                        kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),
                        bias_regularizer=regularizers.l2(2e-3)))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(20, 
                        6, 
                        activation='relu',
                        kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),
                        bias_regularizer=regularizers.l2(2e-3)))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(3,
                       activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])
checkpoint = ModelCheckpoint("best_model_cnn.hdf5", 
                             monitor='val_accuracy', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='auto', 
                             period=1,
                             save_weights_only=False)
print(model.summary())
history = model.fit(X_train,
                     y_train, 
                     epochs=25, 
                     validation_split=.1, 
                     callbacks=[checkpoint])



In [43]:
model = Sequential()
embedding_size = 128
model.add(Embedding(20000, 
                    embedding_size))
model.add(layers.Bidirectional(layers.LSTM(20, 
                                           dropout=0.6)))
model.add(layers.Dense(3, 
                       activation='softmax'))
model.compile(optimizer='rmsprop', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])
checkpoint = ModelCheckpoint("best_model_bidir.hdf5", 
                             monitor='val_accuracy', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='auto', 
                             period=1,
                             save_weights_only=False)
print(model.summary())
history = model.fit(X_train,
                     y_train, 
                     epochs=25, 
                     validation_split=.1, 
                     callbacks=[checkpoint])

Epoch 1/25
Epoch 1: val_accuracy improved from -inf to 0.65213, saving model to best_model2.hdf5
Epoch 2/25
Epoch 2: val_accuracy improved from 0.65213 to 0.65548, saving model to best_model2.hdf5
Epoch 3/25
Epoch 3: val_accuracy improved from 0.65548 to 0.68009, saving model to best_model2.hdf5
Epoch 4/25
Epoch 4: val_accuracy did not improve from 0.68009
Epoch 5/25
Epoch 5: val_accuracy improved from 0.68009 to 0.68456, saving model to best_model2.hdf5
Epoch 6/25
Epoch 6: val_accuracy did not improve from 0.68456
Epoch 7/25
Epoch 7: val_accuracy did not improve from 0.68456
Epoch 8/25
Epoch 8: val_accuracy did not improve from 0.68456
Epoch 9/25
Epoch 9: val_accuracy did not improve from 0.68456
Epoch 10/25
Epoch 10: val_accuracy did not improve from 0.68456
Epoch 11/25
Epoch 11: val_accuracy did not improve from 0.68456
Epoch 12/25
Epoch 12: val_accuracy did not improve from 0.68456
Epoch 13/25
Epoch 13: val_accuracy did not improve from 0.68456
Epoch 14/25
Epoch 14: val_accuracy di

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers
from keras import optimizers

In [None]:
# ⏰ This cell may take about thirty seconds to run

# Raw text complaints
tweets = df['Text'] 

# Initialize a tokenizer 
tokenizer = Tokenizer(num_words=2000) 

# Fit it to the complaints
tokenizer.fit_on_texts(tweets) 

# Generate sequences
sequences = tokenizer.texts_to_sequences(tweets) 
print('sequences type:', type(sequences))

# Similar to sequences, but returns a numpy array
one_hot_results= tokenizer.texts_to_matrix(tweets, mode='binary') 
print('one_hot_results type:', type(one_hot_results))

# Useful if we wish to decode (more explanation below)
word_index = tokenizer.word_index 

# Tokens are the number of unique words across the corpus
print('Found %s unique tokens.' % len(word_index)) 

# Our coded data
print('Dimensions of our coded results:', np.shape(one_hot_results)) 

In [None]:
# Your code here
reverse_index = dict([(value, key) for (key, value) in word_index.items()])

In [None]:
comment_idx_to_preview = 19
print('Original complaint text:')
print(tweets[comment_idx_to_preview])
print('\n\n')

# The reverse_index cell block above must be complete in order for this cell block to successively execute 
decoded_review = ' '.join([reverse_index.get(i) for i in sequences[comment_idx_to_preview]])
print('Decoded review from Tokenizer:')
print(decoded_review)

In [None]:
sentiment = df['Sentiment']

# Initialize
le = preprocessing.LabelEncoder() 
le.fit(sentiment)
print('Original class labels:')
print(list(le.classes_))
print('\n')
product_cat = le.transform(sentiment)  

# If you wish to retrieve the original descriptive labels post production
# list(le.inverse_transform([0, 1, 3, 3, 0, 6, 4])) 

print('New sentiment labels:')
print(product_cat)
print('\n')

# Each row will be all zeros except for the category for that observation 
print('One hot labels; 3 binary columns, one for each of the categories.') 
product_onehot = to_categorical(product_cat)
print(product_onehot)
print('\n')

print('One hot labels shape:')
print(np.shape(product_onehot))

In [None]:
random.seed(42)
test_index = random.sample(range(1, 8936), 1500)

test = one_hot_results[test_index]
train = np.delete(one_hot_results, test_index, 0)

label_test = product_onehot[test_index]
label_train = np.delete(product_onehot, test_index, 0)

print('Test label shape:', np.shape(label_test))
print('Train label shape:', np.shape(label_train))
print('Test shape:', np.shape(test))
print('Train shape:', np.shape(train))

In [None]:
# Initialize a sequential model
model = models.Sequential()

# Two layers with relu activation
model.add(layers.Dense(50, activation='relu', input_shape=(2000,)))
model.add(layers.Dense(25, activation='relu'))

# One layer with softmax activation 
model.add(layers.Dense(3, activation='softmax'))

In [None]:
# Compile the model
model.compile(optimizer='SGD',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
# Train the model 
history = model.fit(train,
                    label_train,
                    epochs=100,
                    batch_size=256)

In [None]:
history_dict = history.history
history_dict.keys()

In [None]:
# Plot the loss vs the number of epoch
history_dict = history.history
loss_values = history_dict['loss']

epochs = range(1, len(loss_values) + 1)

visual = sns.lineplot(x=epochs, 
                      y=loss_values, 
                      label='Training Loss', 
                      color='#00A36C')
visual.set_title('Training Loss')
visual.set_xlabel('Epochs')
visual.set_ylabel('Loss');

In [None]:
# Plot the training accuracy vs the number of epochs
accuracy_values = history_dict['acc']

epochs = range(1, len(loss_values) + 1)

visual = sns.lineplot(x=epochs, 
                      y=loss_values, 
                      label='Training Accuracy', 
                      color='#FF3131')
visual.set_title('Training Accuracy')
visual.set_xlabel('Epochs')
visual.set_ylabel('Loss');

In [None]:
# Output (probability) predictions for the test set 
y_hat_test = model.predict(test)

In [None]:
# Print the loss and accuracy for the training set 
results_train = model.evaluate(train, label_train)
results_train

In [None]:
# Print the loss and accuracy for the test set 
results_test = model.evaluate(test, label_test)
results_test

# Models

#### Baseline Model (Dummy)

In [None]:
# Create Dummy/Baseliner
estimator = DummyClassifier(strategy='most_frequent')
estimator.fit(X_train, y_train)

# Get predictions with dummy model
y_pred = estimator.predict(X_test)

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['Pos (+)', 'Neutral', 'Neg (-)']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=estimator,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_Dummy');

target_names = ['Pos (+)', 'Neutral', 'Neg (-)']
print(classification_report(y_test, y_pred, target_names=target_names))

Dummy model predicts 'Neutral' for all and has an accuracy of 60%

#### Logistic Regression with SMOTE

In [None]:
# strategy = {0:2000, 1:4200, 2:2657}

# pipe = ImPipeline(steps=[
#     ('sm', SMOTE(random_state=42,
#                  sampling_strategy=strategy)),
#     ('estimator', LogisticRegression(random_state=42))
# ])

# param_grid = {}
# param_grid['estimator__C'] = [100, 10, 1.0, 0.1, 0.01]
# param_grid['estimator__solver'] = ['newton-cg', 'lbfgs', 'liblinear']
# param_grid['estimator__penalty'] = ['l2']

# grid_search = GridSearchCV(estimator=pipe, 
#                            param_grid=param_grid, 
#                            cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
#                            return_train_score=True, 
#                            scoring='accuracy', 
#                            n_jobs=-1,
#                            verbose=2)

# # Fit models run gridsearch
# grid_search.fit(X_train, y_train)

# # Mean training score
# grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# # Mean test score
# grid_test_score = grid_search.score(X_test, y_test)

# best_grid = grid_search.best_estimator_
# best_grid.fit(X_train, y_train)
# y_pred = best_grid.predict(X_test)

# print(f"Mean Training Score: {grid_train_score:.2%}\n")
# print(f"Mean Test Score: {grid_test_score:.2%}\n")

# print(f"Optimal Parameters: {grid_search.best_params_}\n")
# print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")


# # Set figsize and font scale
# sns.set(rc={'figure.figsize':(8, 8)})
# sns.set(font_scale=1)

# # Set diply labesl for confusion matrix
# display_labels = ['Pos (+)', 'Neutral', 'Neg (-)']

# # Plot a confusion matrix on the test data
# plot_confusion_matrix(estimator=best_grid,
#                       X=X_test,
#                       y_true=y_test,
#                       display_labels=display_labels)


# target_names = ['Pos (+)', 'Neutral', 'Neg (-)']
# print(classification_report(y_test, y_pred, target_names=target_names))

# # Save confusion matrix as png and place it in the images folder
# plt.savefig('images/Confusion_Matrix_LogRegSMOTE');

#### Logistic without SMOTE

In [None]:
pipe = Pipeline(steps=[
    ('estimator', LogisticRegression(random_state=42))
])

param_grid = {}
param_grid['estimator__C'] = [100, 10, 1.0, 0.1, 0.01]
param_grid['estimator__solver'] = ['newton-cg', 'lbfgs', 'liblinear']
param_grid['estimator__penalty'] = ['l2']

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['Pos (+)', 'Neutral', 'Neg (-)']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)


target_names = ['Pos (+)', 'Neutral', 'Neg (-)']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_LogReg');

Performs better on unseen data without SMOTE, we will not use SMOTE in our next models

#### Ridge Regression

In [None]:
pipe = Pipeline(steps=[
    ('estimator', RidgeClassifier(random_state=42))
])

param_grid = {}
param_grid['estimator__alpha'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['Pos (+)', 'Neutral', 'Neg (-)']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['Pos (+)', 'Neutral', 'Neg (-)']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_Ridge');

Ridge yields a more overfit model, but it performs slightly better

#### K-Nearest Neighbors (KNN)

In [None]:
pipe = ImPipeline(steps=[
    ('estimator', KNeighborsClassifier())])

param_grid = {}
param_grid['estimator__n_neighbors'] = [1, 5, 9, 13, 17, 21]
param_grid['estimator__metric'] = ['euclidean', 'manhattan', 'minkowski']
param_grid['estimator__weights'] = ['uniform', 'distance']
    
grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['Pos(+)', 'Neutral', 'Neg(-)']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['Pos(+)', 'Neutral', 'Neg(-)']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_KNN');

KNN not good

#### Support Vector Machine

In [None]:
pipe = ImPipeline(steps=[
    ('estimator', SVC())])

param_grid = {}
param_grid['estimator__kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid['estimator__C'] = [10, 1.0, 0.1]

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['Pos(+)', 'Neutral', 'Neg(-)']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['Pos(+)', 'Neutral', 'Neg(-)']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_SVM');

#### Random Forest

In [None]:
pipe = Pipeline(steps=[
    ('estimator', RandomForestClassifier(random_state=42))
])

param_grid = {}
param_grid['estimator__n_estimators'] = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
param_grid['estimator__max_features'] = ['auto', 'sqrt', 'log2']
param_grid['estimator__max_depth'] = [int(x) for x in np.linspace(10, 110, num=11)]
param_grid['estimator__min_samples_split'] = [2, 5, 10]
param_grid['estimator__min_samples_leaf'] = [1, 2, 4]

grid_search = RandomizedSearchCV(estimator=pipe, 
                                 param_distributions=param_grid, 
                                 cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                                 return_train_score=True, 
                                 scoring='accuracy', 
                                 n_iter=100, 
                                 random_state=42, 
                                 n_jobs=-1, 
                                 verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['Pos(+)', 'Neutral', 'Neg(-)']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['Pos(+)', 'Neutral', 'Neg(-)']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_RandomForest');

In [None]:
from sklearn.neural_network import MLPClassifier

pipe = Pipeline(steps=[
    ('estimator', MLPClassifier(random_state=42))
])

param_grid = {}
param_grid['estimator__learning_rate'] = ["constant", "invscaling", "adaptive"]
param_grid['estimator__hidden_layer_sizes'] = [(100,1), (100,2), (100,3)]
param_grid['estimator__alpha'] = [10.0 ** -np.arange(1, 9)]
param_grid['estimator__activation'] = ["logistic", "relu", "Tanh"]

grid_search = RandomizedSearchCV(estimator=pipe, 
                                 param_distributions=param_grid, 
                                 cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                                 return_train_score=True, 
                                 scoring='accuracy', 
                                 n_iter=100, 
                                 random_state=42, 
                                 n_jobs=-1, 
                                 verbose=2)

# Fit models run gridsearch
grid_search.fit(X_train, y_train)

# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

# Set figsize and font scale
sns.set(rc={'figure.figsize':(8, 8)})
sns.set(font_scale=1)

# Set diply labesl for confusion matrix
display_labels = ['Pos(+)', 'Neutral', 'Neg(-)']

# Plot a confusion matrix on the test data
plot_confusion_matrix(estimator=best_grid,
                      X=X_test,
                      y_true=y_test,
                      display_labels=display_labels)

target_names = ['Pos(+)', 'Neutral', 'Neg(-)']
print(classification_report(y_test, y_pred, target_names=target_names))

# Save confusion matrix as png and place it in the images folder
plt.savefig('images/Confusion_Matrix_RandomForest');