##  Bag of Tweets

####  Eryk Wdowiak and Eric Adsetts

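Module 4 project -- sentiment in Tweets: classifying tweets about Apple and Google products as negative, neutral or positive, using TF-IDF features with Naive Bayes and Random Forest classifiers.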

In [1]:
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline

# import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# from  nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
##  load data: read the labelled tweets and give the three columns short names
data = pd.read_csv('dataset/judge-1377884607_tweet_product_company_v2-clean.csv')
data.columns = ['tweet', 'direction', 'emotion']

##  keep only the rows that have both a tweet and an emotion label
data = data.dropna(axis='index', subset=['tweet', 'emotion'])

##  clean emotions: collapse the four raw labels into three classes
##  ("I can't tell" is folded into neutral)
data['emotion'] = data['emotion'].replace({
    'Negative emotion': 'negative',
    'Positive emotion': 'positive',
    'No emotion toward brand or product': 'neutral',
    "I can't tell": 'neutral',
})

##  render every direction as text, so that a missing value (NaN)
##  becomes the string 'nan' and can be looked up like any other label
data['direction'] = data['direction'].map('{}'.format)

##  one lookup table:  direction -> (company, product)
direction_lookup = {
    'iPhone': ('Apple', 'device'),
    'iPad or iPhone App': ('Apple', 'software'),
    'iPad': ('Apple', 'device'),
    'Google': ('Google', 'company'),
    'nan': ('unknown', 'unknown'),
    'Android': ('Google', 'device'),
    'Apple': ('Apple', 'company'),
    'Android App': ('Google', 'software'),
    'Other Google product or service': ('Google', 'other'),
    'Other Apple product or service': ('Apple', 'other'),
}

##  define company and product from the direction label
##  (replace leaves any label that is not in the table unchanged)
data['company'] = data['direction'].replace(
    {direction: pair[0] for direction, pair in direction_lookup.items()})
data['product'] = data['direction'].replace(
    {direction: pair[1] for direction, pair in direction_lookup.items()})
del direction_lookup

##  let's take a look
data.head(10)

Unnamed: 0,tweet,direction,emotion,company,product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative,Apple,device
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,positive,Apple,software
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive,Apple,device
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,negative,Apple,software
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive,Google,company
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,neutral,unknown,unknown
7,"#SXSW is just starting, #CTIA is around the co...",Android,positive,Google,device
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,positive,Apple,software
9,Counting down the days to #sxsw plus strong Ca...,Apple,positive,Apple,company
10,Excited to meet the @samsungmobileus at #sxsw ...,Android,positive,Google,device


In [3]:
##  prepare stop word list
##  (NLTK's English stop words; process_tweets drops every token found in this list)
stopwords_list = stopwords.words('english')
##  optional extensions, currently disabled: punctuation marks and single digits
# stopwords_list += list(string.punctuation)
# stopwords_list += ['0','1','2','3','4','5','6','7','8','9']

##  define the tweet cleaner
def process_tweets(tweet):
    """Turn one raw tweet into a space-separated string of lemmas.

    The tweet is split with ``word_tokenize``; each token is lower-cased,
    skipped if it appears in ``stopwords_list``, and otherwise passed
    through ``WordNetLemmatizer.lemmatize``.

    Parameters
    ----------
    tweet : str
        Text of a single tweet.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string when
        nothing survives).
    """
    raw_tokens = word_tokenize(tweet)
    lemmatizer = WordNetLemmatizer()

    kept_lemmas = []
    for raw_token in raw_tokens:
        lowered = raw_token.lower()
        if lowered in stopwords_list:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(lowered))

    return ' '.join(kept_lemmas)

##  run every tweet through process_tweets and overwrite the column,
##  row by row in the existing order
data['tweet'] = [process_tweets(raw_tweet) for raw_tweet in data['tweet']]

In [4]:
##  train test split
##  hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['tweet', 'company', 'product']],
    data.loc[:, 'emotion'],
    random_state=42,
    test_size=0.20,
)

In [5]:
##  create TF-IDF matrix
##  the vectorizer is fitted on the training tweets only; the test tweets
##  are then transformed with that already-fitted vectorizer
vectorizer = TfidfVectorizer()
tf_idf_data_train, tf_idf_data_test = (
    vectorizer.fit_transform(X_train['tweet']),
    vectorizer.transform(X_test['tweet']),
)

In [6]:
##  naive bayes classifier
##  fit on the training TF-IDF matrix
nb_classifier = MultinomialNB()
nb_classifier.fit(tf_idf_data_train, y_train)

##  accuracy on the data the model was fitted on
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_train_score = accuracy_score(y_train, nb_train_preds)

##  accuracy on the held-out data
nb_test_preds = nb_classifier.predict(tf_idf_data_test)
nb_test_score = accuracy_score(y_test, nb_test_preds)

##  report
print("Multinomial Naive Bayes",
      "Training Accuracy: {:.4}".format(nb_train_score),
      "Testing Accuracy:  {:.4}".format(nb_test_score),
      sep='\n')

Multinomial Naive Bayes
Training Accuracy: 0.7548
Testing Accuracy:  0.6443


In [7]:
##  random forests classifier
##  random_state is pinned (same seed as the train/test split) so that the
##  randomness used while building the forest -- and therefore the accuracies
##  printed below -- is the same on every Restart & Run All
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

##  fit on the training TF-IDF matrix
rf_classifier.fit(tf_idf_data_train, y_train)

##  predictions for the data the model was fitted on and for the held-out data
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

##  share of correctly predicted labels in each set
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

##  report
print('Random Forests')
print("Training Accuracy: {:.4}".format(rf_train_score))
print("Testing Accuracy:  {:.4}".format(rf_test_score))

Random Forests
Training Accuracy: 0.9948
Testing Accuracy:  0.6839
