Steps to take

preprocess data
- stemming & lemmatization
- Tokenisation
- REGEX
- Stopwords removal
- Feature Engineering


In [18]:
import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns  
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [19]:
df = pd.read_csv('tweets_dataset.csv')
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [21]:
list = ['emotion_in_tweet_is_directed_at','is_there_an_emotion_directed_at_a_brand_or_product']

for item in list:
    print(f'The column name is {item}')
    print(df[item].unique())
    print(df[item].value_counts())

The column name is emotion_in_tweet_is_directed_at
['iPhone' 'iPad or iPhone App' 'iPad' 'Google' nan 'Android' 'Apple'
 'Android App' 'Other Google product or service'
 'Other Apple product or service']
iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64
The column name is is_there_an_emotion_directed_at_a_brand_or_product
['Negative emotion' 'Positive emotion'
 'No emotion toward brand or product' "I can't tell"]
No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, 

In [22]:
## Removing the records which have an unknown sentiment option - Keeping the options to just Positive, Negative & Neutral
df = df[df['is_there_an_emotion_directed_at_a_brand_or_product'] != 'I can\'t tell']

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8937 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          8936 non-null   object
 1   emotion_in_tweet_is_directed_at                     3282 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  8937 non-null   object
dtypes: object(3)
memory usage: 279.3+ KB


In [23]:
tweet_df = df[['tweet_text', 'is_there_an_emotion_directed_at_a_brand_or_product']].copy()

tweet_df.columns = ['tweet_text', 'sentiment_classification']

tweet_df.head()

Unnamed: 0,tweet_text,sentiment_classification
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [24]:
tweet_df['category_id'] = tweet_df['sentiment_classification'].factorize()[0]
category_id_df = tweet_df[['sentiment_classification', 'category_id']].drop_duplicates()


In [25]:
# Dictionaries for future use
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'sentiment_classification']].values)

tweet_df.head()


Unnamed: 0,tweet_text,sentiment_classification,category_id
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,1


In [26]:
id_to_category

{0: 'Negative emotion',
 1: 'Positive emotion',
 2: 'No emotion toward brand or product'}

In [27]:
tweet_df.dropna()

Unnamed: 0,tweet_text,sentiment_classification,category_id
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,1
...,...,...,...
9088,Ipad everywhere. #SXSW {link},Positive emotion,1
9089,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product,2
9090,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product,2
9091,Some Verizon iPhone customers complained their...,No emotion toward brand or product,2


In [28]:
tweet_df.tweet_text

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9088                        Ipad everywhere. #SXSW {link}
9089    Wave, buzz... RT @mention We interrupt your re...
9090    Google's Zeiger, a physician never reported po...
9091    Some Verizon iPhone customers complained their...
9092    �ϡ�����_��ʋ�΋�ҋ�������⋁_��������_���RT @mentio...
Name: tweet_text, Length: 8937, dtype: object

## EDA - Understanding which terms has highest correlation with each category

In [29]:
tf_idf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')

# We transform each tweet into a vector
features = tf_idf.fit_transform(tweet_df.tweet_text.values.astype('U')).toarray()


labels = tweet_df.category_id

print("Each of the %d tweets is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features.shape))


Each of the 8937 tweets is represented by 4603 features (TF-IDF score of unigrams and bigrams)


In [30]:
# Finding the five most correlated terms with each of the product categories
N = 5
for sentiment_classification, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(features, labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tf_idf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("\n==> %s:" %(sentiment_classification))
  print("  * Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
  print("  * Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))



==> Negative emotion:
  * Most Correlated Unigrams are: classiest, fail, headaches, hate, fascist
  * Most Correlated Bigrams are: design headaches, news apps, fascist company, ipad news, company america

==> No emotion toward brand or product:
  * Most Correlated Unigrams are: begins, app, awesome, wins, cool
  * Most Correlated Bigrams are: comes cool, quot apple, begins apple, wins sxsw, apple wins

==> Positive emotion:
  * Most Correlated Unigrams are: comes, begins, awesome, wins, cool
  * Most Correlated Bigrams are: comes cool, apple comes, begins apple, wins sxsw, apple wins


## Baseline Model

In [31]:
X = tweet_df['tweet_text']
y = tweet_df['category_id']

In [62]:
X.shape

(8937,)

In [63]:
y.shape

(8937,)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state=42, stratify=y)

In [66]:
X_train.shape

(7149,)

In [67]:
y_train.shape

(7149,)

### Tf-Idf and count Vectorisation

In [68]:
# creating a count 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(tweet_df['tweet_text'].values.astype('U'))

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(X_train.values.astype('U'))
xtest_count =  count_vect.transform(X_test.values.astype('U'))


In [69]:
# word-level tf-idf
tfidf = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=4000)
tfidf.fit(tweet_df['tweet_text'].values.astype('U'))
xtrain_tfidf =  tfidf.transform(X_train.values.astype('U'))
xtest_tfidf =  tfidf.transform(X_test.values.astype('U'))

# ngram (uni, bi)-level tf-idf
tfidf_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range = (1,2) , max_features=4000)
tfidf_ngram.fit(tweet_df['tweet_text'].values.astype('U'))
xtrain_tfidf_ngram =  tfidf_ngram.transform(X_train.values.astype('U'))
xtest_tfidf_ngram =  tfidf_ngram.transform(X_test.values.astype('U'))

In [70]:
def train_model(classifier, train_df, label, test_df):
    
    # fit the training dataset on the classifier
    classifier.fit(train_df, label)
    
    # predict the labels on test dataset
    predictions = classifier.predict(test_df)
    
    return metrics.accuracy_score(predictions, y_test)

In [71]:
# Naive Bayes on Count Vectors
accuracy = train_model(MultinomialNB(), xtrain_count, y_train, xtest_count)
print("NB, Count Vectors: ", accuracy)

NB, Count Vectors:  0.7013422818791947


In [72]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(MultinomialNB(), xtrain_tfidf, y_train, xtest_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.6817673378076062


In [73]:
# Naive Bayes on n-gram Level TF IDF Vectors
accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
print("NB, n-gram TF-IDF: ", accuracy)

NB, n-gram TF-IDF:  0.6940715883668904


## Text Pre-Processing

In order to improve performance we want to complete the following steps to help reduce/remove the noise from the messy text data. These steps are:
- **Step 1:** Tokenize all tweets
- **Step 2:** Lower case  all tokens
- **Step 3:** Remove all punctuation
- **Step 4:** Remove @mentions
- **Step 5:** html.unescape(text) to remove HTML parsing
- **Step 6:** Remove urls
- **Step 7:** Remove all non asci characters
- **Step 8:** Split attached words
- **Step 9:** Remove common words related to the event itself such as **sxsw**
- **Step 10:** Standardise words (if they use too many letters)
- **Step 11:** Stem/lemmatise words

This will then allow us to fit out models with a cleaner, hopefully more robust dataset.
