In [1]:
import pandas as pd
import nltk
import regex as re
import string

In [2]:
# Raw tweet dataset; the path is relative to the notebook's working directory.
TWEETS_CSV = './data/tweets.csv'
df = pd.read_csv(TWEETS_CSV)

# EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [4]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [5]:
df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

This shows a strong class imbalance: positive tweets outnumber negative ones roughly five to one, and the majority of tweets express no emotion toward a brand or product.<br />
We merge the "I can't tell" label into the 'No emotion' class.

In [6]:
# Fold the ambiguous "I can't tell" label into the neutral class; all other labels pass through unchanged.
emotion_col = 'is_there_an_emotion_directed_at_a_brand_or_product'
df[emotion_col] = df[emotion_col].replace({"I can't tell": 'No emotion toward brand or product'})

In [7]:
df.emotion_in_tweet_is_directed_at.value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [8]:
df.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

Getting rid of the one instance of empty tweet:

In [9]:
# Drop rows whose tweet text is missing; rebinding keeps the cell free of in-place mutation.
df = df.dropna(subset=['tweet_text'])

Getting rid of duplicates:

In [10]:
df.duplicated().sum()

22

In [11]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9070 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9070 non-null   object
 1   emotion_in_tweet_is_directed_at                     3282 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9070 non-null   object
dtypes: object(3)
memory usage: 283.4+ KB


Renaming columns for easier understanding:

In [13]:
# Shorten the verbose survey-style column names.
df = df.rename(columns={
    'tweet_text': 'text',
    'emotion_in_tweet_is_directed_at': 'product',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'target',
})

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9070 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     9070 non-null   object
 1   product  3282 non-null   object
 2   target   9070 non-null   object
dtypes: object(3)
memory usage: 283.4+ KB


Creating a length feature:

In [15]:
# Tweet length measured in whitespace-separated words (computed on the raw, uncleaned text).
df['length'] = df['text'].str.split().str.len()

Encoding the target labels as integers:

In [16]:
# Integer-encode the sentiment labels: 0 = no emotion, 1 = negative, 2 = positive.
# Any label missing from the mapping becomes NaN.
target_codes = {
    'No emotion toward brand or product': 0,
    'Negative emotion': 1,
    'Positive emotion': 2,
}
df['target'] = df['target'].map(target_codes)

In [17]:
df.head()

Unnamed: 0,text,product,target,length
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,1,23
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,2,22
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,2,15
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,1,15
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,2,17


## Cleaning the text column

In [18]:
# Work on a copy: the text-cleaning cells below rewrite df_changed['text'],
# while `df` keeps the original tweet text.
df_changed = df.copy()

#### Lower casing:

In [19]:
df_changed['text'][0]

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [20]:
# Normalise case with the vectorised string accessor.
df_changed['text'] = df_changed['text'].str.lower()

In [21]:
df_changed['text'][0]

'.@wesley83 i have a 3g iphone. after 3 hrs tweeting at #rise_austin, it was dead!  i need to upgrade. plugin stations at #sxsw.'

#### Removing non-ascii:

In [22]:
def remove_non_ascii(text):
    """Return ``text`` with every non-ASCII character dropped.

    Uses the same encode/decode round-trip as the column-wide
    ``.str.encode('ascii', 'ignore').str.decode('ascii')`` step in this
    notebook, so the helper and that step produce the same result per string.

    Parameters
    ----------
    text : str or bytes
        Text to clean. ``bytes`` input is decoded as UTF-8 first; undecodable
        bytes are skipped.

    Returns
    -------
    str
        The input restricted to ASCII characters. Characters are removed, not
        transliterated (``'café'`` becomes ``'caf'``).
    """
    if isinstance(text, (bytes, bytearray)):
        # Decode raw bytes up front so the ASCII filter below always sees a str.
        text = bytes(text).decode("utf-8", errors="ignore")
    # errors="ignore" silently drops anything outside the ASCII range.
    return text.encode("ascii", "ignore").decode("ascii")

In [23]:
df_changed['text'][9092]

'�ϡ�����_��ʋ�\u038b�ҋ�������⋁_��������_���rt @mention google tests ���check-in offers�\u06dd at #sxsw {link}'

In [24]:
# Encoding to ASCII with errors='ignore' drops every non-ASCII character;
# decoding turns the resulting bytes back into str.
df_changed['text'] = df_changed['text'].str.encode('ascii', 'ignore').str.decode('ascii')

In [25]:
df_changed['text'][9092]

'___rt @mention google tests check-in offers at #sxsw {link}'

#### Removing urls:

In [26]:
df_changed['text'][5]

'@teachntech00 new ipad apps for #speechtherapy and communication are showcased at the #sxsw conference http://ht.ly/49n4m #iear #edchat #asd'

In [27]:
# Strip anything that starts with "http" up to the next whitespace character.
df_changed['text'] = df_changed['text'].str.replace(r'http\S+', '', regex=True)

In [28]:
df_changed['text'][5]

'@teachntech00 new ipad apps for #speechtherapy and communication are showcased at the #sxsw conference  #iear #edchat #asd'

#### Removing user tagging:

In [29]:
df_changed['text'][5]

'@teachntech00 new ipad apps for #speechtherapy and communication are showcased at the #sxsw conference  #iear #edchat #asd'

In [30]:
# Remove each "@" together with the non-whitespace characters that follow it (user mentions).
df_changed['text'] = df_changed['text'].str.replace(r'@\S*', '', regex=True)

In [31]:
df_changed['text'][4]

" great stuff on fri #sxsw: marissa mayer (google), tim o'reilly (tech books/conferences) &amp; matt mullenweg (wordpress)"

#### Tokenizing:

In [32]:
def tokenize(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    list of str
        The pieces between separators. When ``text`` starts or ends with a
        separator the list contains an empty string at that end. Underscores
        count as word characters, so ``'___rt'`` stays one token.
    """
    # Raw string: "\W" in a plain literal is an invalid escape sequence, which
    # newer Python versions warn about and will eventually reject.
    return re.split(r"\W+", text)

In [33]:
# Each tweet string becomes a list of tokens.
df_changed['text'] = df_changed['text'].apply(tokenize)

In [34]:
df_changed.head()

Unnamed: 0,text,product,target,length
0,"[, i, have, a, 3g, iphone, after, 3, hrs, twee...",iPhone,1,23
1,"[, know, about, awesome, ipad, iphone, app, th...",iPad or iPhone App,2,22
2,"[, can, not, wait, for, ipad, 2, also, they, s...",iPad,2,15
3,"[, i, hope, this, year, s, festival, isn, t, a...",iPad or iPhone App,1,15
4,"[, great, stuff, on, fri, sxsw, marissa, mayer...",Google,2,17


#### Removing punctuation and empty tokens:

In [35]:
df_changed['text'][9092]

['___rt', 'google', 'tests', 'check', 'in', 'offers', 'at', 'sxsw', 'link', '']

In [36]:
def remove_punctuation(tokens):
    """Strip punctuation characters from every token and drop empty results.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        Tokens with all ``string.punctuation`` characters removed, in their
        original order. Tokens that end up empty are omitted.
    """
    # Translation table that deletes every character in string.punctuation.
    strip_table = str.maketrans('', '', string.punctuation)
    cleaned = []
    for token in tokens:
        stripped = token.translate(strip_table)
        if stripped:
            cleaned.append(stripped)
    return cleaned

In [37]:
# Clean every token list: strip punctuation characters and drop empty tokens.
df_changed['text'] = df_changed['text'].apply(remove_punctuation)

In [38]:
df_changed['text'][9092]

['rt', 'google', 'tests', 'check', 'in', 'offers', 'at', 'sxsw', 'link']

#### Removing stopwords:

In [39]:
# NLTK's English stopword list. The corpus is downloaded separately from the
# nltk package, so on a fresh environment the first lookup raises LookupError;
# fetch it once and retry so the notebook survives Restart & Run All.
try:
    stopword = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopword = nltk.corpus.stopwords.words('english')

In [40]:
# Take 'not' out of the stopword list so it is kept as a token in the tweets.
# The membership check makes the cell safe to re-run: a bare list.remove()
# raises ValueError once 'not' is already gone.
if 'not' in stopword:
    stopword.remove('not')

In [41]:
# Treat 'rt' as a stopword so it is filtered out of the tweets.
# The membership check stops re-runs from appending duplicate entries.
if 'rt' not in stopword:
    stopword.append('rt')

In [42]:
def remove_stopwords(text, stopwords=None):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (a token list, despite the parameter name).
    stopwords : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stopword`` list, so
        existing one-argument calls behave as before.

    Returns
    -------
    list of str
        The tokens that are not in ``stopwords``, in their original order.
    """
    if stopwords is None:
        # Fall back to the global list prepared in the cells above.
        stopwords = stopword
    # Set membership is constant-time; scanning the list for every token is not.
    if isinstance(stopwords, (set, frozenset)):
        blocked = stopwords
    else:
        blocked = set(stopwords)
    return [word for word in text if word not in blocked]

In [43]:
df_changed['text'][9092]

['rt', 'google', 'tests', 'check', 'in', 'offers', 'at', 'sxsw', 'link']

In [44]:
# Filter the stopwords out of every token list.
df_changed['text'] = df_changed['text'].apply(remove_stopwords)

In [45]:
df_changed['text'][9092]

['google', 'tests', 'check', 'offers', 'sxsw', 'link']

In [46]:
df_changed

Unnamed: 0,text,product,target,length
0,"[3g, iphone, 3, hrs, tweeting, riseaustin, dea...",iPhone,1,23
1,"[know, awesome, ipad, iphone, app, likely, app...",iPad or iPhone App,2,22
2,"[not, wait, ipad, 2, also, sale, sxsw]",iPad,2,15
3,"[hope, year, festival, crashy, year, iphone, a...",iPad or iPhone App,1,15
4,"[great, stuff, fri, sxsw, marissa, mayer, goog...",Google,2,17
...,...,...,...,...
9088,"[ipad, everywhere, sxsw, link]",iPad,2,4
9089,"[wave, buzz, interrupt, regularly, scheduled, ...",,0,18
9090,"[google, zeiger, physician, never, reported, p...",,0,19
9091,"[verizon, iphone, customers, complained, time,...",,0,23
