In [81]:
import pandas as pd

In [178]:
df = pd.read_csv('./data/tweets.csv')

# EDA

In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [138]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [179]:
df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

This shows a big imbalance in the number of positive and negative sentiments, and also shows the majority of it having no opinion.<br />
We're joining the 'I cant't tell' column with the 'No emotion' one.

In [182]:
df['is_there_an_emotion_directed_at_a_brand_or_product'] = df['is_there_an_emotion_directed_at_a_brand_or_product'].apply(lambda x: 'No emotion toward brand or product' if  x == "I can't tell" else x)

In [183]:
df.emotion_in_tweet_is_directed_at.value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [184]:
df.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

Getting rid of the one instance of empty tweet:

In [185]:
df.dropna(subset=['tweet_text'], inplace=True)

Getting rid of duplicates:

In [186]:
df.duplicated().sum()

22

In [187]:
df.drop_duplicates(inplace=True)

In [188]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9070 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9070 non-null   object
 1   emotion_in_tweet_is_directed_at                     3282 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9070 non-null   object
dtypes: object(3)
memory usage: 283.4+ KB


Renaming columns for easier understanding:

In [189]:
df.rename(columns={'tweet_text': 'text', 'emotion_in_tweet_is_directed_at': 'product', 'is_there_an_emotion_directed_at_a_brand_or_product': 'target'}, inplace=True)

In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9070 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     9070 non-null   object
 1   product  3282 non-null   object
 2   target   9070 non-null   object
dtypes: object(3)
memory usage: 283.4+ KB


Creating a length feature:

In [191]:
df['length'] = df['text'].apply(lambda x: len(x.split()))

In [198]:
df['target'] = df['target'].map({'No emotion toward brand or product': 0,
                                     'Negative emotion': 1,
                                     'Positive emotion': 2})

In [200]:
df.head()

Unnamed: 0,text,product,target,length
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,1,23
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,2,22
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,2,15
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,1,15
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,2,17


In [11]:
import string

In [89]:
def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

In [90]:
df['tweet_text_wo_punct']=df['tweet_text'].apply(lambda x: remove_punctuation(x))

In [93]:
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,tweet_text_wo_punct
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,wesley83 I have a 3G iPhone After 3 hrs tweeti...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,jessedee Know about fludapp Awesome iPadiPhon...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,swonderlin Can not wait for iPad 2 also They s...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,sxsw I hope this years festival isnt as crashy...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,sxtxstate great stuff on Fri SXSW Marissa Maye...
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,Ipad everywhere SXSW link
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,Wave buzz RT mention We interrupt your regular...
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,Googles Zeiger a physician never reported pote...
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,Some Verizon iPhone customers complained their...


In [17]:
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [96]:
df['tweet_text_wo_punct'].values

array(['wesley83 I have a 3G iPhone After 3 hrs tweeting at RISEAustin it was dead  I need to upgrade Plugin stations at SXSW',
       'jessedee Know about fludapp  Awesome iPadiPhone app that youll likely appreciate for its design Also theyre giving free Ts at SXSW',
       'swonderlin Can not wait for iPad 2 also They should sale them down at SXSW',
       ...,
       'Googles Zeiger a physician never reported potential AE Yet FDA relies on physicians quotWere operating wout dataquot sxsw health2dev',
       'Some Verizon iPhone customers complained their time fell back an hour this weekend  Of course they were the New Yorkers who attended SXSW',
       '�ϡ�������ʋ�\u038b�ҋ�������⋁�����������RT mention Google Tests ���Checkin Offers�\u06dd At SXSW link'],
      dtype=object)

In [97]:
df = df[~df['tweet_text_wo_punct'].isin([string.ascii_lowercase])]

In [98]:
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,tweet_text_wo_punct
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,wesley83 I have a 3G iPhone After 3 hrs tweeti...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,jessedee Know about fludapp Awesome iPadiPhon...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,swonderlin Can not wait for iPad 2 also They s...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,sxsw I hope this years festival isnt as crashy...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,sxtxstate great stuff on Fri SXSW Marissa Maye...
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,Ipad everywhere SXSW link
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,Wave buzz RT mention We interrupt your regular...
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,Googles Zeiger a physician never reported pote...
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,Some Verizon iPhone customers complained their...


In [99]:
df[df['tweet_text_wo_punct'].str.contains(r'[^\x00-\x7F]')]

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,tweet_text_wo_punct
38,@mention - False Alarm: Google Circles Not Co...,Google,Negative emotion,mention False Alarm Google Circles Not Comin...
41,HootSuite - HootSuite Mobile for #SXSW ~ Updat...,,No emotion toward brand or product,HootSuite HootSuite Mobile for SXSW Updates ...
42,Hey #SXSW - How long do you think it takes us ...,,No emotion toward brand or product,Hey SXSW How long do you think it takes us to...
45,#IPad2 's ���#SmartCover�۪ Opens to Instant Ac...,iPad or iPhone App,Positive emotion,IPad2 s ���SmartCover�۪ Opens to Instant Acces...
46,Hand-Held ���Hobo�۪: Drafthouse launches ���Ho...,,Positive emotion,HandHeld ���Hobo�۪ Drafthouse launches ���Hobo...
...,...,...,...,...
8925,umm that would be @mention ���@mention I keep ...,Other Apple product or service,Positive emotion,umm that would be mention ���mention I keep wi...
8945,FestivalExplorer iPhone App Finally Solves SXS...,iPad or iPhone App,Positive emotion,FestivalExplorer iPhone App Finally Solves SXS...
8963,Group #Texting War Heats Up: Fast Society Laun...,Android App,Positive emotion,Group Texting War Heats Up Fast Society Launch...
8982,In case my fairy god mother = reading mail; my...,,No emotion toward brand or product,In case my fairy god mother reading mail my �...


In [100]:
def remove_non_ascii(text):
    return unidecode(unicode(text, encoding = "utf-8"))

In [101]:
df['no_ascii'] = df['tweet_text_wo_punct'].str.encode('ascii', 'ignore').str.decode('ascii')

In [102]:
df.drop(['tweet_text', 'tweet_text_wo_punct'], axis=1, inplace=True)

In [103]:
df.head()

Unnamed: 0,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,no_ascii
0,iPhone,Negative emotion,wesley83 I have a 3G iPhone After 3 hrs tweeti...
1,iPad or iPhone App,Positive emotion,jessedee Know about fludapp Awesome iPadiPhon...
2,iPad,Positive emotion,swonderlin Can not wait for iPad 2 also They s...
3,iPad or iPhone App,Negative emotion,sxsw I hope this years festival isnt as crashy...
4,Google,Positive emotion,sxtxstate great stuff on Fri SXSW Marissa Maye...


In [52]:
import regex as re

In [104]:
def tokenize(text):
    split=re.split("\W+",text) 
    return split

In [105]:
df['no_ascii_split']=df['no_ascii'].apply(lambda x: tokenize(x.lower()))

In [106]:
df.head()

Unnamed: 0,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,no_ascii,no_ascii_split
0,iPhone,Negative emotion,wesley83 I have a 3G iPhone After 3 hrs tweeti...,"[wesley83, i, have, a, 3g, iphone, after, 3, h..."
1,iPad or iPhone App,Positive emotion,jessedee Know about fludapp Awesome iPadiPhon...,"[jessedee, know, about, fludapp, awesome, ipad..."
2,iPad,Positive emotion,swonderlin Can not wait for iPad 2 also They s...,"[swonderlin, can, not, wait, for, ipad, 2, als..."
3,iPad or iPhone App,Negative emotion,sxsw I hope this years festival isnt as crashy...,"[sxsw, i, hope, this, years, festival, isnt, a..."
4,Google,Positive emotion,sxtxstate great stuff on Fri SXSW Marissa Maye...,"[sxtxstate, great, stuff, on, fri, sxsw, maris..."


In [107]:
df.drop(['no_ascii'], axis=1, inplace=True)

In [108]:
df

Unnamed: 0,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,no_ascii_split
0,iPhone,Negative emotion,"[wesley83, i, have, a, 3g, iphone, after, 3, h..."
1,iPad or iPhone App,Positive emotion,"[jessedee, know, about, fludapp, awesome, ipad..."
2,iPad,Positive emotion,"[swonderlin, can, not, wait, for, ipad, 2, als..."
3,iPad or iPhone App,Negative emotion,"[sxsw, i, hope, this, years, festival, isnt, a..."
4,Google,Positive emotion,"[sxtxstate, great, stuff, on, fri, sxsw, maris..."
...,...,...,...
9088,iPad,Positive emotion,"[ipad, everywhere, sxsw, link]"
9089,,No emotion toward brand or product,"[wave, buzz, rt, mention, we, interrupt, your,..."
9090,,No emotion toward brand or product,"[googles, zeiger, a, physician, never, reporte..."
9091,,No emotion toward brand or product,"[some, verizon, iphone, customers, complained,..."


In [60]:
import nltk

In [109]:
stopword = nltk.corpus.stopwords.words('english')

In [110]:
stopword.append('rt')

In [111]:
def remove_stopwords(text):
    text=[word for word in text if word not in stopword]
    return text

In [112]:
df['no_stopwords'] = df['no_ascii_split'].apply(lambda x: remove_stopwords(x))

In [113]:
df.head()

Unnamed: 0,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,no_ascii_split,no_stopwords
0,iPhone,Negative emotion,"[wesley83, i, have, a, 3g, iphone, after, 3, h...","[wesley83, 3g, iphone, 3, hrs, tweeting, risea..."
1,iPad or iPhone App,Positive emotion,"[jessedee, know, about, fludapp, awesome, ipad...","[jessedee, know, fludapp, awesome, ipadiphone,..."
2,iPad,Positive emotion,"[swonderlin, can, not, wait, for, ipad, 2, als...","[swonderlin, wait, ipad, 2, also, sale, sxsw]"
3,iPad or iPhone App,Negative emotion,"[sxsw, i, hope, this, years, festival, isnt, a...","[sxsw, hope, years, festival, isnt, crashy, ye..."
4,Google,Positive emotion,"[sxtxstate, great, stuff, on, fri, sxsw, maris...","[sxtxstate, great, stuff, fri, sxsw, marissa, ..."


In [114]:
df.drop(['no_ascii_split'], axis=1, inplace=True)

In [115]:
df.head()

Unnamed: 0,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,no_stopwords
0,iPhone,Negative emotion,"[wesley83, 3g, iphone, 3, hrs, tweeting, risea..."
1,iPad or iPhone App,Positive emotion,"[jessedee, know, fludapp, awesome, ipadiphone,..."
2,iPad,Positive emotion,"[swonderlin, wait, ipad, 2, also, sale, sxsw]"
3,iPad or iPhone App,Negative emotion,"[sxsw, hope, years, festival, isnt, crashy, ye..."
4,Google,Positive emotion,"[sxtxstate, great, stuff, fri, sxsw, marissa, ..."


In [74]:
emoticon_query = r'(?:[\:;X=B][-^]?[)\]3D([OP/\\|])(?:(?=\s))'

In [77]:
df["no_stopwords"].str.contains(emoticon_query).unique()

array([nan])