In [1]:
from seaborn import heatmap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import spacy
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
import re

In [37]:
nlp = spacy.load('en_core_web_sm')

In [3]:
# my ascii cleaner function doesn't work unless we write the file to csv first
# for some reason.
df = pd.read_csv("https://query.data.world/s/nrbtbcd7jydlrybusl7mom5rvov4zd",
                 encoding = "ISO-8859-1")

df.to_csv("data/twitter_dataset.csv")

function that removes non ascii characters based on  https://stackoverflow.com/questions/26541968/delete-every-non-utf-8-symbols-from-string

In [4]:
def file_to_ascii(filename, writename = None):
    """rewrites a file with only ascii. returns None unless there's an error"""
    if writename is None:
        writename = filename[:-4] + "_only_ascii.csv"
    cleaned_doc = []
    with open(filename, "r") as readfile:
        for line in readfile:
            line = str(line.strip())
            cleaned_doc.append(line.encode("ascii", "ignore"))

    try:
        with open(writename, "xb") as writefile:
            for i in cleaned_doc:
                writefile.write(i + b"\n")
    except BaseException as e:
        print(e)
        return e
    
    return None

In [5]:
file_to_ascii("data/twitter_dataset.csv")

In [10]:
df = pd.read_csv("data/twitter_dataset_only_ascii.csv").drop(["Unnamed: 0"], axis = 1)

In [11]:
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [12]:
for i in df.loc[9092]: print(i)

___RT @mention Google Tests Check-in Offers At #SXSW {link}
nan
No emotion toward brand or product


In [13]:
df.columns = ['text', 'directed_at', 'emotion']

In [14]:
df.head(100)

Unnamed: 0,text,directed_at,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [15]:
df.shape

(9093, 3)

1. Text, clean to have hastags and @symbols in a separate column
2. Have nlp ready column
3. directed@-> make "brand"


In [16]:
df.text.isnull().any()

True

In [17]:
df.text.dropna(inplace=True)

In [18]:
df.reset_index(inplace=True)

In [19]:
df.emotion.unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [20]:
df.emotion.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

In [21]:
df.directed_at.unique()

array(['iPhone', 'iPad or iPhone App', 'iPad', 'Google', nan, 'Android',
       'Apple', 'Android App', 'Other Google product or service',
       'Other Apple product or service'], dtype=object)

In [22]:
conditions = [(df.emotion=='No emotion toward brand or product'),
  (df.emotion=='Positive emotion'),
  (df.emotion=='Negative emotion'),
  (df.emotion=="I can't tell")]
cases=[2, 0, 1, 3]
df['target']= np.select(conditions, cases) 

In [23]:
df.target.value_counts()

2    5389
0    2978
1     570
3     156
Name: target, dtype: int64

In [24]:
apple= ['iPhone', 'iPad or iPhone App', 'iPad', 'Apple', 'Other Apple product or service']
google = ['Google', 'Android', 'Android App', 'Other Google product or service']
conditions = [
    (df.directed_at == 'iPhone')|(df.directed_at == 'iPad or iPhone App')|
    (df.directed_at == 'iPad')|(df.directed_at == 'Apple')|(df.directed_at == 'Other Apple product or service'),
    (df.directed_at == 'Google')|(df.directed_at == 'Android')|
    (df.directed_at == 'Android App')|(df.directed_at == 'Other Google product or service')]
cases=['Apple', 'Google']
df['brand']=np.select(conditions, cases, None)

In [25]:
df.head()

Unnamed: 0,index,text,directed_at,emotion,target,brand
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,1,Apple
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,0,Apple
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,0,Apple
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,1,Apple
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,0,Google


In [26]:
df.groupby(df.brand).target.value_counts(normalize=True)

brand   target
Apple   0         0.809049
        1         0.161063
        2         0.026982
        3         0.002906
Google  0         0.819728
        1         0.148526
        2         0.029478
        3         0.002268
Name: target, dtype: float64

In [27]:
df.target.value_counts()

2    5389
0    2978
1     570
3     156
Name: target, dtype: int64

In [28]:
df.drop(index=df[df.target>1].index, inplace=True)
df.target.value_counts()

0    2978
1     570
Name: target, dtype: int64

In [29]:
target = df["target"]
df = df.drop(["target"],axis=1)
target

0       1
1       0
2       0
3       1
4       0
       ..
9077    0
9079    0
9080    1
9085    0
9088    0
Name: target, Length: 3548, dtype: int64

## spacy remove stopwords and lemmatization
based on https://nbviewer.jupyter.org/github/matt8955/tweet-explorer/blob/master/flatiron_tweets.ipynb

In [33]:
def check_stop_punct(token):
    if token.is_stop or token.is_punct:
        return False
    else:
        return True

In [45]:
def clean_doc_lem_stop(document):
    no_stop_li = list(filter(check_stop_punct, document))
    lemma_li = list(map(lambda x: x.lemma_, no_stop_li))
    return " ".join(lemma_li)

In [46]:
testdoc = nlp("I'm a scorpion man.")
clean_doc_lem_stop(testdoc)

'scorpion man'

In [47]:
def remove_stop_lemmatize(text_series):
    """accepts a pandas series of strings and returns a lemmatized version
    that also drops stopwords.
    """
    doc_series = text_series.apply(nlp)
    # stopless is a list
    cleaned_series = doc_series.apply(clean_doc_lem_stop)
    return cleaned_series

In [49]:
df["text"].loc[0:5]

0    .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1    @jessedee Know about @fludapp ? Awesome iPad/i...
2    @swonderlin Can not wait for #iPad 2 also. The...
3    @sxsw I hope this year's festival isn't as cra...
4    @sxtxstate great stuff on Fri #SXSW: Marissa M...
Name: text, dtype: object

In [50]:
test_rows = remove_stop_lemmatize(df["text"].loc[0:5])

In [54]:
test_rows

'@swonderlin wait iPad 2 sale SXSW'

## Make train, test, and validation sets

In [30]:
x_full, y_full, x_test, y_test = train_test_split(df, target, test_size=.1)
full_train = pd.concat([x_full, y_full])
x_train, y_train, x_val, y_val = train_test_split(full_train, target, test_size=.2)

In [31]:
full_train

Unnamed: 0,index,text,directed_at,emotion,brand
8002,8002,Wishful: Several semis filled with Apple's iPa...,,Positive emotion,
1815,1815,Just got a free iPhone charger. Someone came u...,iPhone,Positive emotion,Apple
4516,4516,Man panhandling for an iPad 2 at SXSW. Whats t...,iPad,Negative emotion,Apple
964,964,"iPhone in one hand, caramel macchiato in the o...",iPhone,Positive emotion,Apple
5858,5858,RT @mention Google set to launch new social ne...,Google,Positive emotion,Google
...,...,...,...,...,...
456,456,Jeez guys dunno about an Apple pop-up over a G...,Apple,Negative emotion,Apple
8132,8132,Great #sxsw ipad app from @mention {link},iPad or iPhone App,Positive emotion,Apple
3866,3866,@mention is my favorite iphone app at the mome...,iPad or iPhone App,Positive emotion,Apple
3755,3755,Settling into first #SXSW session: #battledeck...,iPad or iPhone App,Positive emotion,Apple


In [32]:
x_train.to_csv("data/x_train.csv")
y_train.to_csv("data/y_train.csv")

x_val.to_csv("data/x_val.csv")
y_val.to_csv("data/y_val.csv")

x_test.to_csv("data/x_test.csv")
y_test.to_csv("data/y_test.csv")