In [1]:
from seaborn import heatmap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import spacy
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
import re

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
# my ascii cleaner function doesn't work unless we write the file to csv first
# for some reason.
df = pd.read_csv("https://query.data.world/s/nrbtbcd7jydlrybusl7mom5rvov4zd",
                 encoding = "ISO-8859-1")

df.to_csv("../data/twitter_dataset.csv")

function that removes non ascii characters based on  https://stackoverflow.com/questions/26541968/delete-every-non-utf-8-symbols-from-string

In [5]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [6]:
def file_to_ascii(filename, writename = None):
    """rewrites a file with only ascii. returns None unless there's an error"""
    if writename is None:
        writename = filename[:-4] + "_only_ascii.csv"
    cleaned_doc = []
    with open(filename, "r") as readfile:
        for line in readfile:
            line = str(line.strip())
            cleaned_doc.append(line.encode("ascii", "ignore"))

    try:
        with open(writename, "xb") as writefile:
            for i in cleaned_doc:
                writefile.write(i + b"\n")
    except BaseException as e:
        print(e)
        return e
    
    return None

In [7]:
file_to_ascii("../data/twitter_dataset.csv")

[Errno 17] File exists: '../data/twitter_dataset_only_ascii.csv'


FileExistsError(17, 'File exists')

In [22]:
df = pd.read_csv("../data/twitter_dataset_only_ascii.csv").drop(["Unnamed: 0"], axis = 1)

In [23]:
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [24]:
df.columns = ['text', 'directed_at', 'emotion']

1. Text, clean to have hastags and @symbols in a separate column
2. Have nlp ready column
3. directed@-> make "brand"


In [34]:
df.text.isnull().any()

False

In [51]:
df.drop(index=6, inplace=True)

In [52]:
df.text.isnull().any()

False

In [35]:
df.reset_index(inplace=True)

In [36]:
df.emotion.unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [37]:
df.emotion.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

In [38]:
df.directed_at.unique()

array(['iPhone', 'iPad or iPhone App', 'iPad', 'Google', nan, 'Android',
       'Apple', 'Android App', 'Other Google product or service',
       'Other Apple product or service'], dtype=object)

In [39]:
conditions = [(df.emotion=='No emotion toward brand or product'),
  (df.emotion=='Positive emotion'),
  (df.emotion=='Negative emotion'),
  (df.emotion=="I can't tell")]
cases=[0, 2, 1, 3]
df['target']= np.select(conditions, cases) 

In [40]:
df.target.value_counts()

0    5389
2    2978
1     570
3     156
Name: target, dtype: int64

In [41]:
apple= ['iPhone', 'iPad or iPhone App', 'iPad', 'Apple', 'Other Apple product or service']
google = ['Google', 'Android', 'Android App', 'Other Google product or service']
conditions = [
    (df.directed_at == 'iPhone')|(df.directed_at == 'iPad or iPhone App')|
    (df.directed_at == 'iPad')|(df.directed_at == 'Apple')|(df.directed_at == 'Other Apple product or service'),
    (df.directed_at == 'Google')|(df.directed_at == 'Android')|
    (df.directed_at == 'Android App')|(df.directed_at == 'Other Google product or service')]
cases=['Apple', 'Google']
df['brand']=np.select(conditions, cases, None)

In [16]:
df.head()

Unnamed: 0,index,text,directed_at,emotion,target,brand
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0,Apple
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1,Apple
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1,Apple
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0,Apple
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1,Google


In [42]:
df.groupby(df.brand).target.value_counts(normalize=True)

brand   target
Apple   2         0.809049
        1         0.161063
        0         0.026982
        3         0.002906
Google  2         0.819728
        1         0.148526
        0         0.029478
        3         0.002268
Name: target, dtype: float64

In [43]:
df.target.value_counts()

0    5389
2    2978
1     570
3     156
Name: target, dtype: int64

In [44]:
df.drop(index=df[df.target>2].index, inplace=True)
df.target.value_counts()

0    5389
2    2978
1     570
Name: target, dtype: int64

In [45]:
df.reset_index(inplace=True)

## spacy remove stopwords and lemmatization
based on https://nbviewer.jupyter.org/github/matt8955/tweet-explorer/blob/master/flatiron_tweets.ipynb

In [46]:
def check_stop_punct(token):
    if token.is_stop or token.is_punct:
        return False
    else:
        return True

In [47]:
def clean_doc_lem_stop(document):
    no_stop_li = list(filter(check_stop_punct, document))
    lemma_li = list(map(lambda x: x.lemma_, no_stop_li))
    return " ".join(lemma_li)

In [24]:
testdoc = nlp(".@I'm a scorpion man.")
clean_doc_lem_stop(testdoc)

".@I'm scorpion man"

In [48]:
def remove_stop_lemmatize(text_series):
    """accepts a pandas series of strings and returns a lemmatized version
    that also drops stopwords.
    """
    doc_series = text_series.apply(nlp)
    # stopless is a list
    cleaned_series = doc_series.apply(clean_doc_lem_stop)
    return cleaned_series

In [31]:
df["text"].loc[0:5]

0    .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1    @jessedee Know about @fludapp ? Awesome iPad/i...
2    @swonderlin Can not wait for #iPad 2 also. The...
3    @sxsw I hope this year's festival isn't as cra...
4    @sxtxstate great stuff on Fri #SXSW: Marissa M...
Name: text, dtype: object

In [32]:
test_rows = remove_stop_lemmatize(df["text"].loc[0:5])

In [53]:
df['doc']= remove_stop_lemmatize(df.text)

In [41]:
df.doc.isnull().any()

False

In [42]:
df.to_csv('../data/final.csv')

## Visualizations

In [30]:
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS

In [31]:
df.head()

Unnamed: 0,level_0,index,text,directed_at,emotion,target,brand,doc
0,0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0,Apple,.@wesley83 3 G iPhone 3 hrs tweet rise_austin ...
1,1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1,Apple,@jessedee know @fludapp Awesome iPad iPhone ap...
2,2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1,Apple,@swonderlin wait iPad 2 sale SXSW
3,3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0,Apple,@sxsw hope year festival crashy year iPhone ap...
4,4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1,Google,@sxtxstate great stuff Fri SXSW Marissa Mayer ...


### Count Vectorizer

In [76]:
from sklearn.feature_extraction.text import CountVectorizer


In [78]:
cv = CountVectorizer()
counts = cv.fit_transform(df.doc)
count_df = pd.DataFrame(counts.todense(),columns = cv.get_feature_names())


In [79]:
count_df['target']=df['target']
count_df['brand']=df['brand']

In [80]:
count_df.head()

Unnamed: 0,000,02,03,0310apple,08,10,100,100tc,101,106,10k,10mins,10x,11,11ntc,11th,12,120,12b,12th,13,130,14,1406,1413,1415,15,150,1500,157,15am,15k,15slides,16,16162,169,16mins,17,188,1986,1990style,1991,1k,1of,1pm,1st,20,200,2010,2011,2012,21,210,22,23,24,25,250k,25th,2b,2day,2honor,2moro,2nd,2nite,2s,2yrs,30,300,3000,30a,30am,30p,30pm,310409h2011,32,35,36,360,37,3blks,3d,3g,3gs,3rd,3x,40,400,40min,41,437,45,45am,47,48,4android,4chan,4nqv92l,4sq,4sq3,4square,50,54,55,58,59,59p,59pm,5hrs,5th,60,64,64gig,65,6hours,6th,70,75,7th,80,800,80s,81,82,83323324,83323414,89,8p,8th,90,900,911tweets,95,96,967,97,98,99,9th,a3xvwc6,aapl,abacus,abandon,aber,ability,able,abroad,absolute,absolutely,abt,abuzz,academy,acc,acceptable,access,accessibility,accessible,accessory,accesssxsw,accommodate,accord,accordion,account,acerbic,achieve,acknowledge,aclu,acquire,acrosse,act,action,activate,activation,activity,actor,actsofsharing,actual,actually,ad,adam,adams,adapt,adaptive,add,addict,addictedtotheinterwebs,addiction,addictive,addition,additional,address,adfonic,admire,admission,admit,ado,adopter,adoption,adpeopleproblem,advanced,advantage,advent,adventure,advertise,advertising,advice,advisory,aesthetic,affair,affirmative,afford,afraid,africans,afternoon,agchat,age,agencies,agency,agenda,agent,agents,agileagency,agnerd,ago,agree,ah,ahe,ahead,ahem,ahh,ahhh,aicn,aid,aim,air,airline,airlines,airplane,airport,airs,ajs2011,aka,akqas,al,alamo,alan,alarm,alas,album,alcoholic,...,vortex,vote,voxpop,vp,vs,vuelta,vufinder,vuitton,vuvuzela,w00,waaaaaa,wack,wait,wake,wakeup,wakeuplaughing,wal,walk,walkin,walking,wall,wallace,walmart,wam,wander,wanderer,wanna,wannabe,want,war,warm,warmth,warning,wars,wary,waste,watch,water,waterproof,watson,wave,way,waze,we,wean,wear,weasel,weather,web,web3,web30,webber,webdoc,webkit,webmail,webmaster,website,webvision,week,weekend,weep,weight,weinschenk,weird,welcome,welivehere,well,wesley83,west,wew,whale,what,whatcha,while,whimsical,white,whiteboarding,who,whoa,wholistic,whoohoo,whoooooo,whoops,whowillrise,whrrl,wi,wide,widfy,widget,wife,wifi,wii,wil,wild,wilderness,will,williams,willing,willpay,willpower,wilt,win,winamp,window,windows,windows7,wine,wings,winner,winning,wins,winssxsw,wintel,winwin,wipe,wire,wired,wireless,wisconsin,wise,wish,wishful,withdrawal,withme,without,witness,witty,wjchat,wk,wkd,wkend,wknd,woah,wodpress,woe,wohooo,wolfenstein,wolfram,woman,wonder,wonderful,woo,woohoo,wooooo,woops,woot,word,wordnerd,wordpress,words,work,worker,workin,workspace,world,worlds,worldwide,worried,worry,worse,worst,worth,worthwhile,wot,wow,wowwwwww,wozniak,wp7,wr,wrap,wrapper,write,writer,writing,wrong,wsj,wssxsw,wtf,wth,wundertablet,wut,wwsxsw,www,x6t1pi6av7,xbox,xd,xipad,xm,xmas,xml,xoom,xperia,xplat,xwave,ya,yai,yall,yawn,yay,yea,yeaayyy,yeah,year,years,yeasayer,yeay,yellow,yelp,yep,yer,yes,yesterday,yield,yikes,yo,yobongo,yonkers,york,you,youneedthis,your,youtube,yowza,yr,yrs,yummy,yup,zaarly,zaarlyiscoming,zagg,zaggle,zap,zappos,zazzle,zazzlesxsw,zazzlsxsw,ze,zelda,zeldman,zero,zimride,zing,zip,zite,zms,zombie,zomg,zone,zoom,zzzs
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Frequency Distribution

In [59]:
df.doc[1]

'@jessedee know @fludapp Awesome iPad iPhone app likely appreciate design give free Ts SXSW'

In [54]:
from collections import Counter

In [61]:
type(nlp(list(df.doc)[0]))

spacy.tokens.doc.Doc

In [56]:


# all tokens that arent stop words or punctuations
words = [token for doc in df.doc for token in doc if token.is_stop != True and token.is_punct != True]

# noun tokens that arent stop words or punctuations
nouns = [token for doc in df.doc for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == "NOUN"]

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(5)

# five most common noun tokens
noun_freq = Counter(nouns)
common_nouns = noun_freq.most_common(5)



AttributeError: 'str' object has no attribute 'is_stop'

### Tf-IDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
text_tf= tf.fit_transform(df['doc'])


In [28]:
tf_df=pd.DataFrame(text_tf.todense(), columns=tf.get_feature_names())

In [32]:
tf_df['target']=df['target']

In [39]:
tf_df['brand']=df['brand']

In [83]:
tf_idf_pos = tf_df[(tf_df.target==1)].drop(["brand",'target'],axis=1)
tf_idf_neg= tf_df[(tf_df.target==0)].drop(["brand",'target'],axis=1)

tf_idf_pos_apple = tf_df[(tf_df.target==1) & (tf_df.brand=='Apple')].drop(["brand",'target'],axis=1)
tf_idf_neg_apple = tf_df[(tf_df.target==0) & (tf_df.brand=='Apple')].drop(["brand",'target'],axis=1)

tf_idf_pos_google = tf_df[(tf_df.target==1) & (tf_df.brand=='Google')].drop(["brand",'target'],axis=1)
tf_idf_neg_google = tf_df[(tf_df.target==0) & (tf_df.brand=='Google')].drop(["brand",'target'],axis=1)

In [86]:
tf_idf_pos.sum().sort_values(ascending=False)[:40]

mention      224.511734
sxsw         193.660593
link         162.527036
apple        142.798015
ipad         140.110957
rt           127.200135
store        105.167513
google       100.844504
quot          84.106635
iphone        82.813804
app           79.997179
new           67.620159
austin        64.200922
pop           53.255712
ipad2         51.657430
launch        45.885249
open          45.282172
android       44.932493
win           41.825648
amp           40.988251
line          39.219741
party         36.513045
come          35.592639
cool          34.877582
get           34.085662
great         32.178002
time          31.732782
good          31.329506
day           30.565960
social        30.322765
free          30.304184
love          30.232420
temporary     30.103913
today         28.830810
downtown      28.309631
awesome       27.942313
check         27.771641
like          27.202642
go            26.477913
sxswi         26.115112
dtype: float64

In [88]:
tf_idf_neg.sum().sort_values(ascending=False)[:40]

sxsw        33.309138
mention     28.898188
quot        28.640779
iphone      24.318413
ipad        24.270114
google      19.590876
rt          16.827898
apple       15.961096
app         15.450086
link        12.956440
like         9.306551
design       9.001104
store        8.184329
need         7.884855
new          7.729177
social       7.673989
people       7.585123
think        7.199903
circles      7.025353
launch       6.994233
fail         6.667648
long         6.483199
look         6.379914
battery      6.271350
headache     5.896996
line         5.737410
android      5.479356
news         5.391246
time         5.339933
fascist      5.029461
today        4.994361
come         4.936545
america      4.928237
day          4.881867
austin       4.869708
money        4.822884
talk         4.782872
company      4.748838
pop          4.695831
phone        4.628119
dtype: float64

In [90]:
pos_terms = list(tf_idf_pos.sum().sort_values(ascending=False)[:1000].index)
neg_terms= list(tf_idf_neg.sum().sort_values(ascending=False)[:1000].index)
common_features = [term for term in pos_terms if term in neg_terms]
common_features[:10]

['mention',
 'sxsw',
 'link',
 'apple',
 'ipad',
 'rt',
 'store',
 'google',
 'quot',
 'iphone']

In [91]:
len(common_features)

422

In [None]:
df

In [72]:
tf_idf_pos_apple.max().sort_values(ascending=False)[:40]

tempt          0.956088
heart          0.915550
congrats       0.910311
wish           0.905429
rock           0.892308
pick           0.890417
covet          0.885907
giveaway       0.879995
oh             0.866807
hipstamatic    0.839240
zomg           0.836724
periscope      0.828499
joke           0.826003
blackberry     0.822718
depressed      0.822363
exquisite      0.818384
hollrback      0.813522
plane          0.812825
denote         0.809943
truck          0.801934
reason         0.794659
retweet        0.792893
groove         0.788816
short          0.786381
comer          0.783179
checkin        0.776913
squeal         0.771193
aapl           0.769275
take           0.765545
fxsw           0.761995
domo           0.761672
course         0.752375
suffer         0.751943
piss           0.750462
channel        0.744708
jealous        0.742894
battlela       0.741003
sale           0.739540
softlayer      0.738480
austincrowd    0.737955
dtype: float64

In [71]:
tf_idf_neg_apple.max().sort_values(ascending=False)[:40]

mad                 0.886900
battery             0.870707
delete              0.847368
quot                0.827184
pop                 0.816352
glad                0.784463
shame               0.765821
abacus              0.758276
magic               0.757121
ridic               0.755594
sigh                0.752624
dangerous           0.725249
smcomedyfyeah       0.694250
idiot               0.694250
stress              0.688579
wilt                0.688579
boo                 0.680610
autocorrect         0.679454
conflagration       0.664624
doofusness          0.664624
shit                0.663018
suck                0.661513
embarrassed         0.660133
betterthingstodo    0.655770
precommerce         0.654577
rear                0.648288
douchebag           0.634334
headache            0.629028
mock                0.625632
usage               0.623180
news                0.621446
sustainability      0.614131
blur                0.611406
srsly               0.608826
year          

In [84]:
tf_idf_pos_google.max().sort_values(ascending=False)[:40]

orly             0.856847
pi               0.851599
rocks            0.850580
whoooooo         0.839803
anxious          0.827014
proud            0.808192
tshirt           0.780688
gt               0.750793
party            0.748174
bread            0.734646
people           0.734305
wowwwwww         0.726202
pdanet           0.725810
arrive           0.720615
domo             0.720507
suckas           0.718473
butt             0.714037
group            0.711303
syke             0.706071
check            0.700733
calyp            0.696055
marcelosomers    0.695678
hook             0.682226
offer            0.678742
awesome          0.676773
realtime         0.675339
soundcloud       0.665674
lose             0.657568
157              0.654464
block            0.654299
view512          0.654060
winning          0.652104
quot             0.651982
android          0.645695
tooth            0.645610
grindr           0.639157
useful           0.639019
survive          0.636535
true        

In [82]:
tf_idf_neg_google.sum().sort_values(ascending=False)[:40]

google       16.539695
mention       7.968708
quot          7.261575
sxsw          7.210344
social        5.705674
circles       5.452110
rt            5.294677
launch        4.782695
android       3.722686
link          3.312102
today         3.186554
product       3.179859
network       3.144058
new           2.921480
service       2.607064
bing          2.593906
need          2.569421
major         2.384683
fail          2.358433
mayer         2.327612
care          2.268219
app           2.014567
solution      2.002699
business      1.984743
vs            1.972296
lose          1.919931
not           1.893292
technical     1.873186
tv            1.816360
call          1.796950
way           1.770431
user          1.764733
tag           1.597295
suck          1.571639
think         1.543316
possibly      1.508791
time          1.468665
comment       1.424555
circle        1.393609
room          1.381906
dtype: float64

## Make train, test, and validation sets

In [None]:
target = df["target"]
features = df.drop(["target"],axis=1)
target

In [35]:
x_full, x_test, y_full, y_test = train_test_split(df, target, test_size=.1)
x_train, x_val, y_train, y_val = train_test_split(x_full, y_full, test_size=.2)

In [36]:
x_val

Unnamed: 0,index,text,directed_at,emotion,brand
4122,4122,nice! Apple to open pop-up store during #sxsw ...,Apple,Positive emotion,Apple
4240,4240,We're at Lustre Pearl for #teamandroidsxsw cau...,Android,Positive emotion,Google
6761,6761,RT @mention The next fin serv battle is vs App...,Google,Positive emotion,Google
5722,5722,RT @mention First person to stop by our booth ...,iPad or iPhone App,Positive emotion,Apple
7780,7780,So cool! RT @mention Updated NPR Music iPhone ...,iPad or iPhone App,Positive emotion,Apple
...,...,...,...,...,...
1790,1790,The iPad 2 Takes Over #SXSW [VIDEO]: {link},iPad,Positive emotion,Apple
5387,5387,RT @mention And it will suck. RT @mention RT @...,Other Google product or service,Negative emotion,Google
3754,3754,Just ran into @mention and apparently @mention...,Android App,Positive emotion,Google
1649,1649,Nice! RT @mention Yes! Gowalla wins best Andoi...,Android App,Positive emotion,Google


In [37]:
x_train.to_csv("../data/x_train.csv")
y_train.to_csv("../data/y_train.csv")

x_val.to_csv("../data/x_val.csv")
y_val.to_csv("../data/y_val.csv")

x_test.to_csv("../data/x_test.csv")
y_test.to_csv("../data/y_test.csv")