In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib  inline

In [6]:
"""
 analyze Twitter sentiment about Apple and Google products. The dataset comes from CrowdFlower via data.world. 
     Human raters rated the sentiment in over 9,000 Tweets as positive, negative, or neither.

Your task is to:

Build a model that can rate the sentiment of a Tweet based on its content.

Aim for a Proof of Concept
There are many approaches to NLP problems - start with something simple and iterate from there. 
For example, you could start by limiting your analysis to positive and negative Tweets only, allowing you to build a binary classifier. 
Then you could add in the neutral Tweets to build out a multiclass classifier. 
You may also consider using some of the more advanced NLP methods in the Mod 4 Appendix.

Evaluation
Evaluating multiclass classifiers can be trickier than binary classifiers because there are multiple ways to 
    mis-classify an observation, and some errors are more problematic than others. 
    Use the business problem that your NLP project sets out to solve to inform your choice of evaluation metrics.

"""

df = pd.read_csv('judge-1377884607_tweet_product_company.csv',encoding= 'unicode_escape')

In [7]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [8]:
#df['tweet_text'].to_list()
df.shape

(9093, 3)

In [9]:
df.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

In [10]:
#df = df.drop('emotion_in_tweet_is_directed_at',axis=1, inplace=True)
df_data = df[['tweet_text','is_there_an_emotion_directed_at_a_brand_or_product']]

In [11]:
df_data.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [12]:
df_data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
df_data.isna().sum()

tweet_text                                            0
is_there_an_emotion_directed_at_a_brand_or_product    0
dtype: int64

In [14]:
df_data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [15]:
df_data_1 = df_data.copy()

In [16]:
#df['trump_tweet'] = (df['source']=='Twitter for Android').astype(int)
#df_data_1['sentiment'] = (df_data_1['is_there_an_emotion_directed_at_a_brand_or_product']== 'No emotion toward brand or product').astype(int)
#df['n'] = df['n'].replace({'a': 'x', 'b': 'y', 'c': 'w', 'd': 'z'})

df_data_1['sentiment'] = df_data_1['is_there_an_emotion_directed_at_a_brand_or_product'].replace({
               'No emotion toward brand or product':0,
               'Positive emotion':1,
               'Negative emotion':2,
               "I can't tell":3})


In [17]:
"""
0    No emotion toward brand or product
1    Positive emotion
2    Negative emotion
3    I can't tell

"""
df_data_1['sentiment'].value_counts()

0    5388
1    2978
2     570
3     156
Name: sentiment, dtype: int64

In [18]:
df_data_1.isna().sum()

tweet_text                                            0
is_there_an_emotion_directed_at_a_brand_or_product    0
sentiment                                             0
dtype: int64

In [19]:
import re

In [20]:
def remove_pattern(input_text,pattern):
    r = re.findall(pattern,input_text)
    for i in r:
        input_txt = re.sub(i,'', input_text)
        return input_txt

In [21]:
df_data_1['tiddy_tweet'] = np.vectorize(remove_pattern)(df_data_1['tweet_text'],'@[\w]*')

In [22]:
df_data_1.head(10)

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,sentiment,tiddy_tweet
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,2,. I have a 3G iPhone. After 3 hrs tweeting at ...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,1,Know about @fludapp ? Awesome iPad/iPhone app...
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,1,Can not wait for #iPad 2 also. They should sa...
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,2,I hope this year's festival isn't as crashy a...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,1,great stuff on Fri #SXSW: Marissa Mayer (Goog...
5,@teachntech00 New iPad Apps For #SpeechTherapy...,No emotion toward brand or product,0,New iPad Apps For #SpeechTherapy And Communic...
7,"#SXSW is just starting, #CTIA is around the co...",Positive emotion,1,
8,Beautifully smart and simple idea RT @madebyma...,Positive emotion,1,Beautifully smart and simple idea RT @thenext...
9,Counting down the days to #sxsw plus strong Ca...,Positive emotion,1,
10,Excited to meet the @samsungmobileus at #sxsw ...,Positive emotion,1,Excited to meet the at #sxsw so I can show th...


### Removing Punctuations, Numbers, and special Characters

In [23]:
#replace w/space
df_data_1['tiddy_tweet'] = df_data_1['tiddy_tweet'].str.replace('[^a-zA-Z#]+',' ')

In [24]:
df_data_1.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,sentiment,tiddy_tweet
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,2,I have a G iPhone After hrs tweeting at #RISE...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,1,Know about fludapp Awesome iPad iPhone app th...
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,1,Can not wait for #iPad also They should sale ...
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,2,I hope this year s festival isn t as crashy a...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,1,great stuff on Fri #SXSW Marissa Mayer Google...


In [None]:
df_data_1

### Remove Short Words

In [25]:
#df_data_1['tiddy_tweet'] = df_data_1['tiddy_tweet'].apply(lambda x: [for w in ])

SyntaxError: invalid syntax (<ipython-input-25-a33d36e4d231>, line 1)

In [None]:
df_data['tweet_text'].isnull().values.any()

In [None]:
df_data['tweet_text'].isna().sum()

In [None]:
df_data[df_data['tweet_text']== np.NaN]

In [None]:
df

In [24]:
## Create a variable "corpus" containing all text

corpus = df_data['tweet_text'].to_list()
corpus[:10]

['.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.',
 "@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",
 '@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.',
 "@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw",
 "@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",
 '@teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd',
 '#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan',
 'Beautifully smart and simple idea RT @madebymany @thenextweb wrote about our #hollergram iPad app for #sxsw! http://bit.ly/ieaV

In [25]:
#Make a Bag-of-Words Frequency Distribution
from nltk import FreqDist

In [26]:
#values = ','.join(str(v) for v in corpus)
freq = FreqDist(','.join(corpus))
## Display 100 most common words
freq.most_common(100)


[(' ', 153873),
 ('e', 72593),
 ('o', 62850),
 ('n', 56651),
 ('t', 55549),
 ('i', 51978),
 ('a', 46024),
 ('s', 43823),
 ('r', 31903),
 ('l', 30420),
 ('h', 21549),
 ('p', 21506),
 ('d', 20073),
 ('m', 19206),
 ('u', 18363),
 ('w', 15979),
 ('#', 15875),
 ('g', 15602),
 ('c', 14950),
 ('S', 13550),
 (',', 12649),
 ('y', 11753),
 ('k', 10404),
 ('f', 9839),
 ('.', 8382),
 ('b', 7514),
 ('@', 7194),
 ('W', 6831),
 ('x', 6460),
 ('T', 5977),
 ('P', 5701),
 ('A', 5597),
 ('v', 5258),
 ('X', 5211),
 ('}', 4298),
 ('{', 4296),
 ('R', 3865),
 ('G', 3585),
 ('I', 2982),
 ("'", 2903),
 (';', 2800),
 ('&', 2707),
 ('C', 2689),
 ('-', 2438),
 ('M', 2412),
 ('!', 2398),
 (':', 2310),
 ('2', 2213),
 ('q', 2098),
 ('N', 1898),
 ('?', 1659),
 ('B', 1476),
 ('L', 1446),
 ('D', 1332),
 ('H', 1295),
 ('1', 1248),
 ('F', 1161),
 ('O', 1110),
 ('j', 1054),
 ('0', 1040),
 ('/', 1019),
 ('E', 1013),
 (')', 801),
 ('(', 770),
 ('U', 740),
 ('z', 721),
 ('\x89', 691),
 ('3', 616),
 ('Û', 582),
 ('J', 563),
 

In [27]:
#fdist1[100]
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/davidtorres/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
from nltk import word_tokenize
tokens = word_tokenize(','.join(corpus))
tokens[:100]

['.',
 '@',
 'wesley83',
 'I',
 'have',
 'a',
 '3G',
 'iPhone',
 '.',
 'After',
 '3',
 'hrs',
 'tweeting',
 'at',
 '#',
 'RISE_Austin',
 ',',
 'it',
 'was',
 'dead',
 '!',
 'I',
 'need',
 'to',
 'upgrade',
 '.',
 'Plugin',
 'stations',
 'at',
 '#',
 'SXSW.',
 ',',
 '@',
 'jessedee',
 'Know',
 'about',
 '@',
 'fludapp',
 '?',
 'Awesome',
 'iPad/iPhone',
 'app',
 'that',
 'you',
 "'ll",
 'likely',
 'appreciate',
 'for',
 'its',
 'design',
 '.',
 'Also',
 ',',
 'they',
 "'re",
 'giving',
 'free',
 'Ts',
 'at',
 '#',
 'SXSW',
 ',',
 '@',
 'swonderlin',
 'Can',
 'not',
 'wait',
 'for',
 '#',
 'iPad',
 '2',
 'also',
 '.',
 'They',
 'should',
 'sale',
 'them',
 'down',
 'at',
 '#',
 'SXSW.',
 ',',
 '@',
 'sxsw',
 'I',
 'hope',
 'this',
 'year',
 "'s",
 'festival',
 'is',
 "n't",
 'as',
 'crashy',
 'as',
 'this',
 'year',
 "'s",
 'iPhone',
 'app']

In [29]:
freq = FreqDist(tokens)
freq.most_common(100)

[('#', 15875),
 (',', 12553),
 ('@', 7194),
 ('mention', 7119),
 ('.', 4929),
 ('SXSW', 4737),
 ('sxsw', 4478),
 ('link', 4311),
 ('}', 4298),
 ('{', 4296),
 ('the', 3928),
 ('to', 3519),
 ('RT', 2947),
 ('at', 2859),
 (';', 2800),
 ('&', 2707),
 ('for', 2440),
 ('!', 2398),
 ('a', 2174),
 ('Google', 2135),
 ('iPad', 2116),
 (':', 2075),
 ('Apple', 1880),
 ('in', 1830),
 ('quot', 1696),
 ('of', 1691),
 ('?', 1659),
 ('is', 1649),
 ('and', 1526),
 ('I', 1461),
 ('iPhone', 1301),
 ('on', 1271),
 ("'s", 1232),
 ('2', 1114),
 ('store', 1049),
 ('-', 972),
 ('you', 944),
 ('Austin', 900),
 ('an', 853),
 ('amp', 836),
 ('with', 805),
 (')', 801),
 ('up', 778),
 ('(', 770),
 ('it', 767),
 ('my', 711),
 ('app', 630),
 ('...', 591),
 ('Circles', 589),
 ('new', 566),
 ('be', 544),
 ('New', 519),
 ('from', 505),
 ('this', 496),
 ('by', 485),
 ('The', 483),
 ("n't", 479),
 ('out', 478),
 ('that', 468),
 ('are', 456),
 ('google', 448),
 ('Android', 445),
 ('your', 431),
 ('not', 427),
 ('Store', 42

In [30]:
from nltk.corpus import stopwords
import string

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidtorres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# Get all the stop words in the English language
stopwords_list = stopwords.words('english')
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each