<h1>Twitter Sentiment Analysis</h1>

***

<h2>Natural Language Processing (NLP)</h2>


## LIBS

In [27]:
import pandas as pd
import numpy as np

#visualization
import seaborn as sns
import matplotlib.pyplot as plt

import re

import nltk
# Fetch the NLTK resources used further down: 'stopwords' for
# nltk.corpus.stopwords and 'rslp' (presumably the data behind
# nltk.stem.RSLPStemmer, which apply_stammer instantiates).
nltk.download('stopwords')
nltk.download('rslp')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daianeklein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     /Users/daianeklein/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

<h2>Functions</h2>

In [40]:
def apply_stammer(text, stemmer=None, emoji_map=None):
    """Clean and stem a collection of tweets.

    Each tweet is lowercased, then:

    1. URLs are replaced by the token ``URL``.
    2. Emoticons are replaced by ``EMOJI<meaning>``.
    3. ``@mentions`` are replaced by the token ``USER``.
    4. Every remaining non-alphanumeric character becomes a space.
    5. Any character repeated three or more times is squeezed to two.
    6. Tokens longer than one character are stemmed and re-joined.

    Parameters
    ----------
    text : iterable of str, or str
        Tweets to process. A bare string is treated as a single tweet
        instead of being iterated character by character.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every kept token. Defaults to
        ``nltk.stem.RSLPStemmer()``, which is what this function has always
        used.
        NOTE(review): RSLP is NLTK's Portuguese stemmer, while the stop-word
        list loaded in this notebook is English -- consider passing an
        English stemmer such as ``nltk.stem.PorterStemmer()``.
    emoji_map : dict of str -> str, optional
        Emoticon -> meaning table. Defaults to the notebook-level ``emojis``
        dict, looked up at call time.

    Returns
    -------
    list of str
        One cleaned string per tweet. Every kept token is followed by a
        single space, so non-empty results end with a trailing space.
    """
    if isinstance(text, str):
        text = [text]
    if stemmer is None:
        stemmer = nltk.stem.RSLPStemmer()
    if emoji_map is None:
        emoji_map = emojis

    # Patterns are compiled once, outside the per-tweet loop.
    url_pattern = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_pattern = re.compile(r"@[^\s]+")
    alpha_pattern = re.compile(r"[^a-zA-Z0-9]")
    sequence_pattern = re.compile(r"(.)\1\1+")
    seq_replace_pattern = r"\1\1"

    # Tweets are lowercased before emoticon matching, so the keys must be
    # lowercased too or entries such as ':P' and ':-D' can never match.
    # Longest emoticons go first so that 'O:-)' is not pre-empted by the
    # shorter ':-)' it contains; the sort is stable, so ties keep dict order.
    emoji_table = sorted(
        ((emoticon.lower(), meaning) for emoticon, meaning in emoji_map.items()),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    sentences_stemmer = []
    for tweet in text:
        tweet = tweet.lower()

        # Replace all URLs with 'URL'.
        tweet = url_pattern.sub(' URL', tweet)
        # Replace all emoticons with 'EMOJI<meaning>'.
        for emoticon, meaning in emoji_table:
            tweet = tweet.replace(emoticon, "EMOJI" + meaning)
        # Replace @USERNAME with 'USER'.
        tweet = user_pattern.sub(' USER', tweet)
        # Replace every non-alphanumeric character with a space.
        tweet = alpha_pattern.sub(" ", tweet)
        # Squeeze 3 or more consecutive identical characters down to 2.
        tweet = sequence_pattern.sub(seq_replace_pattern, tweet)

        # Single-character tokens are dropped; the rest are stemmed.
        stems = [stemmer.stem(word) for word in tweet.split() if len(word) > 1]
        # Keep the historical output format: each token followed by a space.
        sentences_stemmer.append("".join(stem + " " for stem in stems))

    return sentences_stemmer

In [3]:
# Column labels to assign to the CSV, in file order.
df_columns = ["sentiment", "ids", "date", "flag", "user", "text"]

# Read the raw tweets; `names=` supplies the labels defined above.
df_raw = pd.read_csv('data/twitter-data.csv', names=df_columns, encoding='latin-1')

df_raw.head()

Unnamed: 0,sentiment,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Dataset analysis

In [4]:
# Work on a copy so that df_raw itself is not modified by later cells.
df1 = df_raw.copy()

In [5]:
# Summary statistics of the numeric columns.
df1.describe()

Unnamed: 0,sentiment,ids
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


In [6]:
# Number of missing values in each column.
df1.isnull().sum()

sentiment    0
ids          0
date         0
flag         0
user         0
text         0
dtype: int64

In [7]:
# Class balance of the target variable (`sentiment`).
df1['sentiment'].value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

In [8]:
# Recode label 4 as 1, leaving every other value unchanged.
df1['sentiment'] = df1['sentiment'].replace({4: 1})
# Class balance of the target variable after recoding.
df1['sentiment'].value_counts()

0    800000
1    800000
Name: sentiment, dtype: int64

In [9]:
# Peek at three tweets labelled 1. A fixed random_state keeps the displayed
# rows identical across "Restart & Run All".
df1[df1['sentiment'] == 1].sample(3, random_state=39)


Unnamed: 0,sentiment,ids,date,flag,user,text
1101287,1,1970744737,Sat May 30 05:03:33 PDT 2009,NO_QUERY,GoPBirthdayBash,@Zhiqing I love GOP cause of everything HAHAHA...
1199056,1,1985249459,Sun May 31 16:44:01 PDT 2009,NO_QUERY,chicrunner,@kjs72 thanks!!
1438046,1,2061267647,Sat Jun 06 20:31:00 PDT 2009,NO_QUERY,charlesjurries,@aaronlafferty UNLESS.... Unless Liam Neeson r...


In [10]:
# Peek at three tweets labelled 0. A fixed random_state keeps the displayed
# rows identical across "Restart & Run All".
df1[df1['sentiment'] == 0].sample(3, random_state=39)

Unnamed: 0,sentiment,ids,date,flag,user,text
618331,0,2227482599,Thu Jun 18 13:15:10 PDT 2009,NO_QUERY,merderfan89,@SnapshotLexie I know. I'm so sad about it.
290899,0,1995524322,Mon Jun 01 13:37:43 PDT 2009,NO_QUERY,rawisner,Wishing there was a band aid big enough to fix...
369728,0,2049958704,Fri Jun 05 17:48:30 PDT 2009,NO_QUERY,Flower7777,@Mcpattz I personally hate technology sometim...


In [11]:
# Keep only the label and the tweet text; all other columns are dropped.
df1 = df1[['sentiment','text']]

<h2>Pre-Processing Text</h2>

In [23]:
# Separate copy of df1 for the text pre-processing stage.
df2 = df1.copy()

In [12]:
# Emoticon -> meaning lookup table.
# apply_stammer loops over this dict, so the entry order is kept as-is.
emojis = {
    ':)': 'smile',
    ':-)': 'smile',
    ';d': 'wink',
    ':-E': 'vampire',
    ':(': 'sad',
    ':-(': 'sad',
    ':-<': 'sad',
    ':P': 'raspberry',
    ':O': 'surprised',
    ':-@': 'shocked',
    ':@': 'shocked',
    ':-$': 'confused',
    ':\\': 'annoyed',
    ':#': 'mute',
    ':X': 'mute',
    ':^)': 'smile',
    ':-&': 'confused',
    '$_$': 'greedy',
    '@@': 'eyeroll',
    ':-!': 'confused',
    ':-D': 'smile',
    ':-0': 'yell',
    'O.o': 'confused',
    '<(-_-)>': 'robot',
    'd[-_-]b': 'dj',
    ":'-)": 'sadsmile',
    ';)': 'wink',
    ';-)': 'wink',
    'O:-)': 'angel',
    'O*-)': 'angel',
    '(:-D': 'gossip',
    '=^.^=': 'cat',
}


In [21]:
# Load NLTK's English stop-word list (nothing is removed from the tweets
# here) and preview its first ten entries.
stop_words_nltk = nltk.corpus.stopwords.words('english')

stop_words_nltk[0:10]


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [43]:
# Sanity check: run apply_stammer on a fixed-seed sample of ten tweets.
df2_sample = df2.sample(n=10, random_state=39)
apply_stammer(df2_sample['text'].tolist())

['dont feel to good still deciding if will go out aft all but im still happy ',
 'mwah ha ha ha know wher you are and what yo re doing ',
 'lov my shirt think that look good on me ',
 'relaxing with my boo ',
 'hav just read the pap and just dont get kati pric it all so sad ',
 'it really going to tak me whil to adjust to not having my cat ',
 'just dropped biscuit in her cup of tea end of the world ',
 'scratch that just ask and be ready to accept the respons ',
 'need an extern microphon that work ',
 'the sum gon and so ha the hot weath ']