# Sentiment Analysis Notebook

## Dataset

The data is provided as comma-separated values (CSV) files containing tweets and their corresponding sentiments. The training dataset is a CSV file with rows of the form
`tweet_id,sentiment,tweet`, where `tweet_id` is a unique integer identifying the tweet, `sentiment` is either 1 (positive) or 0 (negative), and `tweet` is the tweet text enclosed in double quotes.
In the training file loaded below these columns are named `ItemID`, `Sentiment` and `SentimentText`. Similarly, the test dataset is a CSV file with rows of the form `tweet_id,tweet`.

In [None]:
import pandas as pd
import sys
import nltk
sys.path.insert(0, 'src')

In [12]:
# Fetch, by name, the NLTK resources this notebook relies on (WordNet for the
# lemmatizer, SentiWordNet for the sentiment scores, the perceptron tagger for
# nltk.pos_tag). Calling nltk.download() with no arguments opens the
# interactive downloader, which blocks an unattended "Restart & Run All".
# NOTE(review): newer NLTK releases may publish some of these resources under
# additional/different ids — confirm against the installed NLTK version.
for resource in ('wordnet', 'sentiwordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
# Load the labelled training tweets; the file is decoded as ISO-8859-1.
dataset = pd.read_csv('dataset/train.csv', encoding = "ISO-8859-1")
# Show the first rows as the cell output.
dataset.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


## Preprocessing

### Preprocessing functions

In [3]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

def handle_emojis(tweet):
    """Replace text emoticons in `tweet` with sentiment placeholder tokens.

    Positive emoticons (smile, laugh, love, wink) become ' EMO_POS ' and
    negative ones (sad, cry) become ' EMO_NEG '. The placeholders are padded
    with spaces so they end up as separate tokens after whitespace splitting.

    The emoticons that contain the letter D (":D", "xD", ";D", ...) are
    matched case-insensitively so they are still recognised when the caller
    has already lowercased the text. To avoid firing inside ordinary text once
    case is ignored, the D run must not be followed by a letter or digit, and
    the "x" of "xD" must not be preceded by one.

    Args:
        tweet: the tweet text (str).

    Returns:
        The tweet with every recognised emoticon replaced by its placeholder.
    """
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D (any case, repeated D allowed, e.g. :DDD)
    tweet = re.sub(r'(:\s?d+|:-d+|(?<![a-z0-9])x-?d+)(?![a-z0-9])',
                   ' EMO_POS ', tweet, flags=re.IGNORECASE)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-; (the D variants in any case)
    tweet = re.sub(r'(;-?\)|;-?d+(?![a-z0-9])|\(-?;)',
                   ' EMO_POS ', tweet, flags=re.IGNORECASE)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet

def preprocess_word(word):
    """Clean a single whitespace-delimited token.

    Steps, in order: trim surrounding punctuation, cap any run of three or
    more identical characters at exactly three, then trim leading/trailing
    hyphens, ampersands and apostrophes.

    Args:
        word: the raw token (str).

    Returns:
        The cleaned token; may be the empty string.
    """
    # trim punctuation from both ends
    trimmed = word.strip('\'"?!,.():;')
    # "soooooo" -> "sooo": runs of 3+ repeated characters are cut down to 3
    squeezed = re.sub(r'(.)\1\1+', r'\1\1\1', trimmed)
    # trim leftover -, & and ' from both ends
    return squeezed.strip('-&\'')

def is_valid_word(word):
    """Return True when `word` starts with an ASCII letter and otherwise
    contains only ASCII letters, digits, dots and underscores."""
    # The pattern is anchored with ^, so matching from the start of the
    # string is equivalent to searching for it.
    return bool(re.match(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word))

def preprocess_tweet(tweet, use_stemmer=False, use_lemmatizer=False):
    """Normalise a raw tweet into a space-separated string of clean tokens.

    The tweet is lowercased, URLs and @mentions are replaced by the
    placeholders 'URL' and 'USER_MENTION', the '#' of hashtags and standalone
    "rt" markers are removed, runs of dots become a space, emoticons are
    replaced through handle_emojis (which therefore receives lowercased
    text), and every remaining token is cleaned with preprocess_word and kept
    only if is_valid_word accepts it.

    Args:
        tweet: the raw tweet text (str).
        use_stemmer: if True, apply the Porter stemmer to every kept token.
        use_lemmatizer: if True (and use_stemmer is False), apply the WordNet
            lemmatizer to every kept token. The stemmer takes precedence when
            both flags are set.

    Returns:
        The kept tokens joined by single spaces (possibly an empty string).
    """
    # convert tweet to lowercase
    tweet = tweet.lower()
    # replace urls with 'URL'; the dot after "www" is escaped so it only
    # matches a literal dot rather than any character
    tweet = re.sub(r'((www\.[\S]+)|(https?://.[\S]+))', 'URL', tweet)
    # replace user mentions @user with 'USER_MENTION'
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # replace #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1', tweet)
    # remove retweet RT
    tweet = re.sub(r'\brt\b', '', tweet)
    # replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # remove surrounding spaces, " and ' (str.strip returns a new string,
    # so the result has to be assigned back)
    tweet = tweet.strip('" \'')
    # handle emojis. Use only EMO_POS and EMO_NEG
    tweet = handle_emojis(tweet)
    # replace multiple spaces with only one space
    tweet = re.sub(r'\s+', ' ', tweet)

    # Build only the normaliser that was asked for; stemming wins over
    # lemmatizing when both flags are set.
    if use_stemmer:
        normalize = PorterStemmer().stem
    elif use_lemmatizer:
        normalize = WordNetLemmatizer().lemmatize
    else:
        normalize = None

    # preprocess words
    processed_words = []
    for word in tweet.split():
        word = preprocess_word(word)
        if is_valid_word(word):
            if normalize is not None:
                word = str(normalize(word))
            processed_words.append(word)
    return ' '.join(processed_words)

In [4]:
#tweet = "Hi, how is going on? :) CIAO COME vaaaaa? :) :( https://tartarus.org/martin/PorterStemmer/  bella @ivocerti    #sentimentanalysis"
for i in range(4):
    tweet = dataset.loc[i]['SentimentText']
    print('ORIGINAL: ' + tweet)
    print('STEMMER: ' + preprocess_tweet(tweet, use_stemmer=True))
    print('LEMMATIZER: ' + preprocess_tweet(tweet, use_lemmatizer=True))
    print('----------------------------------------------------------------')

ORIGINAL:                      is so sad for my APL friend.............
STEMMER: is so sad for my apl friend
LEMMATIZER: is so sad for my apl friend
----------------------------------------------------------------
ORIGINAL:                    I missed the New Moon trailer...
STEMMER: i miss the new moon trailer
LEMMATIZER: i missed the new moon trailer
----------------------------------------------------------------
ORIGINAL:               omg its already 7:30 :O
STEMMER: omg it alreadi o
LEMMATIZER: omg it already o
----------------------------------------------------------------
ORIGINAL:           .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
STEMMER: omgaga im sooo im gunna cri been at thi dentist sinc i wa supos just get a crown put on
LEMMATIZER: omgaga im sooo im gunna cry been at this dentist since i wa suposed just get a crown put on
----------------------------------------------------------------


### Preprocess the whole dataset

In [None]:
# Write one "ItemID,Sentiment,SentimentText" line per training tweet, using
# the default preprocessing (no stemming, no lemmatization). The `with` block
# guarantees the file is flushed and closed even if preprocessing raises
# part-way through the dataset.
with open('dataset/preprocessed_train.csv', 'w') as preprocessed_train_file:
    preprocessed_train_file.write('ItemID,Sentiment,SentimentText\n')
    for index, row in dataset.iterrows():
        # ItemID is written as the 1-based position derived from the frame index
        preprocessed_train_file.write(str(index+1) + ',' + str(row['Sentiment']) + ',' + preprocess_tweet(row['SentimentText']) + '\n')
        # '\r' returns to the start of the line so the progress counter
        # overwrites itself instead of flooding the output
        sys.stdout.write('\r')
        sys.stdout.write('Processing ' + str(index+1) + '/' + str(dataset.shape[0]))
        sys.stdout.flush()

In [6]:
# Read back the preprocessed tweets written to dataset/preprocessed_train.csv.
preprocessed_dataset = pd.read_csv('dataset/preprocessed_train.csv', encoding = "ISO-8859-1")
# Show the first rows as the cell output.
preprocessed_dataset.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend
1,2,0,i missed the new moon trailer
2,3,1,omg its already o
3,4,0,omgaga im sooo im gunna cry been at this denti...
4,5,0,i think mi bf is cheating on me t_t


### Feature parameters (Word, SentiWordNet, POS + POS_Word, Semantic)

In [22]:
from nltk.corpus import sentiwordnet as swn

def word_to_SWNt(word):
    """Map a word to a coarse SentiWordNet polarity token.

    Only the first entry SentiWordNet yields for `word` is considered. Its
    positive and negative scores are scaled by 10 and rounded; the larger of
    the two decides the token. The objectivity score is not consulted.

    Args:
        word: a single token (str).

    Returns:
        'POS-X' if the rounded positive score X is greater than the negative
        one, 'NEG-X' if the rounded negative score X is greater, and the word
        itself when the two are equal or SentiWordNet has no entry for it.
    """
    # Only the first entry is needed, so take it straight from the iterable
    # instead of materialising every sense into a list.
    first_sense = next(iter(swn.senti_synsets(word)), None)
    if first_sense is not None:
        pos = round(first_sense.pos_score()*10)
        neg = round(first_sense.neg_score()*10)
        if pos > neg:
            return 'POS-' + str(pos)
        elif neg > pos:
            return 'NEG-' + str(neg)
    # no entry, or positive and negative scores tie after rounding
    return word

def tweet_to_feature_word(tweet):
    """Word feature: return the tweet unchanged."""
    # just return the tweet as it is
    return tweet

def tweet_to_feature_SWNt(tweet):
    """SentiWordNet feature: pass every whitespace-separated token of `tweet`
    through word_to_SWNt and join the results with single spaces."""
    return ' '.join(word_to_SWNt(token) for token in tweet.split())

def tweet_to_feature_POS(tweet):
    """POS feature: the part-of-speech tags of the tweet's tokens, followed by
    one 'TAG_token' item per token, all joined with single spaces.

    Example: 'is so sad' -> 'VBZ RB JJ VBZ_is RB_so JJ_sad'.
    """
    tokens = tweet.split()
    # nltk.pos_tag yields (token, tag) pairs; keep only the tag of each pair
    tags = [pair[1] for pair in nltk.pos_tag(tokens)]
    # second half of the feature: every tag glued to the token it belongs to
    tagged_tokens = [tag + '_' + token for tag, token in zip(tags, tokens)]
    return ' '.join(tags + tagged_tokens)

def tweet_to_feature_semantic(tweet):
    """Semantic feature: currently returns the tweet unchanged (no semantic
    processing is done here)."""
    return tweet

In [23]:
# Show the word, SentiWordNet and POS feature strings for the first four
# preprocessed tweets.
for i in range(4):
    tweet = preprocessed_dataset.loc[i]['SentimentText']
    print('WORD: ' + tweet_to_feature_word(tweet))
    print('SWNt: ' + tweet_to_feature_SWNt(tweet))
    print('POS: ' + tweet_to_feature_POS(tweet))
    # the semantic feature is not printed
    #print('Semantic: ' + tweet_to_feature_semantic(tweet))
    print('----------------------------------------------------------------')

WORD: is so sad for my apl friend
SWNt: POS-2 so NEG-8 for my apl POS-1
POS: VBZ RB JJ IN PRP$ NN NN VBZ_is RB_so JJ_sad IN_for PRP$_my NN_apl NN_friend
----------------------------------------------------------------
WORD: i missed the new moon trailer
SWNt: i NEG-2 the POS-4 moon trailer
POS: NN VBD DT JJ NN NN NN_i VBD_missed DT_the JJ_new NN_moon NN_trailer
----------------------------------------------------------------
WORD: omg its already o
SWNt: omg its POS-1 o
POS: VB PRP$ RB VB VB_omg PRP$_its RB_already VB_o
----------------------------------------------------------------
WORD: omgaga im sooo im gunna cry been at this dentist since i was suposed just get a crown put on
SWNt: omgaga im sooo im gunna cry POS-2 at this dentist since i was suposed POS-6 get a crown put on
POS: JJ NN NN NN NN NN VBN IN DT NN IN NN VBD VBN RB VB DT NN NN IN JJ_omgaga NN_im NN_sooo NN_im NN_gunna NN_cry VBN_been IN_at DT_this NN_dentist IN_since NN_i VBD_was VBN_suposed RB_just VB_get DT_a NN_crow

In [21]:
# Scratch check of nltk.pos_tag on the preprocessed tweet with index label 1.
sentence = preprocessed_dataset.loc[1]['SentimentText']
tokens = sentence.split()
print(tokens)
# tag of the first token only
print(nltk.pos_tag(tokens)[0][1])
# tag of every token, one per line (each item of `a` is indexed as [1] for its tag)
a = nltk.pos_tag(tokens)
for w in a:
    print(w[1])

['i', 'missed', 'the', 'new', 'moon', 'trailer']
NN
NN
VBD
DT
JJ
NN
NN


## Vectorization