# Sentiment Analysis 

The <b> purpose</b> of this notebook is to explore sentiment analysis using text blob and vedar.

## Libraries and Data

In [63]:
import pandas as pd
import time
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import re
from textblob import TextBlob
import matplotlib.pyplot as plt
nltk.download('vader_lexicon')

df = pd.read_csv('finaldata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


# Sentiment Analysis Using TextBlob


### Cleaning

We replace "NA" values, "removed," and "deleted" with an empty string. 

In [39]:
df['text'] = df['text'].fillna('')
tbr = ['[removed]', '[deleted]']
df['text'] = df['text'].apply(lambda x: '' if x in tbr else x)

In [40]:
# Convert all the string to lower cases
df['text'] = df['text'].str.lower()

# \S+ means anything that is not an empty space
df['text'] = df['text'].apply(lambda x: re.sub('http\S*', '', x))

# \s+ means all empty space (\n, \r, \t)
df['text'] = df['text'].apply(lambda x: re.sub('\s+', ' ', x))

# We don't want empty string in our text
df = df.loc[df['text'] != ""]

In [41]:
#removing punctuation
df['text'] = df['text'].apply(lambda x: re.sub('[^\w\s]', '', x))

In [6]:
#The function below removes stop words

def sentiment_func(text):
    try:
        return TextBlob(text).sentiment
    except:
        return None

In [42]:
df['text_blob']= df['text'].apply(sentiment_func)

### Creating Dataframe

In [43]:
df[['polarity','subjectivity']]= df['text'].apply(lambda text:pd.Series(TextBlob(text).sentiment))

In [51]:
df.columns.tolist()

['Unnamed: 0',
 'author',
 'crawled',
 'entities_locations',
 'entities_organizations',
 'entities_persons',
 'external_links',
 'highlightText',
 'highlightTitle',
 'language',
 'locations',
 'ord_in_thread',
 'organizations',
 'persons',
 'published',
 'text',
 'thread_country',
 'thread_main_image',
 'thread_participants_count',
 'thread_performance_score',
 'thread_published',
 'thread_replies_count',
 'thread_section_title',
 'thread_site',
 'thread_site_full',
 'thread_site_section',
 'thread_site_type',
 'thread_social_facebook_comments',
 'thread_social_facebook_likes',
 'thread_social_facebook_shares',
 'thread_social_gplus_shares',
 'thread_social_linkedin_shares',
 'thread_social_pinterest_shares',
 'thread_social_stumbledupon_shares',
 'thread_social_vk_shares',
 'thread_spam_score',
 'thread_title',
 'thread_title_full',
 'thread_url',
 'thread_uuid',
 'title',
 'url',
 'uuid',
 'thread_domain_rank',
 'text_blob',
 'polarity',
 'subjectivity']

In [57]:
sentimentdata = df.drop(columns=['Unnamed: 0'])

In [58]:
sentimentdata.to_csv('sentimentdata.csv')

# Sentiment Analysis Using Vader

In [10]:
df1 = pd.read_csv('finaldata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### Cleaning

In [11]:
# Fill na with empty string
df1['text'] = df1['text'].fillna('')
# Replace `removed` and `deleted` with empty string
tbr = ['[removed]', '[deleted]']
df1['text'] = df1['text'].apply(lambda x: '' if x in tbr else x)

In [12]:
all_text=df1['text']
all_sent_values=[]
all_sentiments=[]

def sentiment_value(paragraph):
    analyser= SentimentIntensityAnalyzer()
    result= analyser.polarity_scores(paragraph)
    score=result['compound']
    return round(score ,1)

#### Warning!
Code below might take some time to run

In [22]:
counter = 0
start_time = time.time()

for i in range(0,len(df1)):
    all_sent_values.append(sentiment_value(all_text[i]))
    
    counter += 1
    
    if counter % 5000 == 0:
        print("There have been {} files read so far".format(counter))
        print("Time elapsed: {}".format(time.time() - start_time))
        
 
    
print("Operation complete after {} seconds.".format(time.time()-start_time))

There have been 5000 files read so far
Time elapsed: 71.21340823173523
There have been 10000 files read so far
Time elapsed: 143.90734815597534
There have been 15000 files read so far
Time elapsed: 215.8674340248108
There have been 20000 files read so far
Time elapsed: 288.4437482357025
There have been 25000 files read so far
Time elapsed: 360.16496109962463
There have been 30000 files read so far
Time elapsed: 430.4718053340912
There have been 35000 files read so far
Time elapsed: 498.0337710380554
There have been 40000 files read so far
Time elapsed: 563.2042272090912
There have been 45000 files read so far
Time elapsed: 637.4129903316498
There have been 50000 files read so far
Time elapsed: 712.1230063438416
There have been 55000 files read so far
Time elapsed: 784.1697692871094
There have been 60000 files read so far
Time elapsed: 850.0583682060242
There have been 65000 files read so far
Time elapsed: 915.6351401805878
There have been 70000 files read so far
Time elapsed: 980.64745

In [24]:
temp_data = df1
temp_data.shape

(173313, 44)

In [25]:
counter = 0
start_time = time.time()

SENTIMENT_VALUE = []
SENTIMENT = []
for i in range(0,len(df1)):
    sent = all_sent_values[i]
    if (sent<=1 and sent>=0.5):
        SENTIMENT.append('V.Positive')
        SENTIMENT_VALUE.append(5)
    elif (sent<0.5 and sent>0):
        SENTIMENT.append('Positive')
        SENTIMENT_VALUE.append(4)
    elif (sent==0):
        SENTIMENT.append('Neutral')
        SENTIMENT_VALUE.append(3)
    elif (sent<0 and sent>=-0.5):
        SENTIMENT.append('Negative')
        SENTIMENT_VALUE.append(2)
    else:
        SENTIMENT.append('V.Negative')
        SENTIMENT_VALUE.append(1)
        
    counter += 1
    
    if counter % 5000 == 0:
        print("There have been {} files read so far".format(counter))
        print("Time elapsed: {}".format(time.time() - start_time))
        
 
    
print("Operation complete after {} seconds.".format(time.time()-start_time))

There have been 5000 files read so far
Time elapsed: 0.01027989387512207
There have been 10000 files read so far
Time elapsed: 0.015654802322387695
There have been 15000 files read so far
Time elapsed: 0.02219390869140625
There have been 20000 files read so far
Time elapsed: 0.03348803520202637
There have been 25000 files read so far
Time elapsed: 0.03943896293640137
There have been 30000 files read so far
Time elapsed: 0.04516196250915527
There have been 35000 files read so far
Time elapsed: 0.0518031120300293
There have been 40000 files read so far
Time elapsed: 0.05954718589782715
There have been 45000 files read so far
Time elapsed: 0.06949806213378906
There have been 50000 files read so far
Time elapsed: 0.08312010765075684
There have been 55000 files read so far
Time elapsed: 0.09449291229248047
There have been 60000 files read so far
Time elapsed: 0.1025838851928711
There have been 65000 files read so far
Time elapsed: 0.10884881019592285
There have been 70000 files read so far


### Creating Dataframe

In [26]:
#update to temp dataset
temp_data['SENTIMENT_VALUE'] = SENTIMENT_VALUE
temp_data['SENTIMENT'] = SENTIMENT

In [60]:
vedardata = temp_data.drop(columns=['Unnamed: 0'])

In [61]:
vedardata.head()

Unnamed: 0,author,crawled,entities_locations,entities_organizations,entities_persons,external_links,highlightText,highlightTitle,language,locations,...,thread_title,thread_title_full,thread_url,thread_uuid,title,url,uuid,thread_domain_rank,SENTIMENT_VALUE,SENTIMENT
0,USNews,2015-10-02T17:33:59.981+03:00,,,,[['http://www.reddit.com/submit?url=http%3A%2F...,,,english,,...,The Healthiest Pastas: From Quinoa to Buckwhea...,The Healthiest Pastas: From Quinoa to Buckwhea...,http://health.usnews.com/health-news/health-we...,8085f289866a814f7a443e1a31e48f8a307a040f,The Healthiest Pastas: From Quinoa to Buckwhea...,http://health.usnews.com/health-news/health-we...,8085f289866a814f7a443e1a31e48f8a307a040f,,5,V.Positive
1,,2015-10-19T09:23:00.540+03:00,,,,,,,english,['Savoonga'],...,Photos: Operation Santa Claus visits Savoonga,Photos: Operation Santa Claus visits Savoonga,http://www.newsdump.com/article/photos-operati...,f4ad43deab0a72726d6165b37a971c578efdd4f5,Photos: Operation Santa Claus visits Savoonga,http://www.newsdump.com/article/photos-operati...,f4ad43deab0a72726d6165b37a971c578efdd4f5,,3,Neutral
2,,2015-10-08T17:42:28.717+03:00,,,,,,,english,['Palmyra'],...,"Watch: Video Shows 2,000-Year-Old Ancient Arch...","Watch: Video Shows 2,000-Year-Old Ancient Arch...",http://www.newsdump.com/article/watch-video-sh...,c98cbd870f52950ff685e772fd189bd01fc85767,"Watch: Video Shows 2,000-Year-Old Ancient Arch...",http://www.newsdump.com/article/watch-video-sh...,c98cbd870f52950ff685e772fd189bd01fc85767,,3,Neutral
3,,2015-10-05T10:10:00.218+03:00,,,,,,,english,,...,'Fear the Walking Dead' ends Season 1 on a gri...,'Fear the Walking Dead' ends Season 1 on a gri...,http://www.newsdump.com/article/fear-the-walki...,3481ad311613e0da31e6017f854c7ded093b398a,'Fear the Walking Dead' ends Season 1 on a gri...,http://www.newsdump.com/article/fear-the-walki...,3481ad311613e0da31e6017f854c7ded093b398a,,1,V.Negative
4,,2015-10-23T15:40:06.454+03:00,,,,,,,english,,...,Facebook app draining your iPhone battery? Com...,Facebook app draining your iPhone battery? Com...,http://www.newsdump.com/article/facebook-app-d...,17954912c005732967b28ef81b4ebc58d3911efc,Facebook app draining your iPhone battery? Com...,http://www.newsdump.com/article/facebook-app-d...,17954912c005732967b28ef81b4ebc58d3911efc,,3,Neutral


In [62]:
vedardata.to_csv('vedardata.csv')

In [33]:
data = temp_data[['text','SENTIMENT','SENTIMENT_VALUE']]

# End