# Sentiment Analysis in Python

In [66]:
#import nltk
#nltk.download('popular', halt_on_error=False)
#nltk.download('all', halt_on_error=False)

In [1]:
import pandas as pd
import re
import sys
import matplotlib.pyplot as plt
#import nltk as nltk
#import nltk.corpus  
#from nltk.text import Text
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [2]:
print(sys.version)

3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)]


## Sentiment Analysis with TextBlob: Polarity and Subjectivity
Subjectivity can come in many forms, e.g., opinions, allegations, desires, beliefs, suspicions, and speculations.

__textblob.sentiments__ module contains two sentiment analysis implementations, PatternAnalyzer (based on the pattern library: https://www.clips.uantwerpen.be/pattern) and NaiveBayesAnalyzer (an NLTK classifier trained on a movie reviews corpus).

The default implementation is PatternAnalyzer, but you can override the analyzer to use NaiveBayesAnalyzer

In [3]:
text = "Tim is ugly."

In [81]:
blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
blob.sentiment

Sentiment(classification='neg', p_pos=0.3818562132011153, p_neg=0.6181437867988844)

In [75]:
blob = TextBlob(text)
blob.sentiment

Sentiment(polarity=0.475, subjectivity=0.8)

In [33]:
text = "Wolf experts urge UK police not to shoot escaped animal"

In [34]:
blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
blob.sentiment

Sentiment(classification='pos', p_pos=0.6175593484868018, p_neg=0.382440651513196)

In [35]:
blob = TextBlob(text)
blob.sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [5]:
text = "A 29-year-old man was shot Friday evening on the South Side."

In [6]:
blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
blob.sentiment

Sentiment(classification='pos', p_pos=0.7079960513228761, p_neg=0.2920039486771243)

In [7]:
blob = TextBlob(text)
blob.sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [8]:
text = "Three teenagers have been charged with felony robbery after they were\
taken into custody in connection with a string of robberies from the Near North Side to Kenwood."

In [9]:
blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
blob.sentiment

Sentiment(classification='pos', p_pos=0.895182524022347, p_neg=0.1048174759776575)

In [41]:
blob = TextBlob(text)
blob.sentiment

Sentiment(polarity=0.1, subjectivity=0.4)

In [42]:
text = "American and Southwest joined United Airlines in reporting expectation-beating earnings \
and unveiling expansion plans.  But investors, fearing that more flights might lead to a fare war, \
pounded airline stocks for a second day even as American Airlines signaled that higher fuel costs \
will probably force it to raise fares.."

In [43]:
blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
blob.sentiment

Sentiment(classification='pos', p_pos=0.9141571646860674, p_neg=0.0858428353139356)

In [44]:
blob = TextBlob(text)
blob.sentiment

Sentiment(polarity=0.15, subjectivity=0.2)

In [10]:
text = " The gorgeous Giulia Quadrifoglio seduces the soul and sears the \
senses with a beautiful balance of aggression and finesse. \
Alfa flaunts its racing pedigree with the four-leaf-clover \
badge displayed on the Giulia’s shapely flanks. \
Its Ferrari-derived twin-turbo V-6 sings a sinister tune, \
belting out 505 horsepower. Its clever, communicative chassis can \
conquer a race course with unfiltered ferocity or coolly traverse \
the tarmac without commotion. An excellent eight-speed automatic \
transmission and rear-wheel drive are standard; sadly, \
a manual gearbox is missing. Alfa Romeo’s past and present \
reliability issues also remain an unknown quantity. \
Still, the Giulia Quadrifoglio, or QF, is an exotic sports sedan \
that sets a new benchmark for the genre—which is why it made our list of 10Best Cars for 2018. ."

In [46]:
blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
blob.sentiment

Sentiment(classification='pos', p_pos=0.991948748693045, p_neg=0.008051251306939424)

In [47]:
blob = TextBlob(text)
blob.sentiment

Sentiment(polarity=0.15378787878787878, subjectivity=0.6241341991341992)

In [48]:
text = "My husband ordered a fruit arrangement for me for Valentine's Day. \
He had planned on taking me to the movies with two free tickets he was promised with a \
promotion you had been advertising. My husband was unaware that these tickets came via email. \
However, your sales representative who took his order failed to record his email address. \
Therefore we never received the tickets. \
I have called corporate and the store manager about this. \
They seem to not be able to resolve things in a timely manner. \
Also the fruit was not the best tasting. \
Needless to say we will never be supporting your business again. \
Overall poor customer service and a very overpriced product."

In [49]:
blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
blob.sentiment

Sentiment(classification='pos', p_pos=0.9993862773779272, p_neg=0.0006137226220642661)

In [50]:
blob = TextBlob(text)
blob.sentiment

Sentiment(polarity=0.08636363636363636, subjectivity=0.425)

## Reading from URL
#### BeautifulSoup to clean up meta-tags 

In [11]:
url = "https://en.wikipedia.org/wiki/University_of_Chicago"

In [12]:
from bs4 import BeautifulSoup
import urllib.request
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page.read(), "lxml")

In [13]:
uc_wiki = (soup.get_text())
#print (type(uc_wiki))
print (uc_wiki[7170:8010]) 

nkings.[9][10][11][12]
The university is composed of the College, various graduate programs and interdisciplinary committees organized into five academic research divisions and seven professional schools. Beyond the arts and sciences, Chicago is also well known for its professional schools, which include the Pritzker School of Medicine, the Booth School of Business, the Law School, the School of Social Service Administration, the Harris School of Public Policy Studies, the Divinity School and the Graham School of Continuing Liberal and Professional Studies. The university currently enrolls 5,971 undergraduate students, and 16,016 students overall.[13]
University of Chicago scholars have played a major role in the development of many academic disciplines, including sociology,[14] law,[15] economics,[16] literary criticism,[17] re


In [14]:
blob = TextBlob(uc_wiki)

#### Sentiment analysis on entire body of text might be difficult to interpret

In [15]:
blob.sentiment

Sentiment(polarity=0.08580098320482935, subjectivity=0.36561234951767513)

In [16]:
b_sentences = blob.sentences
print (b_sentences[10:15])

[Sentence("[21]  With an estimated completion date of 2021, the Barack Obama Presidential Center will be housed at the university and include both the Obama presidential library and offices of the Obama Foundation."), Sentence("[22]
The University of Chicago has many prominent alumni, faculty members and researchers."), Sentence("97 Nobel laureates[23] have been affiliated with the university as professors, students, faculty, or staff, making it a university with one of the highest concentrations of Nobel laureates in the world."), Sentence("Similarly, 34 faculty members and 17 alumni have been awarded the MacArthur "Genius Grant"."), Sentence("[24] In addition, Chicago's alumni and faculty include 53 Rhodes Scholars,[25] 25 Marshall Scholars,[26] 9 Fields Medalists,[27] 4 Turing Award Winners, 24 Pulitzer Prize winners,[28] 20 National Humanities Medalists,[29] 16 billionaire graduates and a plethora of members of the United States Congress and heads of state of countries all over the

In [17]:
b_sentiment = []
b_subjectivity = []
for sentence in blob.sentences:
    b_sentiment.append(str(sentence.sentiment.polarity))
    b_subjectivity.append(str(sentence.sentiment.subjectivity))

In [18]:
b_sen_sen = list(zip(b_sentences, b_sentiment, b_subjectivity))

In [19]:
print(*b_sen_sen[35:40], sep='\n\n')

(Sentence("Stagg is the namesake of the university's Stagg Field."), '0.0', '0.0')

(Sentence("[43]
The business school was founded thereafter in 1898[44] and the law school was founded in 1902."), '0.0', '0.0')

(Sentence("[45] Harper died in 1906[46] and was replaced by a succession of three presidents whose tenures lasted until 1929."), '0.0', '0.0')

(Sentence("[47] During this period, the Oriental Institute was founded to support and interpret archeological work in what was then called the Near East."), '0.1', '0.4')

(Sentence("[48]
In the 1890s, the University of Chicago, fearful that its vast resources would injure smaller schools by drawing away good students, affiliated with several regional colleges and universities: Des Moines College, Kalamazoo College, Butler University, and Stetson University."), '-0.040000000000000015', '0.62')


#### Analyzing sentiment for entire book

In [60]:
directory = './data/'
book = 'book_2.txt'

In [61]:
blob = TextBlob(open(directory+book, encoding="utf8").read())

FileNotFoundError: [Errno 2] No such file or directory: './data/book_2.txt'

In [None]:
type(blob)

In [None]:
blob.sentiment

### Processing the sentiment of each sentence in the book and passing to PandasDF

In [None]:
polarity = []
subjectivity = []
sentences = []
sentiment_df = pd.DataFrame(columns=['sentence', 'polarity', 'subjectivity'])

for sentence in blob.sentences:
    polarity.append(sentence.sentiment.polarity)
    subjectivity.append(sentence.sentiment.subjectivity)
    sentences.append(str(sentence.raw))

sentiment_df['sentence'] = sentences
sentiment_df['polarity'] = polarity
sentiment_df['subjectivity'] = subjectivity

sentiment_df['sentence'] = sentiment_df['sentence'].str.replace('\n', ' ')

In [None]:
sentiment_df.head(10)

In [None]:
polarity = []
subjectivity = []
sentences = []
sentiment_df = pd.DataFrame(columns=['sentence', 'polarity', 'subjectivity'])

for sentence in blob.sentences:
    polarity.append(sentence.sentiment.polarity)
    subjectivity.append(sentence.sentiment.subjectivity)
    sentences.append(str(sentence.raw))

sentiment_df['sentence'] = sentences
sentiment_df['polarity'] = polarity
sentiment_df['subjectivity'] = subjectivity

sentiment_df['sentence'] = sentiment_df['sentence'].str.replace('\n', ' ')

#### Most positive sentences in the book

In [None]:
sentiment_df.sort_values(by='polarity', ascending=False, inplace=True)
pd.set_option('display.max_colwidth', 100)
sentiment_df.head(10)

#### Most negative sentences in the book

In [None]:
sentiment_df.sort_values(by='polarity', ascending=True, inplace=True)
pd.set_option('display.max_colwidth', 100)
sentiment_df.head(10)

In [None]:
sentiment_df.sort_index(inplace=True)
sentiment_top_df = sentiment_df.head(n=200)
pd.set_option('display.max_colwidth', 100)
#sentiment_df.head(10)

#### Plotting sentiment changes in the book as the story unveils

In [None]:
plt.figure().set_size_inches(20, 10)
plt.plot(sentiment_top_df['polarity'])
plt.xlabel('Sentence')
plt.ylabel('Polarity')
plt.show()

## Analyzing Tweets with Python

In [None]:
directory = './data/'
file = 'jeep_new.txt'
path = directory + file

In [None]:
tweets = pd.read_csv(path,sep='\t', names = ['id', 'lang', 'created_at', 'screen_name', \
                                                       'name', 'location', 'retweet_count', 'text'])

tweets = tweets.sample(n=1000)

In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
tweets.head(5)

In [None]:
tweets['lang'].value_counts()

In [None]:
# Filter non-English tweets
tweets_eng = tweets[tweets['lang']=='en'].reset_index(drop=True)

In [None]:
pd.set_option('display.max_colwidth', 50)

In [None]:
tweets_eng.head(5)

In [None]:
# Count records 
len(tweets_eng)

In [None]:
# Remove special characters to avoid problems with analysis
tweets_eng['text_clean'] = tweets_eng['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [None]:
pd.set_option('display.max_colwidth', 100)
tweets_eng[['text', 'text_clean']].head(5)

In [None]:
blob = TextBlob(tweets_eng['text_clean'].to_string())

In [None]:
blob[:1000]

In [None]:
tags = TextBlob(tweets_eng['text_clean'].to_string()).tags

In [None]:
len(tags)

In [None]:
#list all possible tags and values
#nltk.help.upenn_tagset('.*')

In [None]:
tags[:10]

In [None]:
pd.set_option('display.max_colwidth', 50)
tags_pd = pd.DataFrame(tags, columns={"Word","POS Tag"})
tags_pd

In [None]:
nouns = TextBlob(tweets_eng['text_clean'].to_string()).noun_phrases

In [None]:
nouns[:100]

In [None]:
type(nouns)

In [None]:
pd.set_option('display.max_colwidth', 100)
# The x in the lambda function is a row (because axis=1 is set)
# Apply iterates the function accross the dataframe's rows
tweets_eng['nouns'] = tweets_eng.apply(lambda x: TextBlob(x['text_clean']).noun_phrases, axis=1)
tweets_eng[['nouns']].head(5)

In [None]:
tweets_eng['polarity'] = tweets_eng.apply(lambda x: TextBlob(x['text_clean']).sentiment.polarity, axis=1)
tweets_eng['subjectivity'] = tweets_eng.apply(lambda x: TextBlob(x['text_clean']).sentiment.subjectivity, axis=1)
tweets_eng[['text_clean', 'polarity', 'subjectivity']].head(5)

In [None]:
tweets_eng[['text_clean', 'polarity', 'subjectivity']][tweets_eng['polarity'] > 0.6].head(5)

In [None]:
tweets_eng[['text_clean', 'polarity', 'subjectivity']][tweets_eng['polarity'] < -0.6].head(5)

## Very advanced (and labor intensive) sentiment analysis

In [None]:
# Read in the corpus of positive/negative words compiled by University of Pittsburgh, available at UNC Chapel Hill websie
# These came from researchers Theresa Wilson, Janyce Wiebe, and Paul Hoffmann at the University of Pittsburgh, 
# and were readily available at http://mpqa.cs.pitt.edu/

#Code adopted from: http://nealcaren.web.unc.edu/an-introduction-to-text-analysis-with-python-part-3/

from urllib.request import urlretrieve

import urllib
outpath = 'http://www.unc.edu/~ncaren/haphazard/'
directory = 'C://Users//Nick//Documents//Teaching//Data Projects//Text//Tweets//'
tweet_directory = 'C://Users//Nick//Documents//Teaching//Data Projects//Text//Tweets//'
file_pos = 'positive.txt'
file_neg = 'negative.txt'

corpus = [file_pos,file_neg]
for file in corpus:
    urlretrieve(outpath+file,tweet_directory+file)

In [None]:
tweet_list = tweets_eng['text_clean'].tolist()

In [None]:
pos_sent = open(tweet_directory +file_pos).read()
pos_words = pos_sent.split('\n')
neg_sent = open(tweet_directory +file_neg).read()
neg_words = neg_sent.split('\n')

In [None]:
# customize the dictionaries by adding and removing your own positive and negative words and get some counts

pos_add = ['your_pos_term_1, your_pos_term_2']

for term in pos_add:
    pos_words.append(term)

neg_add = ['your_neg_term_1, your_neg_term_2']

for term in neg_add:
    neg_words.append(term)

import re
from string import punctuation
from __future__ import division  
sentiment_scores=[]
for tweet in tweet_list:
    sentiment_score=0
    for p in list(punctuation):
        tweet=tweet.replace(p,'')
        words=tweet.split(' ')
    for word in words:
        if word in pos_words:
            sentiment_score=sentiment_score+1
        if word in neg_words:
            sentiment_score=sentiment_score-1
    sentiment_scores.append(sentiment_score/len(words))

tweet_sentiment=zip(tweet_list,sentiment_scores)

In [None]:
# Create a dataframe from the results
column_names = ["Text", "Sentiment_Score"]
sentiment_results = [tweet_list, sentiment_scores]
results_dict = dict(zip(column_names,sentiment_results))
all_tweets_df = pd.DataFrame.from_dict(results_dict, orient='columns')
all_tweets_df = all_tweets_df[column_names]   # set specific column order

In [None]:
# Create a list to store the sentiments
sent_list = []

# For each row in the column,
for row in all_tweets_df['Sentiment_Score']:
    if row > 0:
        sent_list.append('Positive')
    elif row < 0:
        sent_list.append('Negative')
    else:
        sent_list.append('Neutral')

# Create a column from the list
all_tweets_df['Sentiment_Label'] = sent_list

In [None]:
#Make sure I didn't loose any records
len(tweet_list) - len(all_tweets_df)

In [None]:
pd.set_option('display.max_colwidth', 150)

In [None]:
all_tweets_df.sample(frac=0.005, replace=True)

In [None]:
#all_tweets_df['Sentiment_Label'].value_counts()

In [None]:
plt.figure().set_size_inches(10, 5)

CountSentiment = pd.value_counts(all_tweets_df['Sentiment_Label'].values, sort=True)
print (CountSentiment)

#CountStatus.plot.barh()
CountSentiment.plot.bar()
plt.show()

#### Export the results of advanced sentiment analysis

In [None]:
writer = pd.ExcelWriter(tweet_directory+'jeep_adv_sentiment.xlsx', engine='xlsxwriter')
all_tweets_df.to_excel(writer, sheet_name='Tweets_Sentiment')
writer.save()