### Part2:

To calculate negativity we use Sentiword's dictionary data-set.

The following code ueses that data set to meassure positivie-negative words and weight the sentences:

We use the following class to help us estimate the negativity of sentences.

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

# download required packages to your virtual environment (if needed)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/javaher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/javaher/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/javaher/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/javaher/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
"""
Thi is the SentimentAnalysis Class
Provided by: https://github.com/anelachan/sentimentanalysis
"""
class SentimentAnalysis(object):
    """Class to get sentiment score based on analyzer."""

    def __init__(self, filename='SentiWordNet.txt', weighting='geometric'):
        """Initialize with filename and choice of weighting."""
        if weighting not in ('geometric', 'harmonic', 'average'):
            raise ValueError(
                'Allowed weighting options are geometric, harmonic, average')
        # parse file and build sentiwordnet dicts
        self.swn_pos = {'a': {}, 'v': {}, 'r': {}, 'n': {}}
        self.swn_all = {}
        self.build_swn(filename, weighting)

    def average(self, score_list):
        """Get arithmetic average of scores."""
        if(score_list):
            return sum(score_list) / float(len(score_list))
        else:
            return 0

    def geometric_weighted(self, score_list):
        """"Get geometric weighted sum of scores."""
        weighted_sum = 0
        num = 1
        for el in score_list:
            weighted_sum += (el * (1 / float(2**num)))
            num += 1
        return weighted_sum

    # another possible weighting instead of average
    def harmonic_weighted(self, score_list):
        """Get harmonic weighted sum of scores."""
        weighted_sum = 0
        num = 2
        for el in score_list:
            weighted_sum += (el * (1 / float(num)))
            num += 1
        return weighted_sum

    def build_swn(self, filename, weighting):
        """Build class's lookup based on SentiWordNet 3.0."""
        file = open(filename)
        records = []
        for line in file:
            if line.startswith('#'):
                continue
            temp = line.split('\t')
            if 0 == len(temp):
                continue
            records.append(temp)
        #print (type ( records) )
        #records = [line.split('\t') for line in open(filename)]

        #print( type( records) )
        
        for rec in records:
            # has many words in 1 entry
            words = rec[4].split()
            pos = rec[0]
            for word_num in words:
                word = word_num.split('#')[0]
                sense_num = int(word_num.split('#')[1])

                # build a dictionary key'ed by sense number
                if word not in self.swn_pos[pos]:
                    self.swn_pos[pos][word] = {}
                self.swn_pos[pos][word][sense_num] = float(
                    rec[2]) - float(rec[3])
                if word not in self.swn_all:
                    self.swn_all[word] = {}
                self.swn_all[word][sense_num] = float(rec[2]) - float(rec[3])

        # convert innermost dicts to ordered lists of scores
        for pos in self.swn_pos.keys():
            for word in self.swn_pos[pos].keys():
                newlist = [self.swn_pos[pos][word][k] for k in sorted(
                    self.swn_pos[pos][word].keys())]
                if weighting == 'average':
                    self.swn_pos[pos][word] = self.average(newlist)
                if weighting == 'geometric':
                    self.swn_pos[pos][word] = self.geometric_weighted(newlist)
                if weighting == 'harmonic':
                    self.swn_pos[pos][word] = self.harmonic_weighted(newlist)

        for word in self.swn_all.keys():
            newlist = [self.swn_all[word][k] for k in sorted(
                self.swn_all[word].keys())]
            if weighting == 'average':
                self.swn_all[word] = self.average(newlist)
            if weighting == 'geometric':
                self.swn_all[word] = self.geometric_weighted(newlist)
            if weighting == 'harmonic':
                self.swn_all[word] = self.harmonic_weighted(newlist)

    def pos_short(self, pos):
        """Convert NLTK POS tags to SWN's POS tags."""
        if pos in set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']):
            return 'v'
        elif pos in set(['JJ', 'JJR', 'JJS']):
            return 'a'
        elif pos in set(['RB', 'RBR', 'RBS']):
            return 'r'
        elif pos in set(['NNS', 'NN', 'NNP', 'NNPS']):
            return 'n'
        else:
            return 'a'

    def score_word(self, word, pos):
        """Get sentiment score of word based on SWN and part of speech."""
        try:
            return self.swn_pos[pos][word]
        except KeyError:
            try:
                return self.swn_all[word]
            except KeyError:
                return 0

    def score(self, sentence):
        """Sentiment score a sentence."""
        # init sentiwordnet lookup/scoring tools
        impt = set(['NNS', 'NN', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS',
                    'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN',
                    'VBP', 'VBZ', 'unknown'])
        non_base = set(['VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NNS', 'NNPS'])
        negations = set(['not', 'n\'t', 'less', 'no', 'never',
                         'nothing', 'nowhere', 'hardly', 'barely',
                         'scarcely', 'nobody', 'none'])
        stopwords = nltk.corpus.stopwords.words('english')
        wnl = nltk.WordNetLemmatizer()

        scores = []
        tokens = nltk.tokenize.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)

        index = 0
        for el in tagged:

            pos = el[1]
            try:
                word = re.match('(\w+)', el[0]).group(0).lower()
                start = index - 5
                if start < 0:
                    start = 0
                neighborhood = tokens[start:index]

                # look for trailing multiword expressions
                word_minus_one = tokens[index-1:index+1]
                word_minus_two = tokens[index-2:index+1]

                # if multiword expression, fold to one expression
                if(self.is_multiword(word_minus_two)):
                    if len(scores) > 1:
                        scores.pop()
                        scores.pop()
                    if len(neighborhood) > 1:
                        neighborhood.pop()
                        neighborhood.pop()
                    word = '_'.join(word_minus_two)
                    pos = 'unknown'

                elif(self.is_multiword(word_minus_one)):
                    if len(scores) > 0:
                        scores.pop()
                    if len(neighborhood) > 0:
                        neighborhood.pop()
                    word = '_'.join(word_minus_one)
                    pos = 'unknown'

                # perform lookup
                if (pos in impt) and (word not in stopwords):
                    if pos in non_base:
                        word = wnl.lemmatize(word, self.pos_short(pos))
                    score = self.score_word(word, self.pos_short(pos))
                    if len(negations.intersection(set(neighborhood))) > 0:
                        score = -score
                    scores.append(score)

            except AttributeError:
                pass

            index += 1

        if len(scores) > 0:
            return sum(scores) / float(len(scores))
        else:
            return 0

    def is_multiword(self, words):
        """Test if a group of words is a multiword expression."""
        joined = '_'.join(words)
        return joined in self.swn_all

In [3]:
# actual code of senti-analyzer:
# index will be in between -1 to +1 starting from extreeme negative to totally positive

#analyzer = SentimentAnalysis(filename='../Data/SentiWordNet_3.0.0.txt',weighting='geometric')

analyzer = SentimentAnalysis(filename='../Data/SentiWordNet_3.0.0.txt',weighting='harmonic')

#analyzer = SentimentAnalysis(filename='../Data/SentiWordNet_3.0.0.txt',weighting='average')

In [4]:
# reading the dataset

data = pd.read_csv('../Data/data_IRA_Ads.csv', sep='\t') 

data['senti_index'] = np.nan

for index, eachAdText in data['AD_TEXT'].iteritems():
    
    # calculate the senti-index for each advert text
    tempIndex = 0
    if not pd.isnull(eachAdText):
        tempIndex = analyzer.score(eachAdText)
    data.loc[index, 'senti_index'] = tempIndex


In [5]:
# find positive ads
positiveData = data.loc[ data['senti_index'] > 0.2 ]
print ('Positive Ads:\n\n', len(positiveData))
print (positiveData[['AD_TEXT', 'senti_index']])

# find negative ads
negativeData =data.loc[ data['senti_index'] < -0.1 ]
print ('Negative Ads:\n\n', len(negativeData))
print (negativeData[['AD_TEXT', 'senti_index']])

Positive Ads:

 79
                                                AD_TEXT  senti_index
477   Good morning, Americans! Enjoy the beauty of o...     0.201383
488   It is not always the same thing to be a good m...     0.498321
544   The stars at night truly are big and bright in...     0.279027
575   Happiness is like a kiss. You must share it to...     0.307341
660   Good morning, dear friends! Have a tasty and c...     0.271075
932   What a beautiful and intelligent child she is....     0.282060
958             Believe it or not, black doctors exist.     0.241616
1075            Just because dads are the best friends.     0.256944
1090                           Go well beyond profiling     0.276962
1114  Happy Birthday, real American! We believe in you!     0.210417
1276                         Not all refugees are good.     0.850112
1705               Glasses make you 20% more attractive     0.216837
1716                 Evolve pidgey and he's pretty good     0.425056
1762           

In [6]:
data[['AD_TEXT', 'senti_index']].head(500)    


Unnamed: 0,AD_TEXT,senti_index
0,Instgogogo,0.000000
1,You can go to hell or follow @south_lone_star ...,-0.034383
2,Black lives matter because we see what's going...,-0.007088
3,Don't Shoot is a community site where you can ...,0.004022
4,WHERE is your VOICE?,0.025000
5,Conservative Christians. Pro-life/ gun/ police...,-0.026389
6,Conservative Christians. Pro-life/ gun/ police...,-0.026389
7,Trayvon Martin's lawyers wearing hoodies. Wear...,-0.055271
8,Edward was a combat engineer in Afghanistan wh...,0.066647
9,Unbelievable amount of the US government's was...,-0.015079
