In [7]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import ast
import numpy as np

In [8]:
# Negation words: when one of these immediately precedes a lexicon word, the
# word's score is inverted.
# "I like Apple" has a score of 1.5, but "I don't like Apple" would have a
# score of -1.5 because of the negation.
negate = [
    "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
    "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
    "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
    "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
    "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
    "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
    "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
    "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite",
]

# Intensifiers: a preceding booster scales the following lexicon word's score up.
# (The duplicate 'greatly' entry has been removed.)
boosters = [
    'absolutely', 'astonishingly', 'amazingly', 'greatly', 'completely', 'considerably', 'decidedly', 'deeply', 'effing',
    'enormously', 'exceedingly', 'supremely', 'strikingly', 'vastly', 'notably', 'surpassingly', 'terrifically',
    'immensely',
    'entirely', 'especially', 'exceptionally', 'extremely', 'fabulously', 'flipping', 'flippin', 'fricking', 'frickin',
    'frigging', 'friggin', 'fully', 'fucking', 'hella', 'highly', 'hugely', 'incredibly', 'intensely', 'majorly',
    'more', 'most', 'particularly', 'purely', 'quite', 'really', 'remarkably', 'so', 'surprisingly', 'substantially',
    'thoroughly', 'totally', 'tremendously', 'uber', 'unbelievably', 'unusually', 'utterly', 'very',
]

# Dampeners: a preceding decreaser scales the following lexicon word's score down.
# 'scarcely' is the correct spelling; the original misspelling 'scarcily' is kept
# so that text containing it is still matched. The duplicate 'hardly' entry has
# been removed.
# NOTE: Sentiment.modifiers compares these against a single whitespace-separated
# token, so the multi-word entries ('kind of', 'sort of', 'in part',
# 'not entirely', 'not fully') can never match there.
decreasers = [
    'almost', 'barely', 'scarcely', 'scarcily', 'lacking', 'hardly', 'kind of', 'kinda', 'less', 'little',
    'marginally', 'occasionally', 'partly', 'sort of',
    'sorta', 'slightly', 'carelessly', 'somewhat', 'in part', 'relatively', 'not entirely', 'not fully',
]


In [9]:
def create_ci_lexicon(lexicon_file):
    """
    Build a per-word confidence lookup from the raw lexicon CSV.

    The file is read without a header row. Column 0 holds the word and
    column 3 holds a Python-literal list of the individual ratings behind the
    word's score (optionally wrapped in single quotes).

    For every word the standard deviation of its ratings is computed,
    normalised against ``mean + 1.5 * std`` of all those deviations, and
    ranked. The confidence is ``1 - rank / max_rank``, so the word whose
    ratings disagree the most gets confidence 0.

    Parameters
    ----------
    lexicon_file : path or file-like accepted by ``pandas.read_csv``.

    Returns
    -------
    dict
        Maps each word (column 0) to its confidence value.
    """
    table = pd.read_csv(lexicon_file, encoding='cp437', header=None)

    # Parse the quoted rating lists and measure how much the raters disagreed.
    rating_text = table[3].str.strip("'")
    table['std_dev'] = rating_text.apply(lambda cell: np.std(ast.literal_eval(cell)))

    spread = table['std_dev']
    basis = spread.mean() + 1.5 * spread.std()
    table['norm_conf'] = spread.apply(lambda value: (value - basis) / basis)

    # Rank once and reuse it for both the numerator and the denominator.
    ranks = table['norm_conf'].rank()
    confidence = 1 - (ranks / ranks.max())

    return dict(zip(table[0].values, confidence))



In [10]:
# Single source of truth for the lexicon location (previously repeated three
# times as a string literal).
LEXICON_FILE = 'HiddenAlphabet_lexicon.csv'

# word -> confidence derived from the spread of the individual ratings.
ci_lexicon = create_ci_lexicon(LEXICON_FILE)

# word -> raw sentiment score (column 0 -> column 1). The CSV is parsed once
# here instead of twice; tolist() yields plain Python values, matching what the
# original `.values.T[...]` on the mixed-type frame produced.
_lexicon_table = pd.read_csv(LEXICON_FILE, encoding='cp437', header=None)
lexicon = dict(zip(_lexicon_table[0].tolist(), _lexicon_table[1].tolist()))

In [11]:
class PreprocessText(object):
    """
    Holds a raw text and its tokenised, lemmatised form.

    Attributes are ``text`` (the input as given) and ``clean_text`` (a list of
    lemmatised whitespace-separated tokens, computed once at construction).
    """

    def __init__(self, text):
        self.text = text
        self.clean_text = self._clean_text()

    def _clean_text(self):
        """Split ``self.text`` on whitespace and lemmatise every token."""
        tokens = self.text.split()
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(token) for token in tokens]

In [12]:
class Sentiment(object):
    """
    Lexicon-based sentiment analyzer.

    Each word found in ``lexicon`` contributes its score divided by 5,
    adjusted by the word immediately before it (negation flips the sign,
    boosters multiply by 1.25, decreasers by 0.8). The text score is the mean
    of those contributions; the confidence interval is the mean of the
    per-word intervals derived from ``ci_lexicon``.
    """

    def __init__(self):
        # All resources come from the module-level objects built earlier in
        # the notebook.
        self.lexicon = lexicon
        self.negate = negate
        self.ci_lexicon = ci_lexicon
        self.boosters = boosters
        self.decreasers = decreasers

    def score(self, text):
        """Calculate sentiment score for text.

        Returns a dict with ``'score'`` (mean adjusted word score, 0 when no
        word is in the lexicon) and ``'confidence_interval'`` (``(lower,
        upper)`` tuple).
        """
        words = PreprocessText(text).clean_text
        sentiments = []
        confidences = []
        # Pass the position explicitly so repeated words are each judged by
        # their own preceding word.
        for position, item in enumerate(words):
            sentiments, confidences = self.sentiment_polarity(
                item, sentiments, confidences, words, position)
        final_score, interval = self.final_calculation(sentiments, confidences)

        return {'score': final_score, 'confidence_interval': interval}

    def sentiment_polarity(self, item, sentiments, confidences, words, index=None):
        """Append the adjusted score and interval for ``item`` if it is in the lexicon.

        ``index`` is the position of ``item`` in ``words``. When omitted, the
        first occurrence of ``item`` is used (the legacy behaviour).
        Returns the (mutated) ``sentiments`` and ``confidences`` lists.
        """
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            weight = self.lexicon[item_lowercase] / 5
            final_weight = self.modifiers(words, item, weight, index)
            confidence_score = self.ci_lexicon[item_lowercase]
            confidence_calc = self.confidence_calc(final_weight, confidence_score)
            sentiments.append(final_weight)
            confidences.append(confidence_calc)
        return sentiments, confidences

    def final_calculation(self, sentiments, confidences):
        """Average the per-word scores and per-word interval bounds.

        Returns ``(mean_score, (mean_lower, mean_upper))``, or
        ``(0, (0, 0))`` when ``sentiments`` is empty.
        """
        if len(sentiments) == 0:
            return 0, (0, 0)

        final_calc = sum(sentiments) / len(sentiments)

        count = len(confidences)
        lower = sum(interval[0] for interval in confidences)
        upper = sum(interval[1] for interval in confidences)

        return final_calc, (lower / count, upper / count)

    def modifiers(self, words, item, weight, index=None):
        """Adjust ``weight`` according to the word directly before ``item``.

        I love apples = .72
        I don't love apples = -.72

        ``index`` is the position of ``item`` in ``words``; when omitted the
        first occurrence is looked up. A word at position 0 has no preceding
        word and is returned unmodified (previously ``words[-1]`` wrapped
        around to the last word of the text). The preceding word is
        lower-cased before lookup so "Not"/"VERY" are recognised.
        """
        if index is None:
            index = words.index(item)

        neg_coef = 1
        boost_coef = 1
        dec_coef = 1
        if index > 0:
            preceding_word = words[index - 1].lower()
            if preceding_word in self.negate:
                neg_coef = -1
            if preceding_word in self.boosters:
                boost_coef = 1.25
            if preceding_word in self.decreasers:
                dec_coef = .8
        return weight * neg_coef * boost_coef * dec_coef

    def confidence_calc(self, final_weight, confidence_score):
        """Return ``(lower, upper)`` around ``final_weight``.

        The half-width is ``(1 - confidence_score) * |final_weight|``; the
        two branches only make sure ``lower <= upper`` for negative weights.
        """
        if final_weight >= 0:
            upper = final_weight + (1 - confidence_score) * final_weight
            lower = final_weight - (1 - confidence_score) * final_weight
        else:
            lower = final_weight + (1 - confidence_score) * final_weight
            upper = final_weight - (1 - confidence_score) * final_weight
        return (lower, upper)
        
    

In [83]:
import os

# NOTE(review): `db` is not imported in any visible cell; presumably
# `import sqlalchemy as db` -- confirm and add it to the import cell.
#
# The connection URL (including the password) must not be stored in the
# notebook. Export it before starting the kernel, e.g.
#   SENTIMENT_DB_URL=postgresql+psycopg2://<user>:<password>@<host>:5432/postgres
# The password that was previously committed here should be rotated.
engine = db.create_engine(os.environ['SENTIMENT_DB_URL'])
metadata = db.MetaData()
testtable = db.Table('sample_tweets2', metadata, autoload=True, autoload_with=engine)
query = db.select([testtable])

# Close the connection as soon as the rows are fetched instead of leaking it.
with engine.connect() as connection:
    ResultProxy = connection.execute(query)
    ResultSet = ResultProxy.fetchall()
    # Take column names from the result itself so an empty table does not
    # fail on ResultSet[0].
    column_names = list(ResultProxy.keys())

df = pd.DataFrame(ResultSet, columns=column_names)
df['sentiment_score'] = df['sentiment_score'].astype('float64')

In [88]:
# Show the rows whose `time` is after 2019-06-14 and no later than 2019-06-15
# (both bounds are given as strings).
df[(df['time'] > '2019-06-14') & (df['time'] <= '2019-06-15')]

Unnamed: 0,field1,text,time,sentiment_score
7885,61781,Market Stats at End of Day $SPY $ES_F $IWM htt...,2019-06-14 22:00:01,0.00
8557,62229,$SPY clear 289.19 and hold above get fast ru...,2019-06-14 18:52:37,0.32
8841,62230,"$VIX LOD, $SPY may spike here in ""Power hour!""",2019-06-14 18:52:04,0.00
9714,30877,Sold #TRADING #FOLLOWTHEBULL #SPREADTRADING #S...,2019-06-14 09:55:19,0.00
11721,30878,Sold #TRADING #FOLLOWTHEBULL #SPREADTRADING #S...,2019-06-14 09:55:19,0.00
11927,30879,Sold #TRADING #FOLLOWTHEBULL #SPREADTRADING #S...,2019-06-14 09:55:19,0.00
12050,30880,Sold #TRADING #FOLLOWTHEBULL #SPREADTRADING #S...,2019-06-14 09:55:18,0.00
12588,30881,Sold #TRADING #FOLLOWTHEBULL #SPREADTRADING #S...,2019-06-14 09:55:18,0.00
12630,30965,#BuenosDiasATodos #trading #forex 📉😉 https:...,2019-06-14 09:31:48,0.00
13043,31831,‘Monopoly’-Style #BlockchainProperty #Trading ...,2019-06-14 05:55:31,0.00


In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [37]:
# Exploratory: list the attributes and methods exposed by the VADER analyzer class.
dir(SentimentIntensityAnalyzer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_amplify_ep',
 '_amplify_qm',
 '_but_check',
 '_idioms_check',
 '_least_check',
 '_never_check',
 '_punctuation_emphasis',
 '_sift_sentiment_scores',
 'make_lex_dict',
 'polarity_scores',
 'score_valence',
 'sentiment_valence']

In [28]:
def vectorized_sentiment(text):
    """Score one text with the lexicon model, for use with ``Series.apply``.

    Returns ``model.score(text)['score']``. Non-string input (e.g. a NaN cell)
    and data-related failures inside the scorer yield the neutral score 0 so
    one bad row does not abort a whole column.

    Unlike the previous bare ``except:``, a missing global ``model``
    (``NameError``), missing NLTK resources and ``KeyboardInterrupt`` are no
    longer swallowed. A mis-ordered notebook run now fails loudly instead of
    silently scoring everything as 0.
    """
    if not isinstance(text, str):
        return 0
    try:
        return model.score(text)['score']
    except (AttributeError, KeyError, TypeError, ValueError):
        return 0

In [50]:
def vader_sentiment(text):
    """Score one text with VADER, for use with ``Series.apply``.

    Returns the ``'compound'`` entry of ``model2.polarity_scores(text)``.
    Non-string input (e.g. a NaN cell) and data-related failures inside the
    analyzer yield the neutral score 0.

    Unlike the previous bare ``except:``, a missing global ``model2``
    (``NameError``), missing NLTK resources and ``KeyboardInterrupt`` are no
    longer swallowed. A mis-ordered notebook run now fails loudly instead of
    silently scoring everything as 0.
    """
    if not isinstance(text, str):
        return 0
    try:
        return model2.polarity_scores(text)['compound']
    except (AttributeError, KeyError, TypeError, ValueError):
        return 0

In [81]:
# Spot-check the lexicon scorer on one sentence.
# NOTE: this relies on the global `model`, which is only created in a later
# cell in file order -- run that cell first (or move it above this one).
vectorized_sentiment("The scooter craze is purely hype")

-0.12

In [82]:
# Same sentence through VADER for comparison.
# NOTE: this relies on the global `model2`, which is only created in the
# second-to-last cell in file order -- run that cell first (or move it above this one).
vader_sentiment("The scooter craze is purely hype")

-0.1531

In [29]:
# Instantiate the lexicon-based scorer. `vectorized_sentiment` looks this
# global up by name at call time, so it must exist before that function is called.
model = Sentiment()

In [30]:
# Load the tweets to be scored. This rebinds `df`, which the database cell in
# this notebook also assigns.
df = pd.read_csv('trading-tweets.csv')

In [56]:
# Score every tweet's text with the lexicon model (one Python call per row)
# and store the result in a new `our_scores` column.
df['our_scores'] = df.text.apply(vectorized_sentiment)

In [57]:
# Score every tweet's text with VADER and store the compound score in a new
# `vader_scores` column.
df['vader_scores'] = df.text.apply(vader_sentiment)

In [58]:
# Show the text next to both models' scores for a side-by-side comparison.
df[['text','our_scores','vader_scores']]

Unnamed: 0,text,our_scores,vader_scores
0,RT @maya_preferred: Wow 🔥\nMaya Preferred 223 ...,0.420000,0.6239
1,$USDCAD has been slammed over the last 4 #trad...,-0.040000,-0.4215
2,$USDJPY has hit a 5-month low of 107.0696 at 0...,0.030000,-0.2732
3,$NZDUSD has hit a week high of 0.66 at 04:15 o...,0.280000,0.0000
4,RT @MartinRapaport: #HongKong fair opens with ...,0.340000,0.7964
5,RT @myforexeye: 📊Trade Call 📉|#EURINR |#Sell a...,0.000000,0.0000
6,"An Ok! Day \n\nDone for the day , again lovely...",0.700000,0.8748
7,RT @SiptoSlurrency: Verbalizing my thoughts ar...,-0.240000,-0.2960
8,RT @murthaburke: Let's go South Korea!! @authp...,0.000000,0.0000
9,RT @murthaburke: Look how easy it is to Mine X...,0.380000,0.4404


In [59]:
# Persist the frame (including both score columns) to disk. With to_csv's
# defaults the row index is written as the first, unnamed column.
df.to_csv('comparing_models.csv')

In [45]:
# Instantiate the VADER analyzer. `vader_sentiment` looks this global up by
# name at call time, so it must exist before that function is called.
model2 = SentimentIntensityAnalyzer()

In [49]:
# Sanity check: inspect the dict VADER returns for a single word.
model2.polarity_scores('hey')

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}