## Import Packages  

In [9]:
import sys,nltk,re,math,time
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.corpus import stopwords
import _pickle as pc

import tqdm
import pandas as pd
import tweepy, nltk, csv, pickle, collections
import numpy as np
from nltk.sentiment import sentiment_analyzer
from nltk.classify import NaiveBayesClassifier

## Classification Functions

In [10]:
# Functions for formating data and extracting features

def formatInputData(input_data): # Format input data to ([text_tokens], SA_classifer) tuples
    formattedSentimentData = []
    for i in range(0, len(input_data)):
        ## Change df type
        formattedSentimentData.append((input_data['tweets'][i].text.split(), input_data['Classification'][i]))
    return (formattedSentimentData)
def feature_extract_func(document, word_features): # Apply feature definement to tweet text
    #doc_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in doc_words)
    return features
def get_word_features(wordlist): # Get words of all tweets, arranged by frequency
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
def train_SA_classifier(input_data):
    # Define Sent_analysis and cross-validation wrapper
    SA = sentiment_analyzer.SentimentAnalyzer()
    # Prepare features
    all_words = get_word_features(SA.all_words(input_data))
    word_features = get_word_features(all_words)
    SA.add_feat_extractor(feature_extract_func, **{'word_features': word_features})
    training_set = SA.apply_features(input_data, labeled=True)  
    # Return classifier and write to file
    return (SA.train(NaiveBayesClassifier.train, training_set, save_classifier='sentiment_analysis_classifier'), word_features)

### Gather Data

In [30]:
amzn = pd.read_pickle('amzn_sample_data.p')
goog = pd.read_pickle('google_sample_data.pkl')
msft = pd.read_pickle('msft_sample_data.p')
aapl = pd.read_pickle('aapl_sample_data.p')
input_data = pd.concat([aapl, amzn, goog, msft])

In [28]:
input_data = pd.read_pickle('preprocessed_training.p')

In [29]:
input_data.head()

Unnamed: 0_level_0,text,retweets,favorites,followers
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-11-21 19:02:27,"[', 1, ']",0.0,0.0,0.0
2016-11-21 21:10:54,"[', 0, ']",0.0,0.0,0.0
2016-11-21 20:17:55,"[', 0, ']",0.0,0.0,0.0
2016-11-21 21:33:05,"[', -, ', ', 1, ']",0.0,0.0,0.0
2016-11-21 18:37:06,"[', 0, ']",0.0,0.0,0.0


### Display Data Statistics

In [12]:
input_data_counts = input_data['Classification'].value_counts()
print ('-------------------------------DATA--------------------------------\n')
print ('COUNTS(%d) -- Positive: %d, Neutral: %d, Negative: %d' %(input_data['Classification'].count(), input_data_counts[1], input_data_counts[0], input_data_counts[-1]))
print ('\n', input_data_counts)

-------------------------------DATA--------------------------------

COUNTS(2000) -- Positive: 474, Neutral: 1204, Negative: 322

  0    1204
 1     474
-1     322
Name: Classification, dtype: int64


### Train SentimentAnalyzer

In [13]:
print ('\n-------------------------------RUN MODEL--------------------------------\n')
SA_classifier, word_features = train_SA_classifier(formatInputData(input_data))


-------------------------------RUN MODEL--------------------------------

Training classifier
Saving sentiment_analysis_classifier


### Tweet Classifying

In [15]:
SA_classifier.classify(feature_extract_func(input_data.iloc[0]['tweets'].text.split(), word_features))

1

## Convert old Data format to new

In [16]:
input_data.head()

Unnamed: 0,Classification,Volume,favorites,followers,length,retweets,text,tweets
2016-11-21 19:02:27,1,,,,,,,"Status(in_reply_to_user_id=None, id_str='80077..."
2016-11-21 21:10:54,0,,,,,,,"Status(in_reply_to_user_id=None, extended_enti..."
2016-11-21 20:17:55,0,,,,,,,"Status(in_reply_to_user_id=None, id_str='80079..."
2016-11-21 21:33:05,-1,,,,,,,"Status(in_reply_to_user_id=None, id_str='80081..."
2016-11-21 18:37:06,0,,,,,,,"Status(in_reply_to_user_id=None, id_str='80077..."


In [33]:
input_data['text'] = [tweet.text for tweet in input_data['tweets']]

In [18]:
input_data.head()

Unnamed: 0,Classification,Volume,favorites,followers,length,retweets,text,tweets
2016-11-21 19:02:27,1,,,,,,Benzinga: Previewing #BlackFriday Week: Apple ...,"Status(in_reply_to_user_id=None, id_str='80077..."
2016-11-21 21:10:54,0,,,,,,"$AAPL Put Spread Trades 1,300 Times","Status(in_reply_to_user_id=None, extended_enti..."
2016-11-21 20:17:55,0,,,,,,#Apple offers iPhone 6S owners free battery re...,"Status(in_reply_to_user_id=None, id_str='80079..."
2016-11-21 21:33:05,-1,,,,,,$AAPL:\n\nForget Apple! Here’s a Better Stock ...,"Status(in_reply_to_user_id=None, id_str='80081..."
2016-11-21 18:37:06,0,,,,,,Apple $AAPL Should Buy Netflix to Boost Growth...,"Status(in_reply_to_user_id=None, id_str='80077..."


### Below is the Results

classifier = classify_tweet_sentiment(SA_classifier, word_features)
classifiers = classifier.classify_tweets(input_data[:100])
collections.Counter(classifiers)

## Preprocessing

In [31]:
input_data.head()

Unnamed: 0,Classification,Volume,favorites,followers,length,retweets,text,tweets
2016-11-21 19:02:27,1,,,,,,,"Status(in_reply_to_user_id=None, id_str='80077..."
2016-11-21 21:10:54,0,,,,,,,"Status(in_reply_to_user_id=None, extended_enti..."
2016-11-21 20:17:55,0,,,,,,,"Status(in_reply_to_user_id=None, id_str='80079..."
2016-11-21 21:33:05,-1,,,,,,,"Status(in_reply_to_user_id=None, id_str='80081..."
2016-11-21 18:37:06,0,,,,,,,"Status(in_reply_to_user_id=None, id_str='80077..."


In [32]:
STOCKS = ['aapl','goog','amzn','msft']
SYMBOLS = ['@','#','$','.',',',':', '…','...','(',')','"','[',']']
REMOVABLES = ['rt'] #'the', 'my','i','we','me','you']
STOPWORDS = set(stopwords.words('english'))
EMOTICONS = [(':)','smile'), ('(:','smile'), ('):','frown'), (':(','frown'), (':D','biggrin'), (':\'(','crying'), (':\'‑(','crying'), (')\':','crying'), (')-\':','crying'), ('D:','sadness'), (':O','surprise'), (':o','shock') ]

def preprocess(pdata, file = False):
    print('Preprocessing...')
    
    if file:
        dataframe = pd.read_pickle(pdata) #get pickled dataset from location passed in as a parameter to the function
    if not file:
        dataframe = pdata
    preprocessed_dataframe = pd.DataFrame(columns=['Classification','volume','favorites','followers','length', 'retweets', 'text', 'tweets']).set_index('date')
    
    ##ITERATE THROUGH EVERY TWEET IN THE DATAFRAME
    for it, tweet in tqdm.tqdm(dataframe.iterrows()):
        text = str(tweet['text'])
        retweets = tweet['retweets']
        favorites = tweet['favorites']
        followers = tweet['followers']
        date = it
        
        ##if either retweets, favorites, or followers is NaN, replace NaN with 0
        if(math.isnan(retweets)):
            retweets = 0
        if(math.isnan(favorites)):
            favorites = 0
        if(math.isnan(followers)):
            followers = 0
            
        #text = text+':) :( :) :(' #TEST for emoticon replacement
        #text = text+"gooooood, jeeeeezz" #TEST for repeated letter reduction
        
        text = text.replace('#','') #remove hashes
        text = text.replace('%', 'percent')
        ##Iterate though listed emoticons and their corrisponding emotions, replace symbol with emotion word
        for symbol, emotion  in EMOTICONS:
            text = text.replace(symbol, emotion)
            
        text =  re.sub(r"http\S+", "", text) #remove URLs from Tweet text
        #text = re.sub(r'([a-zA-Z])\1{3,}', r'\1\1\1', text) #reduce excessively long repeated letters ##REPLACED BY TweetTokenizer built-in function
        tk = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) #create new TweetTokenizer, take all text to lowercase and remove users handles from Tweet text
        tk2 = MWETokenizer()
        
        tokenizedtext = tk.tokenize(text) #tokenize the Tweet text using TweetTokenizer
        tokenizedtext = tk2.tokenize(tokenizedtext)
        tokenizedtext = [word for word in tokenizedtext if (word not in STOPWORDS and word not in SYMBOLS and word not in STOCKS and word not in REMOVABLES)] #remove stopwords, extra symbols, target stocks, and other removable phrases
        
        temp = pd.DataFrame({
            'date':date,
            'text':[tokenizedtext],
            'retweets':retweets,
            'favorites':favorites,
            'followers':followers
        }, columns=['Classification','volume','favorites','followers','length', 'retweets', 'text', 'tweets']).set_index('date')
        preprocessed_dataframe = preprocessed_dataframe.append(temp)
        
    #print(preprocessed_dataframe) #printout of the preprocessed dataframe
    preprocessed_dataframe.to_pickle('preprocessed_tweets_s'+str(int(len(dataframe)/100))+'.p') #create a pickled dataframe with a semi-unique identifier (based on the number of rows in the dataframe)
    return preprocessed_dataframe #return the dataframe
    
#preprocess('final_data.p')

In [34]:
pd.to_pickle(preprocess(input_data), 'preprocessed_training.p') 

Preprocessing...


KeyError: 'date'