# CS591 Final Project - Corey Clemente & Renzo Callejas

## Part 1: Functions for Scraping Twitter for Historical Tweets

In [None]:
from dateutil.parser import parse
from datetime import timedelta
import sqlite3
import got # 3rd party library for scraping twitter
import os

def check_folder(fname): # make proper folder structure for data
    """Make sure the directory ``fname`` exists, creating it if necessary.

    Missing parent directories are created as well.

    Raises OSError if the directory cannot be created, or if ``fname``
    already exists but is not a directory.
    """
    if os.path.isdir(fname):
        return
    try:
        os.makedirs(fname) # unlike os.mkdir, also creates missing parents
    except OSError:
        # Something else may have created the folder between the check above
        # and this call; only give up if it really is not there.
        if not os.path.isdir(fname):
            raise
        
def make_new(name): # make a database file to store tweets
    """Create ``tweet_data/<name>.db`` containing one empty table for tweets.

    name: keyword the tweets are scraped for; spaces are replaced with
          underscores and the result is used as both file and table name.

    The table has five TEXT columns: username, tweet, date, retweets and
    favorites.

    Raises sqlite3.OperationalError if the table already exists (the
    statement has no IF NOT EXISTS), so an existing database is never
    silently reused.
    """
    name = name.replace(' ', '_')
    check_folder('tweet_data')
    col_type = 'TEXT'
    col_names = ('username', 'tweet', 'date', 'retweets', 'favorites')
    col_defs = ', '.join('{cn} {ct}'.format(cn=cn, ct=col_type)
                         for cn in col_names)
    conn = sqlite3.connect('tweet_data/' + name + '.db')
    try:
        # identifiers cannot be bound as SQL parameters, so the table name
        # has to be formatted into the statement
        conn.execute('CREATE TABLE {tn} ({cols})'.format(tn=name, cols=col_defs))
        conn.commit()
    finally:
        conn.close() # also runs when CREATE TABLE fails, so nothing leaks

def get_data(query, start, end, N, show=True):
    """Scrape up to ``N`` tweets matching ``query`` for every day in a range.

    query: search string handed to the scraper
    start: first day to scrape, as a date string dateutil can parse
    end:   last day to scrape (inclusive), same kind of string
    N:     maximum number of tweets requested per day
    show:  print a progress line before each day's request

    Returns a list of [username, text, 'YYYY-MM-DD', retweets, favorites]
    lists, with retweets and favorites converted to strings.

    Scraping is best-effort: it stops at the first day that yields no tweets
    or at the first error, and returns whatever was collected up to then.
    """
    first_day = parse(start)
    n_days = (parse(end) - first_day).days + 1 # both end points are scraped

    tweet_data = []
    try:
        for offset in range(n_days):
            since = (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
            until = (first_day + timedelta(days=offset + 1)).strftime("%Y-%m-%d")
            if show:
                print('Query: %s\n%d tweet(s) from %s\n' % (query, N, since))
            tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query).\
                            setSince(since).setUntil(until).\
                            setMaxTweets(N) # scrape specifications
            tweets = got.manager.TweetManager.getTweets(tweetCriteria) # actually scrape for tweets
            if not tweets: # IP blocked or no tweets found
                break
            tweet_data.extend([tweet.username,
                               tweet.text,
                               tweet.date.strftime("%Y-%m-%d"),
                               str(tweet.retweets),
                               str(tweet.favorites)] for tweet in tweets)
    except (Exception, KeyboardInterrupt) as err:
        # Deliberately broad: a failure (or a manual interrupt) part-way
        # through a long scrape must not throw away the tweets already
        # collected. Report it instead of hiding it.
        print('Scraping "%s" stopped early after %d tweet(s): %r'
              % (query, len(tweet_data), err))
    return tweet_data

def run(N, keyword, start_date, end_date, show=True):
    """Scrape tweets for ``keyword`` and store them in a new SQLite database.

    N:          maximum number of tweets to request per day
    keyword:    search term; with spaces replaced by underscores it is also
                the database file name and table name
    start_date: first day to scrape (date string)
    end_date:   last day to scrape (date string)
    show:       passed to get_data to print per-day progress

    Whatever make_new raises (for example when the table already exists)
    propagates to the caller. Nothing is inserted when the scrape returns
    no tweets.
    """
    table = keyword.replace(' ', '_')
    make_new(keyword) # make new database for these tweets
    query_string = 'INSERT INTO ' + table + ' VALUES (?, ?, ?, ?, ?)'
    conn = sqlite3.connect('tweet_data/' + table + '.db')
    try:
        to_insert = get_data(keyword, start_date, end_date, N, show=show) # all tweets
        if not to_insert:
            print(keyword)
            print('Returned empty tweet list: IP may be blocked or no tweets found!')
            print('Shutting down, nothing saved...')
            return
        conn.executemany(query_string, to_insert) # insert all into database
        print(keyword)
        print('Completed successfully!')
        print('Saving and shutting down...')
        conn.commit()
    finally:
        conn.close() # released on every path, including errors
    print('Everything saved.\n')

## Part 2: Functions for Parsing Tweets for Sentiment and Pairing Data with Stock Data

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer # sentiment analysis
from datetime import timedelta, date
from dateutil.parser import parse
from yahoo_finance import Share # library to get stock data
from textblob import TextBlob # for polarity and subjectivity analysis
import numpy as np
import sqlite3
import pickle
import os

def check_folder(fname): # make proper folder structure for data
    """Create the directory ``fname`` unless it is already there.

    Any missing parent directories are created too.

    Raises OSError when creation fails or when ``fname`` exists but is not
    a directory.
    """
    if os.path.isdir(fname):
        return
    try:
        os.makedirs(fname) # handles nested paths, which os.mkdir cannot
    except OSError:
        # The folder may have appeared between the isdir check and makedirs;
        # that is fine, anything else is a real error.
        if not os.path.isdir(fname):
            raise
        
def load_database(name): # return database as list of tuples (inside tuple is entry)
    """Return every row of the tweet table for ``name`` as a list of tuples.

    name: keyword the tweets were scraped for; spaces are replaced with
          underscores to get the database file name and table name.

    Raises sqlite3.OperationalError if the table cannot be queried.
    """
    name = name.replace(' ', '_')
    check_folder('tweet_data')
    conn = sqlite3.connect('tweet_data/' + name + '.db')
    try:
        # table names cannot be bound as SQL parameters, hence the concatenation
        return conn.execute("SELECT * FROM " + name).fetchall()
    finally:
        # "with conn:" would only end the transaction; close explicitly so
        # the connection is not left open
        conn.close()
        
def stock_data(stock, start_date, end_date):
    '''
    Turn a stock's historical prices into per-day movement data.

    stock: string of stock symbol (e.g. YHOO)
    start_date: earliest date to fetch, as a yyyy-mm-dd string
    end_date: latest date to fetch, same format as start_date

    Returns three numpy arrays of equal length:
      1 where close - open is positive, otherwise -1
      close - open for each row
      each row's date moved back by one day, as yyyy-mm-dd strings
    '''
    # fetch the history and reverse it so the rows are in chronological order
    history = Share(stock).get_historical(start_date, end_date)
    history.reverse()

    directions, deltas, shifted_days = [], [], [] # datasets for models later
    for row in history:
        day = parse(row['Date'])
        opened = float(row['Open'])
        closed = float(row['Close'])
        delta = closed - opened
        directions.append(1 if delta > 0 else -1)
        deltas.append(delta)
        shifted_days.append((day - timedelta(days=1)).strftime("%Y-%m-%d"))

    return np.array(directions), np.array(deltas), np.array(shifted_days)
    
def build_company(stock_name, keyword, start_date='2015-01-01', end_date='2015-12-31'):
    """Build the feature matrix and targets for one company.

    stock_name: stock symbol passed to stock_data
    keyword:    keyword whose tweet database is loaded with load_database
    start_date: first day of stock data to use, yyyy-mm-dd
    end_date:   last day of stock data to use, yyyy-mm-dd

    Returns (X, pos1, change). pos1 and change come straight from
    stock_data. X has one row per entry of the dates returned by stock_data
    and nine columns: the weighted averages of retweets, favorites, the
    four polarity_scores values (compound, neg, neu, pos) and the TextBlob
    polarity and subjectivity of that date's tweets, followed by the
    previous row's price change.

    Each tweet is weighted by retweets + favorites, so tweets with neither
    contribute nothing. A date with no tweets, or whose tweets all have
    zero weight, gets zeros for the eight tweet features.
    """
    pos1, change, dates = stock_data(stock_name, start_date, end_date)
    sid = SentimentIntensityAnalyzer()
    n_features = 8

    # date -> [weighted sum of each feature..., total weight]
    data = {}
    for tweet in load_database(keyword): # tweet = (user, tweet, date, retweets, favorites)
        _user, text, tdate, ret, fav = tweet
        ret, fav = int(ret), int(fav)
        s = sid.polarity_scores(text)
        pol, sub = TextBlob(text).sentiment
        alpha = ret + fav # place more weight on popular tweets
        features = (ret, fav, s['compound'], s['neg'], s['neu'], s['pos'], pol, sub)
        sums = data.setdefault(tdate, [0] * (n_features + 1))
        for j, value in enumerate(features):
            sums[j] += value * alpha
        sums[n_features] += alpha

    X = [] # input data matrix for models
    for i in range(len(dates)):
        sums = data.get(dates[i])
        if sums is not None and sums[n_features] != 0:
            total = float(sums[n_features])
            row = [value / total for value in sums[:n_features]]
        else:
            # no usable tweets for this date: use zeros rather than
            # whatever values happen to be left over from an earlier date
            row = [0.0] * n_features
        # the first row has no past stock data; change[-1] would silently
        # wrap around to the last row
        previous_change = change[i - 1] if i > 0 else 0
        X.append(row + [previous_change]) # vector in training matrix

    return np.array(X), pos1, change
    
def make_entire_dataset(keywords, stock_names, dataset_name): # save everything to pickled file
    """Build every company's data and pickle it to ``datasets/<dataset_name>.p``.

    keywords:     tweet search keywords, one per company
    stock_names:  stock symbols, in the same order as ``keywords``
    dataset_name: file name (without extension) for the pickled dataset

    The pickled object is a dict mapping each stock symbol to the list
    [X, y1, y2] returned by build_company.
    """
    data = {}
    for i, keyword in enumerate(keywords):
        stock = stock_names[i]
        print('Parsing %s Tweet Data' % stock)
        X, y1, y2 = build_company(stock, keyword)
        data[stock] = [X, y1, y2]
    check_folder('datasets')
    # the with-block closes the file so the pickle is fully written out
    with open('datasets/' + dataset_name + '.p', 'wb') as out_file:
        pickle.dump(data, out_file)

## Part 3: Functions for Loading Dataset and Building Models

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import numpy as np
import pickle
import os

%matplotlib inline

def check_folder(fname): # make proper folder structure for data
    """Guarantee that the directory ``fname`` exists.

    The directory and any missing parents are created when absent.

    Raises OSError if it cannot be created or if ``fname`` exists and is
    not a directory.
    """
    if os.path.isdir(fname):
        return
    try:
        os.makedirs(fname) # os.mkdir would fail on a nested path
    except OSError:
        # Losing a creation race to another process is harmless; re-raise
        # only if the directory still does not exist.
        if not os.path.isdir(fname):
            raise
        
def load_entire_dataset(dataset_name):
    """Load and return the object pickled in ``datasets/<dataset_name>.p``.

    Raises IOError/OSError if the file does not exist.
    """
    check_folder('datasets')
    # NOTE: unpickling can execute arbitrary code; only load dataset files
    # that you created yourself.
    with open('datasets/' + dataset_name + '.p', 'rb') as in_file:
        return pickle.load(in_file)

def random_split(X, y, percentage): # randomly split data into training and testing
    """Randomly partition the samples into a training part and a testing part.

    X: samples, indexable along the first axis once converted to an array
    y: one target per sample
    percentage: fraction of the samples (0 to 1) that goes to training

    Returns X_train, X_test, y_train, y_test as numpy arrays. Uses numpy's
    global random state, so seed it for a reproducible split.
    """
    X, y = np.array(X), np.array(y)
    n_train = int(round(len(y) * percentage))
    order = list(range(len(y)))
    np.random.shuffle(order)
    # the first n_train shuffled indices are the training share; previously
    # this slice was handed to the test set, inverting the requested split
    train = order[:n_train]
    test = order[n_train:]
    return X[train], X[test], y[train], y[test]
        
def predict_using_all(clfs, X_test, y_test): # random forest of all models' answers
    """Score a random forest trained on the other classifiers' predictions.

    clfs:   sequence whose items hold an already-fitted classifier at index 0
    X_test: samples to split into two parts with random_split(..., .7)
    y_test: targets matching X_test

    The forest is fitted on the classifiers' predictions for the first part
    returned by random_split and its accuracy on the second part is returned.
    """
    X_fit, X_eval, y_fit, y_eval = random_split(X_test, y_test, .7)

    def stacked_predictions(samples):
        # one row per sample, one column per base classifier
        return np.array([entry[0].predict(samples) for entry in clfs]).T

    fit_inputs = stacked_predictions(X_fit)
    combiner = RandomForestClassifier()
    combiner.fit(fit_inputs, y_fit)
    return combiner.score(stacked_predictions(X_eval), y_eval)

def all_classifiers(dataset_name, percentage): # train each model on each company
    """Plot, per company, the mean accuracy of nine classifiers.

    dataset_name: name of the pickled dataset to load
    percentage:   value passed to random_split for each company

    For every company in the dataset the data is split once, each classifier
    is fitted and scored, and the scores are averaged. The averages are
    shown as a bar chart; nothing is returned.
    """
    data = load_entire_dataset(dataset_name)
    positions = list(range(1, len(data) + 1))
    company_names = []
    mean_scores = []
    for company in data:
        X, y = data[company][0], data[company][1]
        X_train, X_test, y_train, y_test = random_split(X, y, percentage)
        # KNN, L-SVM, R-SVM, ABC, DT, RF, GNB, LDA, QDA
        models = [KNeighborsClassifier(3),
                  SVC(kernel="linear", C=0.025),
                  SVC(gamma=2, C=1),
                  AdaBoostClassifier(),
                  DecisionTreeClassifier(max_depth=8),
                  RandomForestClassifier(max_depth=8),
                  GaussianNB(),
                  LinearDiscriminantAnalysis(),
                  QuadraticDiscriminantAnalysis()]
        score_total = 0
        for model in models:
            model.fit(X_train, y_train)
            score_total += model.score(X_test, y_test) # get accuracy
        company_names.append(company)
        mean_scores.append(score_total / float(len(models)))

    plt.bar(positions, mean_scores)
    for position, score in zip(positions, mean_scores):
        plt.text(position + .15, score + .01, '%.2f' % score)
    plt.xticks([position + .5 for position in positions], company_names)
    plt.title('Stock Movement Prediction Accuracy')
    plt.xlabel('Stock')
    plt.ylabel('Accuracy')
    plt.ylim(.4, .6) # if you know the ranges of accuracies, makes graph prettier
    plt.show()
    
def aggregated(dataset_name, percentage): # "mini" stock market example, all tweets predict all stocks
    """Pool every company's data, then plot each classifier's accuracy.

    dataset_name: name of the pickled dataset to load
    percentage:   value passed to random_split for the pooled data

    All companies' samples and targets are concatenated, split once, and
    each of the nine classifiers is fitted and scored. The scores are shown
    as a bar chart and the predict_using_all score is printed afterwards.
    """
    data = load_entire_dataset(dataset_name)
    samples, targets = [], []
    for company in data:
        samples.extend(data[company][0])
        targets.extend(data[company][1])
    X = np.array(samples)
    y = np.array(targets)
    X_train, X_test, y_train, y_test = random_split(X, y, percentage)

    classifiers = [[KNeighborsClassifier(3),'KNN'],
                   [SVC(kernel="linear", C=0.025),'L-SVM'],
                   [SVC(gamma=2, C=1),'R-SVM'],
                   [AdaBoostClassifier(), 'ABC'],
                   [DecisionTreeClassifier(max_depth=8),'DT'],
                   [RandomForestClassifier(max_depth=8),'RF'],
                   [GaussianNB(),'GNB'],
                   [LinearDiscriminantAnalysis(),'LDA'],
                   [QuadraticDiscriminantAnalysis(),'QDA']]
    method_names = []
    scores = []
    for model, label in classifiers:
        method_names.append(label)
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))

    positions = list(range(1, 10)) # one bar per classifier
    plt.bar(positions, scores)
    for position, score in zip(positions, scores):
        plt.text(position + .0, score + .01, '%.2f' % score)
    plt.xticks([position + .5 for position in positions], method_names)
    plt.title('Stock Market Movement Prediction Accuracy')
    plt.xlabel('Method')
    plt.ylabel('Accuracy')
    plt.ylim(.4, .6) # if you know the ranges of accuracies, makes graph prettier
    plt.show()
    print('Heuristic: %.3f' % predict_using_all(classifiers, X_test, y_test))

## Steps Taken for Our Exact Project

In [None]:
import time

# DO NOT RUN THIS - WILL TAKE ABOUT 17 HOURS
def this_project():
    """Run the whole pipeline for the eight project companies over 2015.

    Scrapes 400 tweets per day per keyword, builds and pickles the
    'all_project_data' dataset, then shows both result plots.
    """
    companies = [('citigroup', 'C'),
                 ('netflix', 'NFLX'),
                 ('tesla', 'TSLA'),
                 ('twitter', 'TWTR'),
                 ('mcdonalds', 'MCD'),
                 ('walmart', 'WMT'),
                 ('microsoft', 'MSFT'),
                 ('disney', 'DIS')]
    keywords = [search_term for search_term, _ in companies]
    stock_names = [symbol for _, symbol in companies]
    dataset_name = 'all_project_data'
    training_partition = .7

    # part 1: get tweets and store them
    for search_term in keywords:
        run(400, search_term, '2015-01-01', '2015-12-31')
        time.sleep(5) # to avoid IP address being blocked

    # part 2: sentiment analysis and pairing with stock data (making dataset)
    make_entire_dataset(keywords, stock_names, dataset_name)

    # part 3: results!
    all_classifiers(dataset_name, training_partition)
    aggregated(dataset_name, training_partition)
    
# CAN RUN THIS
def see_results_of_project():
    """Show both result plots from the saved 'all_project_data' dataset.

    No scraping or dataset building happens here; the pickled dataset must
    already exist.
    """
    saved_dataset = 'all_project_data'
    partition = .7
    for make_report in (all_classifiers, aggregated):
        make_report(saved_dataset, partition)

see_results_of_project()

## Make Your Own Small Version

In [None]:
import time

def make_small_version(keywords, stock_names, dataset_name, tweets_per_day, 
                       start_date, end_date, training_partition, show):
    """Run the scrape / dataset / results pipeline on caller-chosen inputs.

    keywords:           tweet search terms, one per company
    stock_names:        stock symbols in the same order as ``keywords``
    dataset_name:       name to save the pickled dataset under
    tweets_per_day:     maximum tweets requested per day and keyword
    start_date:         first day to scrape
    end_date:           last day to scrape
    training_partition: value handed to both result functions
    show:               print per-day progress while scraping
    """
    # part 1: get tweets and store them
    for search_term in keywords:
        run(tweets_per_day, search_term, start_date, end_date, show=show)
        time.sleep(5) # to avoid IP address being blocked

    # part 2: sentiment analysis and pairing with stock data (making dataset)
    make_entire_dataset(keywords, stock_names, dataset_name)

    # part 3: results!
    for make_report in (all_classifiers, aggregated):
        make_report(dataset_name, training_partition)
    
# Change these variables to whatever companies you like
# This takes about 2 minutes to run, can see entire process
keywords = ['netflix', 'facebook'] # what to search for in tweets
stock_names = ['NFLX', 'FB'] # must be correct or will fail
dataset_name = 'test02' # what to save dataset as in dataset folder
tweets_per_day = 10 # how many tweets to get per day
start_date = '2015-01-01' # day to start mining tweets
end_date = '2015-01-15' # day to end mining tweets
training_partition = .7 # percent of data to be used for training
show = True # print each day's progress while scraping, change to False to not do so

# If you re-scrape a company, delete its .db file first. This won't run until you do;
# that is a safety precaution so we wouldn't lose all of our data by accident.
make_small_version(keywords, stock_names, dataset_name, tweets_per_day, 
                   start_date, end_date, training_partition, show)