### Libraries

In [1]:
import os
import sys
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
%matplotlib inline

### Quandl

In [3]:
import quandl

In [4]:
# Read the Quandl API key from the environment instead of committing it to the notebook.
# Set it before launching Jupyter, e.g. `export QUANDL_API_KEY=<your key>`.
# A missing variable raises KeyError here, rather than failing later inside a Quandl call.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed and
# should be rotated.
api_key = os.environ["QUANDL_API_KEY"]
quandl.ApiConfig.api_key = api_key

### Helper Functions

In [5]:
# Directory added to the import search path so later cells can import from it.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
filepath = '/Users/Joe/Documents/Metis/Projects/metis-two-Luther/python-scripts'
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if filepath not in sys.path:
    sys.path.append(filepath)

In [8]:
# `luther` is a project-local module; presumably it lives in the directory appended to
# sys.path in the earlier cell, so that cell must run first -- TODO confirm.
from luther import nasdaq_companies
# NOTE(review): the meaning of the 'abbrev' argument and the structure of the returned
# object are defined in luther.py, not in this notebook -- verify there.
ticker_dict = nasdaq_companies('abbrev')

In [10]:
#ticker_dict

In [21]:
def open_file(filename, filepath='/Users/Joe/Documents/Metis/Projects/metis-two-Luther/wikipedia-revision-logs-abbrev/'):
    """
    Opens the pickled dataframe stored at the specified location.
    ---
    IN: filename (string) - name of the pickle file
        filepath (string) - directory containing the file; a trailing path
                            separator is optional
    OUT: pandas dataframe (more precisely, whatever object the pickle contains)
    """
    # os.path.join works whether or not `filepath` ends with a separator; plain string
    # concatenation silently built a wrong path when the trailing slash was missing.
    full_path = os.path.join(filepath, filename)
    # SECURITY: pickle.load can execute arbitrary code -- only open files you created.
    with open(full_path, 'rb') as picklefile:
        return pickle.load(picklefile)

In [55]:
# Spot-check: load one raw revision log (ticker ADBE) from open_file's default directory.
temp = open_file('revision-log-ADBE.pkl')

In [76]:
# Preview the first rows of the raw log; size and size_delta are still parenthesised strings here.
temp.head()

Unnamed: 0,date,user,size,size_delta,minor_edit
0,2017-09-28 07:10:00,Sleske (talk | contribs),"(48,694 bytes)",(0),m
1,2017-09-28 05:05:00,Agrasen8080 (talk | contribs),"(48,694 bytes)",(0),0
2,2017-09-23 18:51:00,TaerkastUA (talk | contribs),"(48,694 bytes)",(-30),0
3,2017-09-21 05:30:00,Jon Kolbert (talk | contribs),"(48,724 bytes)",(-8),m
4,2017-09-20 19:40:00,Software121 (talk | contribs),"(48,732 bytes)",(-1),0


In [20]:
def save_file(df, filename, filepath='/Users/Joe/Documents/Metis/Projects/metis-two-Luther/combined-wiki-price-data/'):
    """
    Pickles a dataframe to the specified location.
    ---
    IN: df (dataframe)    - object to pickle
        filename (string) - name of the pickle file to write
        filepath (string) - existing directory to write into; a trailing path
                            separator is optional
    OUT: void
    """
    # os.path.join works whether or not `filepath` ends with a separator; plain string
    # concatenation wrote to a sibling path (e.g. '<dir><filename>') when it was missing.
    full_path = os.path.join(filepath, filename)
    with open(full_path, 'wb') as picklefile:
        pickle.dump(df, picklefile)

In [28]:
def pickle_file_name(abbrev):
    """
    Build the name of the pickle file that holds the scraped wikipedia revision
    history for the company with the given stock ticker.
    ---
    IN: string
    OUT: string
    """
    # e.g. 'ADBE' -> 'revision-log-ADBE.pkl'
    return ''.join(['revision-log-', abbrev, '.pkl'])

In [41]:
def abbrev_from_filename(filename):
    """
    Recover the stock ticker from the name of a pickled wikipedia revision log
    by removing the 'revision-log-' and '.pkl' fragments.
    ---
    IN: string
    OUT: string
    """
    ticker = filename
    # Remove every occurrence of each fragment, in this order.
    for fragment in ('revision-log-', '.pkl'):
        ticker = ticker.replace(fragment, '')
    return ticker

### Clean and Organize Wikipedia Data

In [88]:
def _parse_size(text):
    """Convert a scraped page size such as '(48,694 bytes)' or '(empty)' to an int."""
    cleaned = text.replace('(empty)', '0')  # Note: Filling in data
    cleaned = cleaned.strip('(').replace(' bytes)', '').replace(',', '')
    return int(cleaned)


def _parse_size_delta(text):
    """Convert a scraped size change such as '(-30)' or '(1,204)' to an int."""
    return int(text.replace('(', '').replace(')', '').replace(',', ''))


def format_wikipedia_df(filename):
    """
    Loads a saved dataframe of wikipedia revision history and returns a cleaned dataframe
    aggregated to one row per calendar day.
    ---
    IN: string (name of the pickled revision log, loaded with open_file)
    OUT: pandas dataframe with a datetime 'date' column and the per-day sums of the
         numeric columns ('size (bytes)', 'size_delta', 'minor_edit', 'edit_count')
    """
    # rename() returns a new frame, so the freshly loaded object is not mutated in place.
    df = open_file(filename).rename(columns={'date': 'datetime',
                                             'size': 'size (bytes)'})

    # Keep the text before the first '(' (drops the "(talk | contribs)" suffix).
    df['user'] = df['user'].apply(lambda x: x.split('(')[0])

    df['size (bytes)'] = df['size (bytes)'].apply(_parse_size).astype(int)
    df['size_delta'] = df['size_delta'].apply(_parse_size_delta).astype(int)

    # Minor edits are marked 'm' in the raw log; turn the marker into an explicit 0/1
    # integer flag so the daily sum counts minor edits instead of relying on a mixed
    # object column.
    df['minor_edit'] = df['minor_edit'].eq('m').astype(int)
    df['edit_count'] = 1

    # Truncate each timestamp to its calendar day, kept as datetime64 so it can be
    # merged against other datetime 'date' columns.
    df['date'] = pd.to_datetime(df['datetime'].dt.date)

    # numeric_only=True: the frame still holds the string 'user' and the timestamp
    # 'datetime' columns. A bare .sum() used to drop them silently, but in pandas >= 2.0
    # it tries to aggregate them and raises on the datetime column.
    return df.groupby('date').sum(numeric_only=True).reset_index()

### Get stock data to match

In [89]:
def get_stock_data_to_match(abbrev, df_wikipedia):
    """
    Given the stock ticker and corresponding wikipedia history data, download the
    Quandl WIKI/PRICES rows for that ticker whose date lies between the earliest and
    latest wikipedia dates.
    ---
    IN: abbrev (string)          - stock ticker
        df_wikipedia (dataframe) - must contain a datetime 'date' column
    OUT: pandas dataframe returned by quandl.get_table, with 'date' converted to datetime
    """
    # '%m' is the zero-padded month. The previous '%M' is the *minute* field, which for
    # midnight timestamps rendered an invalid date such as '2017-00-28'.
    date_format = '%Y-%m-%d'
    start = df_wikipedia['date'].min().strftime(date_format)
    stop = df_wikipedia['date'].max().strftime(date_format)

    # 'gte' / 'lte' are the lower and upper bounds passed to Quandl's date filter.
    dates = {'gte': start, 'lte': stop}
    df_ticker = quandl.get_table('WIKI/PRICES', ticker=abbrev, date=dates)

    # Make 'date' a datetime column so it can be merged with the wikipedia 'date' column.
    df_ticker['date'] = pd.to_datetime(df_ticker['date'])
    return df_ticker

### Merge

In [90]:
def company_history(filename):
    """
    Build the combined wikipedia / stock-price dataframe for one pickled revision log.
    ---
    IN: string (file name of the pickled wikipedia revision log)
    OUT: (pandas dataframe, string) - the inner join of both sources on 'date' with
         rows containing missing values removed, and the stock ticker
    """
    ticker = abbrev_from_filename(filename)
    wiki_daily = format_wikipedia_df(filename)
    prices = get_stock_data_to_match(ticker, wiki_daily)
    # Inner join: keep only the dates present in both tables.
    merged = wiki_daily.merge(prices, how='inner', on='date')
    return merged.dropna(), ticker

### Let's do this for every company collected

Note: wait until the Wikipedia data has finished re-scraping before running this section.

In [94]:
# Directory listed by the batch loop below. company_history() loads each file through
# open_file()'s *default* directory, so this path must stay identical to that default.
filepath_wikipedia_logs = '/Users/Joe/Documents/Metis/Projects/metis-two-Luther/wikipedia-revision-logs-abbrev/'

In [95]:
# Build and pickle the merged wikipedia/stock dataframe for every revision log in the
# directory, skipping hidden entries (names starting with '.').
visible_logs = [entry for entry in os.listdir(filepath_wikipedia_logs)
                if not entry.startswith('.')]
for filename in visible_logs:
    df, abbrev = company_history(filename)
    output_name = 'wikipedia-and-stock-history-' + abbrev + '.pkl'
    save_file(df, output_name)
    print(f'Combined data for {abbrev}')

Combined data for AAL
Combined data for AAPL
Combined data for ADBE
Combined data for ADI
Combined data for ADP
Combined data for ADSK
Combined data for AKAM
Combined data for ALXN
Combined data for AMAT
Combined data for AMGN
Combined data for AMZN
Combined data for ATVI
Combined data for AVGO
Combined data for BBBY
Combined data for BIDU
Combined data for BIIB
Combined data for BMRN
Combined data for CA
Combined data for CELG
Combined data for CERN
Combined data for CHKP
Combined data for CHRW
Combined data for CHTR
Combined data for CMCSA
Combined data for COST
Combined data for CSCO
Combined data for CSX
Combined data for CTAS
Combined data for CTRP
Combined data for CTSH
Combined data for CTXS
Combined data for DISCA
Combined data for DISCK
Combined data for DISH
Combined data for DLTR
Combined data for EA
Combined data for EBAY
Combined data for EQIX
Combined data for ESRX
Combined data for EXPD
Combined data for EXPE
Combined data for FAST
Combined data for FB
Combined data for 