In [1]:
import datetime
import multiprocessing
import os
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
from fbprophet import Prophet

# Read in Raw Data and Keys

In [2]:
# Load the raw training data and replace every missing value with zero.
# (The frame is transposed in a later cell, not here.)
rawData_df = pd.read_csv('../input/train.csv').fillna(value=0.0)

# Create Date Range

In [3]:
# Forecast horizon: every calendar day from date_start through date_end,
# both endpoints included.
date_start = datetime.date(2017, 1, 1)
date_end = datetime.date(2017, 3, 1)
# Derived from the endpoints instead of being hard-coded, so the day count
# cannot drift out of sync with date_end.
num_days = (date_end - date_start).days + 1

date_list = [date_start + datetime.timedelta(days=offset) for offset in range(num_days)]

# Date Dataframe for Future Prediction

In [4]:
# Single-column frame of the dates to forecast; the column is named 'ds'
# because that is the name passed to Prophet's predict() further down.
future_df = pd.DataFrame({'ds': date_list})

### Set up Multiprocessing

In [5]:
# Swap rows and columns so that ProcessTimeSeries can address each series by
# column position. NOTE: this rebinds rawData_df to its own transpose, so
# running the cell a second time flips the frame back — run it exactly once.
rawData_df = rawData_df.transpose()

In [6]:
# Directory prefix for the forecast CSV and the error log written below.
FORECAST_DIR = 'forecasts/'
# Create it up front: open(path, 'a') creates a missing file but not a missing
# parent directory, so without this every write fails on a fresh checkout.
os.makedirs(FORECAST_DIR, exist_ok=True)

In [15]:
def ProcessTimeSeries(idx):
    """Fit Prophet to one column of ``rawData_df`` and forecast ``future_df``.

    Parameters
    ----------
    idx : int
        Positional column index into the (transposed) ``rawData_df``. Row 0 of
        that column is read as the page name; the remaining rows form the
        series, with their row labels parsed as ``%Y-%m-%d`` dates.

    Returns
    -------
    dict
        ``{'page': <page name>, 'forecast': <DataFrame>}``. On success the
        frame holds the columns ``ds``, ``yhat`` and ``page_date``
        (``"<page>_<date>"`` for each entry of ``date_list``), passed through
        ``round(4)``. If anything fails, the page name is appended to
        ``FORECAST_DIR + 'error_log'`` and an empty frame is returned instead.
    """
    page_name = rawData_df.iloc[0, idx]

    try:
        # Build the training frame: the former row labels become 'ds', the
        # values become 'y'.
        train_df = rawData_df.iloc[1:, idx].to_frame().reset_index().ffill()
        train_df.columns = ['ds', 'y']
        train_df['ds'] = pd.to_datetime(train_df['ds'], format='%Y-%m-%d')

        # Train out-of-the-box Prophet on the series.
        m = Prophet(yearly_seasonality=True)
        m.fit(train_df)

        # .copy() so the column added below is written to an independent frame
        # rather than to a slice of predict()'s output.
        forecast = m.predict(future_df)[['ds', 'yhat']].copy()
        forecast['page_date'] = pd.Series(
            ["{0}_{1}".format(page_name, day) for day in date_list]
        )

        return {'page': page_name,
                'forecast': forecast.round(4)}

    except Exception:
        # Deliberate best-effort: one failing series must not abort the whole
        # run. Only Exception is caught so KeyboardInterrupt / SystemExit still
        # propagate. format() keeps the log write from raising if the page
        # name is not a string.
        with open(FORECAST_DIR + 'error_log', 'a') as f:
            f.write("{0}\n".format(page_name))
        return {'page': page_name,
                'forecast': pd.DataFrame()}

### Run Multiprocessing

In [23]:
# Number of processed series between flushes of the accumulated forecasts.
FLUSH_EVERY = 20000


def _flush_forecasts(frames):
    """Append ``frames`` to FORECAST_DIR + 'all_rows.csv' as one headerless batch.

    Does nothing when ``frames`` is empty. The file is opened in append mode,
    so re-running the notebook adds rows to whatever is already on disk.
    """
    if not frames:
        return
    batch = pd.concat(frames, ignore_index=True)
    with open(FORECAST_DIR + 'all_rows.csv', 'a') as f:
        batch.to_csv(f, header=False)


globalCount = 0
# Non-empty forecasts collected since the last flush. Collecting in a list and
# concatenating once per flush avoids re-copying the whole frame per series.
pendingFrames = []

pool = multiprocessing.Pool()
try:
    for resFrame in pool.imap_unordered(ProcessTimeSeries, range(rawData_df.shape[1])):
        globalCount = globalCount + 1

        # Failed series come back as empty frames; there is nothing to store.
        if not resFrame['forecast'].empty:
            pendingFrames.append(resFrame['forecast'])

        # Reached a reasonable count -> write the batch to file and reset.
        if globalCount % FLUSH_EVERY == 0:
            print("writing")
            _flush_forecasts(pendingFrames)
            globalCount = 0
            pendingFrames = []

    # All rows done: let the workers exit normally.
    pool.close()
except BaseException:
    # Do not leave worker processes running if the loop is interrupted or fails.
    pool.terminate()
    raise
finally:
    pool.join()

# Flush whatever is left over from the last partial batch. jointFrame is kept
# as a module-level name holding that final batch, as before.
jointFrame = pd.concat(pendingFrames, ignore_index=True) if pendingFrames else pd.DataFrame()
_flush_forecasts(pendingFrames)

writing
writing
writing
writing
