In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from fbprophet import Prophet
import matplotlib.pyplot as plt
import math as math

%matplotlib inline

In [2]:
# Load the data
train = pd.read_csv("../input/train_1.csv").fillna(0)
keys = pd.read_csv("../input/key_1.csv").fillna(0)
ss = pd.read_csv("../input/sample_submission_1.csv").fillna(0)

In [3]:
train.head()

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2016-12-22,2016-12-23,2016-12-24,2016-12-25,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,32.0,63.0,15.0,26.0,14.0,20.0,22.0,19.0,18.0,20.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,17.0,42.0,28.0,15.0,9.0,30.0,52.0,45.0,26.0,20.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,3.0,1.0,1.0,7.0,4.0,4.0,6.0,3.0,4.0,17.0
3,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,...,32.0,10.0,26.0,27.0,16.0,11.0,17.0,19.0,10.0,11.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,48.0,9.0,25.0,13.0,3.0,11.0,27.0,13.0,36.0,10.0


In [4]:
def plot_prediction_and_actual_2(train, forecast, actual, xlim=None, ylim=None, figSize=None, title=None):
    fig, ax  = plt.subplots(1,1,figsize=figSize)
    ax.plot(pd.to_datetime(train.index), train.values, 'k.')
    ax.plot(pd.to_datetime(actual.index), actual.values, 'r.')
    ax.plot(pd.to_datetime(forecast.index), forecast.values, 'b-')
    ax.set_title(title)
    plt.show()
    

In [5]:
def median_model(df_train, df_actual, p, review=False, figSize=(12, 4)):
    
    def nanmedian_zero(a):
        return np.nan_to_num(np.nanmedian(a))
    
    df_train['y'] = df_train['y'].convert_objects(convert_numeric=True)
    df_actual['y'] = df_actual['y'].convert_objects(convert_numeric=True)
    visits = nanmedian_zero(df_train['y'].values[-p:])
    train_series = df_train['y']
    train_series.index = df_train.ds
    
    idx = np.arange( p) + np.arange(len(df_train)- p+1)[:,None]
    b = [row[row>=0] for row in df_train.y.values[idx]]
    pre_forecast = pd.Series(np.append(([float('nan')] * (p-1)), list(map(nanmedian_zero,b))))
    pre_forecast.index = df_train.ds
    
    forecast_series = pd.Series(np.repeat(visits, len(df_actual)))
    forecast_series.index = df_actual.ds
    
    forecast_series = pre_forecast.append(forecast_series)
    
    actual_series = df_actual.y
    actual_series.index = df_actual.ds
    
    print(forecast_series)
    print("--------------")
    print(actual_series)
    
    if(review):
        plot_prediction_and_actual_2(train_series, forecast_series, actual_series, figSize=figSize, title='Median model')
    
    return smape(forecast_series, actual_series)

In [6]:
def smape(predict, actual, debug=False):
    
    actual = actual.fillna(0)
    data = pd.concat([predict, actual], axis=1, keys=['predict', 'actual'])
    data = data[data.actual.notnull()]
    if debug:
        print('debug', data)
    
    evals = abs(data.predict - data.actual) * 1.0 / (abs(data.predict) + abs(data.actual)) * 2
    evals[evals.isnull()] = 0
    
    result = np.sum(evals) / len(data)
    
    return result

In [7]:
page_details = train.Page.str.extract(r'(?P<topic>.*)\_(?P<lang>.*).wikipedia.org\_(?P<access>.*)\_(?P<type>.*)')

page_details[0:10]

  """Entry point for launching an IPython kernel.


Unnamed: 0,topic,lang,access,type
0,2NE1,zh,all-access,spider
1,2PM,zh,all-access,spider
2,3C,zh,all-access,spider
3,4minute,zh,all-access,spider
4,52_Hz_I_Love_You,zh,all-access,spider
5,5566,zh,all-access,spider
6,91Days,zh,all-access,spider
7,A'N'D,zh,all-access,spider
8,AKB48,zh,all-access,spider
9,ASCII,zh,all-access,spider


In [8]:
def extract_series(df, row_num, start_idx):
    y = df.iloc[row_num, start_idx:]
    df = pd.DataFrame({ 'ds': y.index, 'y': y.values})
    return df

In [9]:
# Generate train and validate dataset

train_df = pd.concat([page_details, train], axis=1)

def get_train_validate_set(train_df, test_percent):
    train_end = math.floor((train_df.shape[1]-5) * (1-test_percent))
    train_ds = train_df.iloc[:, np.r_[0,1,2,3,4,5:train_end]]
    test_ds = train_df.iloc[:, np.r_[0,1,2,3,4,train_end:train_df.shape[1]]]
    return train_ds, test_ds

X_train, y_train = get_train_validate_set(train_df, 0.1)


In [10]:
train_df

Unnamed: 0,topic,lang,access,type,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,...,2016-12-22,2016-12-23,2016-12-24,2016-12-25,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31
0,2NE1,zh,all-access,spider,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,...,32.0,63.0,15.0,26.0,14.0,20.0,22.0,19.0,18.0,20.0
1,2PM,zh,all-access,spider,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,...,17.0,42.0,28.0,15.0,9.0,30.0,52.0,45.0,26.0,20.0
2,3C,zh,all-access,spider,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,...,3.0,1.0,1.0,7.0,4.0,4.0,6.0,3.0,4.0,17.0
3,4minute,zh,all-access,spider,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,...,32.0,10.0,26.0,27.0,16.0,11.0,17.0,19.0,10.0,11.0
4,52_Hz_I_Love_You,zh,all-access,spider,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,0.0,0.0,0.0,0.0,0.0,...,48.0,9.0,25.0,13.0,3.0,11.0,27.0,13.0,36.0,10.0
5,5566,zh,all-access,spider,5566_zh.wikipedia.org_all-access_spider,12.0,7.0,4.0,5.0,20.0,...,16.0,27.0,8.0,17.0,32.0,19.0,23.0,17.0,17.0,50.0
6,91Days,zh,all-access,spider,91Days_zh.wikipedia.org_all-access_spider,0.0,0.0,0.0,0.0,0.0,...,2.0,7.0,33.0,8.0,11.0,4.0,15.0,6.0,8.0,6.0
7,A'N'D,zh,all-access,spider,A'N'D_zh.wikipedia.org_all-access_spider,118.0,26.0,30.0,24.0,29.0,...,64.0,35.0,35.0,28.0,20.0,23.0,32.0,39.0,32.0,17.0
8,AKB48,zh,all-access,spider,AKB48_zh.wikipedia.org_all-access_spider,5.0,23.0,14.0,12.0,9.0,...,34.0,105.0,72.0,36.0,33.0,30.0,36.0,38.0,31.0,97.0
9,ASCII,zh,all-access,spider,ASCII_zh.wikipedia.org_all-access_spider,6.0,3.0,5.0,12.0,6.0,...,25.0,17.0,22.0,29.0,30.0,29.0,35.0,44.0,26.0,41.0


In [11]:
# This is to demo the median model
#print(train.iloc[[0]])
#print(train)

avg_score = 0

for i in range(0,len(X_train)):
    df_train = extract_series(X_train, i, 5)
    df_actual = extract_series(y_train, i, 5)
    lang = X_train.iloc[i,1]
    score = median_model(df_train.copy(), df_actual.copy(), 15, review=False)
    avg_score = avg_score + (score*100)
    break
    #print("The SMAPE score is : %.5f" % score)
#print(avg_score)


ds
2015-07-01     NaN
2015-07-02     NaN
2015-07-03     NaN
2015-07-04     NaN
2015-07-05     NaN
2015-07-06     NaN
2015-07-07     NaN
2015-07-08     NaN
2015-07-09     NaN
2015-07-10     NaN
2015-07-11     NaN
2015-07-12     NaN
2015-07-13     NaN
2015-07-14     NaN
2015-07-15    14.0
2015-07-16    14.0
2015-07-17    14.0
2015-07-18    14.0
2015-07-19    14.0
2015-07-20    14.0
2015-07-21    14.0
2015-07-22    14.0
2015-07-23    14.0
2015-07-24    14.0
2015-07-25    14.0
2015-07-26    14.0
2015-07-27    14.0
2015-07-28    14.0
2015-07-29    14.0
2015-07-30    15.0
              ... 
2016-12-02    19.0
2016-12-03    19.0
2016-12-04    19.0
2016-12-05    19.0
2016-12-06    19.0
2016-12-07    19.0
2016-12-08    19.0
2016-12-09    19.0
2016-12-10    19.0
2016-12-11    19.0
2016-12-12    19.0
2016-12-13    19.0
2016-12-14    19.0
2016-12-15    19.0
2016-12-16    19.0
2016-12-17    19.0
2016-12-18    19.0
2016-12-19    19.0
2016-12-20    19.0
2016-12-21    19.0
2016-12-22    19.0
2016-12-2

  
  import sys


In [12]:
avg_score0 = 43.507159185597586
avg_score1 = 45.48352382720768
avg_score2 = 45.371427285007306
avg_score3 = 42.86936386440958
avg_score4 = 47.17123850805835

In [13]:
avg_score2 = 0

for i in range(0,100):
    df_train = extract_series(X_train, i, 5)
    df_actual = extract_series(y_train, i, 5)
    lang = X_train.iloc[i,1]
    score = median_model(df_train.copy(), df_actual.copy(), 15, review=False)
    avg_score2 = avg_score2 + (score*100)
    #print("The SMAPE score is : %.5f" % score)
print(avg_score2)

  
  import sys


ds
2015-07-01     NaN
2015-07-02     NaN
2015-07-03     NaN
2015-07-04     NaN
2015-07-05     NaN
2015-07-06     NaN
2015-07-07     NaN
2015-07-08     NaN
2015-07-09     NaN
2015-07-10     NaN
2015-07-11     NaN
2015-07-12     NaN
2015-07-13     NaN
2015-07-14     NaN
2015-07-15    14.0
2015-07-16    14.0
2015-07-17    14.0
2015-07-18    14.0
2015-07-19    14.0
2015-07-20    14.0
2015-07-21    14.0
2015-07-22    14.0
2015-07-23    14.0
2015-07-24    14.0
2015-07-25    14.0
2015-07-26    14.0
2015-07-27    14.0
2015-07-28    14.0
2015-07-29    14.0
2015-07-30    15.0
              ... 
2016-12-02    19.0
2016-12-03    19.0
2016-12-04    19.0
2016-12-05    19.0
2016-12-06    19.0
2016-12-07    19.0
2016-12-08    19.0
2016-12-09    19.0
2016-12-10    19.0
2016-12-11    19.0
2016-12-12    19.0
2016-12-13    19.0
2016-12-14    19.0
2016-12-15    19.0
2016-12-16    19.0
2016-12-17    19.0
2016-12-18    19.0
2016-12-19    19.0
2016-12-20    19.0
2016-12-21    19.0
2016-12-22    19.0
2016-12-2

In [14]:
avg_score0 = 43.507159185597586
avg_score1 = 45.48352382720768
avg_score2 = 45.371427285007306
avg_score3 = 42.86936386440958
avg_score4 = 47.17123850805835
avg_score5= 46.817569252797466
avg_score6= 43.38640660187318
avg_score7= 48.989538687348156
avg_score8= 44.09729253186841
avg_score9= 45.798844831342524
avg_score10= 41.68591328191039
avg_score11 = 48.92279502895371
avg_score12 = 46.00161492031962
avg_score13 = 49.55082915183694
avg_score14= 26.43975802234293
