# Kaggle competition
# Web Traffic Time Series Forecasting
# https://www.kaggle.com/c/web-traffic-time-series-forecasting

In [None]:
import numpy
import pandas
import math
import re
import datetime as dt
import matplotlib.pyplot as plt

Import data

In [None]:
# Load the training set from the working directory, then replace every
# missing cell with 0 so the sums computed further down are NaN-free.
dataframe = pandas.read_csv('train_1.csv')
dataframe = dataframe.fillna(0)
# Last expression of the cell: shows the first rows as a quick sanity check.
dataframe.head()

# Plot the total number of views across all pages

In [None]:
# Sum the views of every page for each date column. The 'Page' column is
# dropped once and both the totals and the dates are read from the same
# summed Series, so the two are guaranteed to have the same length and the
# (large) frame is only copied a single time.
totals = dataframe.drop(['Page'], axis=1).sum()
dates = list(totals.index.values)

# One row per date, indexed by a DatetimeIndex so the x axis is a real
# time axis; 'tot' holds the total across all pages.
df = pandas.DataFrame({'tot': totals.values})
df = df.set_index(pandas.DatetimeIndex(dates))

df.plot(figsize=(16, 6))
plt.xlabel('Date')
plt.ylabel('# of visualizations')
plt.show()

# Study dataframe divided by languages

In [None]:
# Human-readable name for each two-letter language code. 'na' is the code
# find_language() falls back to when a page has no "xx.wikipedia.org" host;
# it is labelled 'Media' here.
labels = {
    'en': 'English',
    'ja': 'Japanese',
    'de': 'German',
    'na': 'Media',
    'fr': 'French',
    'zh': 'Chinese',
    'ru': 'Russian',
    'es': 'Spanish',
}

# Language codes in the order in which their columns are added and plotted.
languages = ['zh', 'en', 'ja', 'de', 'na', 'fr', 'es', 'ru']

# Matches the "<two lowercase letters>.wikipedia.org" host inside a page name.
# Both dots are escaped so they only match a literal '.', not any character;
# the capture group isolates the language code.
_WIKIPEDIA_HOST_RE = re.compile(r'([a-z]{2})\.wikipedia\.org')


def find_language(url):
    """Return the two-letter language code embedded in a page name or URL.

    Args:
        url: String expected to contain a host such as ``en.wikipedia.org``.

    Returns:
        The two lowercase letters immediately preceding ``.wikipedia.org``
        (first occurrence), or ``'na'`` when no such host is present.
    """
    match = _WIKIPEDIA_HOST_RE.search(url)
    if match is None:
        return 'na'
    return match.group(1)

# Tag every page with its language code. Iterating over the 'Page' column
# directly visits the rows in order without a per-row label lookup, so it
# does not depend on the frame having the default 0..n-1 index.
data_lang = [find_language(page) for page in dataframe['Page']]
dataframe = dataframe.assign(lang=data_lang)

# Add one column per language to df: the per-date sum of visualizations
# over all pages tagged with that language. The non-numeric 'Page' and
# 'lang' columns are dropped before summing so only date columns remain.
for language in languages:
    in_language = dataframe['lang'] == language
    df[labels[language]] = (
        dataframe.loc[in_language].drop(['Page', 'lang'], axis=1).sum().values
    )
df.head()

In [None]:
# Chart every column of df except the all-pages total ('tot').
ax = df.drop(columns=['tot']).plot(figsize=(16, 6))
ax.set_xlabel('Date')
ax.set_ylabel('# of visualisations')
plt.show()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# For each language, run an additive seasonal decomposition of its series
# and draw the last 20 points of the trend and seasonal components on one
# figure (one figure per language).
for language in languages:
    name = labels[language]
    result = seasonal_decompose(df[name], model='additive')

    recent_trend = result.trend[-20:]
    recent_seasonal = result.seasonal[-20:]

    recent_trend.plot(figsize=(16, 6), label='Trend %s' % name)
    recent_seasonal.plot(figsize=(16, 6), label='Seasonal %s' % name)
    plt.legend()
    plt.show()