In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

In [None]:
def load_trend(trend_name='football', country_code='us', data_dir='data'):
    """Load one trend CSV export into a single-column integer DataFrame.

    The file read is ``<data_dir>/google-trends_<trend_name>_<country_code>.csv``.

    Parameters
    ----------
    trend_name : str
        Topic part of the file name (e.g. ``'football'``).
    country_code : str
        Country part of the file name (e.g. ``'us'``).
    data_dir : str
        Directory that holds the CSV files. Defaults to ``'data'``.

    Returns
    -------
    pandas.DataFrame
        A frame with exactly one column, ``'values'``, of integer dtype.
        The index is whatever ``pd.read_csv`` produced for the file.

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    ValueError
        If the parsed file does not have exactly one column, or if a value
        (after mapping ``'<1'`` to ``'0'``) is not an integer literal.
    """
    csv_path = '{}/google-trends_{}_{}.csv'.format(
        data_dir, trend_name, country_code)
    parsed = pd.read_csv(csv_path)

    # The first parsed row is dropped (presumably the export's own column
    # captions -- confirm against the files). .copy() detaches the result from
    # the parsed frame so the assignments below never write into a slice.
    df = parsed.iloc[1:, :].copy()
    df.columns = ['values']

    # '<1' is mapped to 0. regex=False pins a literal replacement regardless
    # of the pandas version's default for `regex`.
    df['values'] = (df['values']
                    .str.replace('<1', '0', regex=False)
                    .astype(int))
    return df

In [None]:
# Demonstrates dict unpacking: `**` passes every key/value pair of the dict
# to load_trend as a keyword argument.
df = load_trend(**dict(trend_name='data-science', country_code='us'))

In [None]:
# Keyword-argument dicts for load_trend, one per (topic, country) pair.
trends = [
    dict(trend_name=topic, country_code=country)
    for topic, country in (
        ('data-science', 'us'),
        ('football', 'us'),
        ('football', 'uk'),
        ('game-of-thrones', 'us'),
        ('pokemon', 'us'),
        ('taxes', 'us'),
    )
]

In [None]:
# Shuffle the load/plot order in place. A dedicated, seeded generator gives the
# same order on every Restart & Run All and leaves NumPy's global RNG state alone.
# Note: re-running only this cell shuffles the already-shuffled list again.
np.random.RandomState(42).shuffle(trends)

In [None]:
# One DataFrame per spec, in the current order of `trends`; each spec dict is
# unpacked into load_trend's keyword arguments.
trend_dfs = list(map(lambda spec: load_trend(**spec), trends))

In [None]:
# First frame in the current order of `trend_dfs`. The subplot loop further
# down reuses the name `trend_df` as its loop variable, so this binding is
# overwritten once that cell runs.
trend_df = trend_dfs[0]

In [None]:
# pyplot exposes the same style registry as matplotlib.style, so no extra
# import is needed in the middle of the notebook.
plt.style.use('ggplot')

# One panel per loaded trend, stacked vertically. squeeze=False keeps `axs`
# two-dimensional even when there is only a single trend, so the indexing
# below never hits a bare Axes object.
fig, axs = plt.subplots(len(trend_dfs), 1, figsize=(8, 10), squeeze=False)
for i, trend_df in enumerate(trend_dfs):
    ax = axs[i, 0]
    # Panels are left untitled; uncomment to label each one with its spec.
    #ax.set_title(str(trends[i]))
    ax.plot(trend_df.index, trend_df['values'])
    ax.set_ylim((0, 100))
    # Keep only the auto-generated tick positions that are multiples of 24.
    # NOTE(review): this thins the ticks as intended only if the axis has a
    # tick at every row position (e.g. a string index plotted as categories)
    # -- confirm against the loaded index.
    ticks = ax.get_xticks()
    ax.set_xticks([tick for tick in ticks if tick % 24 == 0])

# Lay out after drawing so the final tick labels are taken into account.
fig.tight_layout()

In [None]:
# Display the specs in their current order. Entry k corresponds to panel k of
# the subplot grid above, provided `trends` has not been reshuffled since
# `trend_dfs` was built from it.
trends

## Game of Thrones - Time Series

In [None]:
# Load the series, then attach a running row counter `i` and the 0-based
# position of each row within a 12-row cycle (`month`).
got_df = load_trend('game-of-thrones').assign(
    i=lambda frame: np.arange(len(frame)),
    month=lambda frame: frame['i'] % 12,
)
# Keep only rows with i > 83, i.e. drop the first 84 rows.
# NOTE(review): presumably the period before the show existed -- confirm.
got_df = got_df[got_df['i'] > 83]

In [None]:
# Make `month` 1-based (1..12). It is derived from `i` rather than incremented
# in place, so re-running this cell cannot push the values to 2..13.
got_df['month'] = got_df['i'] % 12 + 1

In [None]:
# Baseline: linear regression of the values on the row counter `i` alone.
# fit() returns the estimator itself, so construction and fitting are chained.
trend_model = LinearRegression().fit(got_df[['i']], got_df['values'])
trend_line = trend_model.predict(got_df[['i']])

In [None]:
# One indicator column per distinct `month` value. fit_transform learns the
# categories and encodes in one call; .toarray() turns the result into a
# dense NumPy array.
month_encoder = OneHotEncoder(categories='auto')
month_data = month_encoder.fit_transform(got_df[['month']]).toarray()

In [None]:
# Design matrix: column 0 is the row counter `i`, followed by the month
# indicator columns.
data = np.concatenate((got_df[['i']].values, month_data), axis=1)
lr = LinearRegression().fit(data, got_df['values'])
lr_pred = lr.predict(data)  # in-sample predictions of the i + month model

In [None]:
# `trend_df` is the frame the following residuals cell reads as well.
trend_df = got_df

fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('Game of Thrones Trend')
ax.plot(trend_df['i'], trend_df['values'], label='Data',
        linewidth=.5, alpha=.8)
ax.plot(trend_df['i'], trend_line, label='Trend')
ax.plot(trend_df['i'], lr_pred, label='Regression', linestyle="dotted")
ax.legend()
ax.set_ylim((0, 100))
# One tick every 24 rows. The positions come from the data's own `i` values:
# the automatically chosen ticks on a numeric axis are rarely multiples of 24,
# so filtering those leaves the axis with (almost) no ticks.
ax.set_xticks([i for i in trend_df['i'] if i % 24 == 0])
plt.show()

In [None]:
# In-sample residuals: observed values minus the i + month regression.
residuals = trend_df['values'] - lr_pred

fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Residuals")
ax.plot(trend_df['i'], trend_df['values'], label='Data',
        linewidth=.5, alpha=.8)
ax.plot(trend_df['i'], lr_pred, label='Regression', linestyle="dotted")
ax.plot(trend_df['i'], residuals,
        label='Residuals', linewidth=.5)
ax.legend()
# The lower limit is below zero so negative residuals down to -10 stay visible.
ax.set_ylim((-10, 90))
# One tick every 24 rows, taken from the data's own `i` values: the
# automatically chosen ticks on a numeric axis are rarely multiples of 24.
ax.set_xticks([i for i in trend_df['i'] if i % 24 == 0])
plt.show()

## Data Science Trend - Time Series

In [None]:
# Load the series, then attach a running row counter `i` and the 0-based
# position of each row within a 12-row cycle (`month`).
datasci_df = load_trend('data-science').assign(
    i=lambda frame: np.arange(len(frame)),
    month=lambda frame: frame['i'] % 12,
)

In [None]:
# Make `month` 1-based (1..12). It is derived from `i` rather than incremented
# in place, so re-running this cell cannot push the values to 2..13.
datasci_df['month'] = datasci_df['i'] % 12 + 1

In [None]:
# Keep only rows with i > 107, i.e. drop the first 108 rows.
# NOTE(review): the reason for this cut-off is not stated -- confirm.
datasci_df = datasci_df[datasci_df['i'].gt(107)]
datasci_df

In [None]:
# Baseline: linear regression of the values on the row counter `i` alone.
# fit() returns the estimator itself, so construction and fitting are chained.
trend_model = LinearRegression().fit(datasci_df[['i']], datasci_df['values'])
trend_line = trend_model.predict(datasci_df[['i']])

In [None]:
# One indicator column per distinct `month` value. fit_transform learns the
# categories and encodes in one call; .toarray() turns the result into a
# dense NumPy array.
month_encoder = OneHotEncoder(categories='auto')
month_data = month_encoder.fit_transform(datasci_df[['month']]).toarray()

In [None]:
# Design matrix: column 0 is the row counter `i`, followed by the month
# indicator columns.
data = np.concatenate((datasci_df[['i']].values, month_data), axis=1)
lr = LinearRegression().fit(data, datasci_df['values'])
lr_pred = lr.predict(data)  # in-sample predictions of the i + month model

In [None]:
# `trend_df` is the frame the following residuals cell reads as well.
trend_df = datasci_df

fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('Data Science Trend')
ax.plot(trend_df['i'], trend_df['values'], label='Data',
        linewidth=.5, alpha=.8)
ax.plot(trend_df['i'], trend_line, label='Trend')
ax.plot(trend_df['i'], lr_pred, label='Regression', linestyle="dotted")
ax.legend()
ax.set_ylim((0, 100))
# One tick every 24 rows. The positions come from the data's own `i` values:
# the automatically chosen ticks on a numeric axis are rarely multiples of 24,
# so filtering those leaves the axis with (almost) no ticks.
ax.set_xticks([i for i in trend_df['i'] if i % 24 == 0])
plt.show()

In [None]:
# In-sample residuals: observed values minus the i + month regression.
residuals = trend_df['values'] - lr_pred

fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Residuals")
ax.plot(trend_df['i'], trend_df['values'], label='Data',
        linewidth=.5, alpha=.8)
ax.plot(trend_df['i'], lr_pred, label='Regression', linestyle="dotted")
ax.plot(trend_df['i'], residuals,
        label='Residuals', linewidth=.5)
ax.legend()
# The lower limit is below zero so negative residuals down to -10 stay visible.
ax.set_ylim((-10, 90))
# One tick every 24 rows, taken from the data's own `i` values: the
# automatically chosen ticks on a numeric axis are rarely multiples of 24.
ax.set_xticks([i for i in trend_df['i'] if i % 24 == 0])
plt.show()

## Taxes in US Trend - Time Series

In [None]:
# Load the series, then attach a running row counter `i` and the 0-based
# position of each row within a 12-row cycle (`month`).
taxes_df = load_trend('taxes').assign(
    i=lambda frame: np.arange(len(frame)),
    month=lambda frame: frame['i'] % 12,
)

In [None]:
# Make `month` 1-based, so that 1 (rather than 0) stands for January
# (assuming the series starts in January). It is recomputed from `i` instead
# of using `+= 1`, so re-running this cell cannot push the values to 2..13.
taxes_df['month'] = taxes_df['i'] % 12 + 1

In [None]:
# taxes_df

In [None]:
# Baseline: linear regression of the values on the row counter `i` alone.
# fit() returns the estimator itself, so construction and fitting are chained.
trend_model = LinearRegression().fit(taxes_df[['i']], taxes_df['values'])
trend_line = trend_model.predict(taxes_df[['i']])

In [None]:
# One indicator column per distinct `month` value. fit_transform learns the
# categories and encodes in one call; .toarray() turns the result into a
# dense NumPy array.
month_encoder = OneHotEncoder(categories='auto')
month_data = month_encoder.fit_transform(taxes_df[['month']]).toarray()

In [None]:
# Design matrix: column 0 is the row counter `i`, followed by the month
# indicator columns.
data = np.concatenate((taxes_df[['i']].values, month_data), axis=1)
lr = LinearRegression().fit(data, taxes_df['values'])
lr_pred = lr.predict(data)  # in-sample predictions of the i + month model

In [None]:
# `trend_df` is the frame the following residuals cell reads as well.
trend_df = taxes_df

fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('Taxes')
ax.plot(trend_df['i'], trend_df['values'], label='Data',
        linewidth=.5, alpha=.8)
ax.plot(trend_df['i'], trend_line, label='Trend')
ax.plot(trend_df['i'], lr_pred, label='Regression', linestyle="dotted")
ax.legend()
ax.set_ylim((0, 100))
# One tick every 24 rows. The positions come from the data's own `i` values:
# the automatically chosen ticks on a numeric axis are rarely multiples of 24,
# so filtering those leaves the axis with (almost) no ticks.
ax.set_xticks([i for i in trend_df['i'] if i % 24 == 0])
plt.show()

In [None]:
# In-sample residuals: observed values minus the i + month regression.
residuals = trend_df['values'] - lr_pred

fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Residuals")
ax.plot(trend_df['i'], trend_df['values'], label='Data',
        linewidth=.5, alpha=.8)
ax.plot(trend_df['i'], lr_pred, label='Regression', linestyle="dotted")
ax.plot(trend_df['i'], residuals,
        label='Residuals', linewidth=.5)
ax.legend()
# The lower limit is below zero so negative residuals down to -10 stay visible.
ax.set_ylim((-10, 90))
# One tick every 24 rows, taken from the data's own `i` values: the
# automatically chosen ticks on a numeric axis are rarely multiples of 24.
ax.set_xticks([i for i in trend_df['i'] if i % 24 == 0])
plt.show()

## Create Functions for working with time series model

In [None]:
def fit_trend_model(trend_df):
    """Fit a linear regression of ``trend_df['values']`` on trend + month.

    The features are built from the row positions only: a running counter
    ``i = 0 .. len(trend_df) - 1`` and a one-hot encoding of ``i % 12``.
    ``trend_df`` itself is not modified.

    Parameters
    ----------
    trend_df : pandas.DataFrame
        Frame with a ``'values'`` column, one row per time step.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted model. It expects inputs with the same layout as the
        training matrix: column 0 is ``i``, followed by the month indicators.
    """
    # Row counter and position within each 12-row cycle, as column vectors.
    i = np.arange(len(trend_df)).reshape(-1, 1)
    month = i % 12

    # One indicator column per distinct month value present in the data.
    month_data = OneHotEncoder(categories='auto').fit_transform(month).toarray()

    data = np.hstack((i, month_data))
    lr = LinearRegression()
    lr.fit(data, trend_df['values'])
    return lr

def get_prediction_for_trend(topic, country):
    """Return the in-sample regression predictions for one trend series.

    Loads the series with ``load_trend(topic, country)``, fits the trend +
    month model with ``fit_trend_model`` and predicts on that same series.

    Parameters
    ----------
    topic : str
        Trend name passed to ``load_trend``.
    country : str
        Country code passed to ``load_trend``.

    Returns
    -------
    numpy.ndarray
        One prediction per row of the loaded series.
    """
    trend_df = load_trend(topic, country)
    lr = fit_trend_model(trend_df)

    # Build the design matrix for THIS series instead of reading a
    # notebook-level `data` variable left behind by an earlier cell. The layout
    # mirrors what fit_trend_model trains on: column 0 is the row counter,
    # followed by the one-hot encoding of (row counter % 12).
    i = np.arange(len(trend_df)).reshape(-1, 1)
    month_data = OneHotEncoder(categories='auto').fit_transform(i % 12).toarray()
    data = np.hstack((i, month_data))

    lr_pred = lr.predict(data)  # predictions based on i and month
    return lr_pred
#lr_pred = get_prediction_for_trend('football', 'uk')

def plot_predictions(topic, country):
    """Plot one trend series together with its regression predictions.

    Parameters
    ----------
    topic : str
        Trend name passed to ``load_trend`` and ``get_prediction_for_trend``;
        also used (title-cased) in the figure title.
    country : str
        Country code passed to the same functions; shown upper-cased in the
        title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    trend_df = load_trend(topic, country)
    lr_pred = get_prediction_for_trend(topic, country)

    # Row counter used as the x coordinate. It is kept as a local array, so
    # the loaded frame needs no helper columns.
    i = np.arange(len(trend_df))

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.set_title(topic.title() + ' ' + country.upper() + ' ' + 'Trend')
    ax.plot(i, trend_df['values'], label='Data',
            linewidth=.5, alpha=.8)
    ax.plot(i, lr_pred, label='Regression', linestyle="dotted")
    ax.legend()
    ax.set_ylim((0, 100))
    # One tick every 24 rows (0, 24, 48, ...). The automatically chosen ticks
    # on a numeric axis are rarely multiples of 24, so filtering them would
    # leave the axis with (almost) no ticks.
    ax.set_xticks(i[::24])
    plt.show()

# Data vs. regression for the UK football series.
plot_predictions(topic='football', country='uk')

In [None]:
# Data vs. regression for the US football series.
plot_predictions(topic='football', country='us')

## Conclusion

Traditional time series forecasting predates data science as a field. When trying to predict something new (for example, the sales of a new product), it is important to look at the different features/variables that might impact the market and the sales for that particular product segment, and then use them to predict the likely sales of the new product.