all weeks cleaned function

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import pytrends
import os
import time
import requests

from pytrends.request import TrendReq
from datetime import timedelta, date
from datetime import datetime as dt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore

# Render Plotly figures with the 'iframe' renderer.
pio.renderers.default = 'iframe'
# Lift pandas' display limits so frames are shown without row/column truncation.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
# pytrends = TrendReq()
# NOTE(review): this rebinds the name `pytrends` (imported as a module above) to a
# TrendReq client instance; every later `pytrends.<method>` call goes to this client.
# The timeout tuple is presumably (connect, read) seconds - TODO confirm against pytrends.
pytrends = TrendReq(timeout=(10,25), retries=2, backoff_factor=0.1) #, hl='en-US'
# Shared scaler instance, used later to build the 'Final Score Scaled' column.
scaler = MinMaxScaler()

# Data Input

In [3]:
# Keywords sent to Google Trends. The first four are generic piracy terms that are
# combined into the 'Piracy Demand Index'; the last one is the title under examination.
cat = ['123movies','torrent','stream','online free','gangs of london season 2']

In [4]:
# Release date of the title as "DD-MM-YYYY" (range_date reads day, month, year in that order).
release_date = "20-10-2022"

# 1.0 Data retrieval

In [5]:
def download_data(cat,version,start_date,end_date):
    """
    Download Google Trends interest for the keywords in ``cat`` (geo 'GB').

    Parameters
    ----------
    cat : list of str
        Keywords to query.
    version : str
        "Hourly" calls ``pytrends.get_historical_interest`` from hour 0 of
        ``start_date`` to hour 23 of ``end_date``. Any other value calls
        ``build_payload`` + ``interest_over_time`` with the timeframe
        "<start_date> <end_date>".
    start_date, end_date : str
        Dates written as "year-month-day" (the format returned by ``range_date``).

    Returns
    -------
    pandas.DataFrame
        The interest data with the 'isPartial' column removed. An empty
        DataFrame is returned when the request times out, so the caller gets
        the timeout message instead of an unrelated concat error.
    """
    try:
        if version == "Hourly":
            # Split each date once instead of once per component.
            start_parts = start_date.split("-")
            end_parts = end_date.split("-")
            trends = pytrends.get_historical_interest(
                cat,
                year_start=int(start_parts[0]), month_start=int(start_parts[1]),
                day_start=int(start_parts[2]), hour_start=0,
                year_end=int(end_parts[0]), month_end=int(end_parts[1]),
                day_end=int(end_parts[2]), hour_end=23,
                cat=0, geo='GB', gprop='', sleep=0)
        else:
            pytrends.build_payload(kw_list=cat,cat=0,geo='GB',timeframe="{} {}".format(start_date,end_date))
            trends = pytrends.interest_over_time()
        # Pause after a successful request before the notebook issues another one.
        time.sleep(15)
    except requests.exceptions.Timeout:
        print("Timeout search: extend time.sleep")
        return pd.DataFrame()
    # errors='ignore': a response without the 'isPartial' column must not raise.
    return trends.drop(columns='isPartial', errors='ignore')

In [6]:
def range_date(release_date):
    """
    Build the observation window around a release date.

    Parameters
    ----------
    release_date : str
        Release date written as "DD-MM-YYYY".

    Returns
    -------
    tuple of (str, str)
        ``(start_date, end_date)`` where start is two days before the release
        and end is today. Both are formatted as year, unpadded month and
        zero-padded day joined by "-", e.g. "2023-1-05".
    """
    parts = release_date.split("-")
    release = dt(int(parts[2]), int(parts[1]), int(parts[0]))
    window_start = release - timedelta(days=2)
    window_end = dt.today()

    def _fmt(moment):
        # Same text the former strftime("%Y-%-m-%d") produced, but without the
        # "%-m" flag, which is a platform extension and not available everywhere.
        return f"{moment.year}-{moment.month}-{moment.day:02d}"

    return _fmt(window_start), _fmt(window_end)

In [7]:
# Observation window: from two days before the release up to today.
start_date, end_date = range_date(release_date)

In [8]:
# Network call: fetches hourly interest for all keywords, then sleeps 15 s on success.
df_raw = download_data(cat,'Hourly',start_date,end_date) 

# 2.0 Data preparation

In [9]:
# Work on a copy so the downloaded frame stays untouched and later cells can be re-run from it.
df = df_raw.copy()

# 3.0 Feature Engineering

## 3.1 Piracy Demand Index

The weights are indicative and were retrieved from insights published by [MUSO](https://www.muso.com/magazine/piracy-data-overview-january-2022-to-august-2022).
TODO: change the weights to reflect the fact that searches for "stream" are higher on weekends due to football.

In [11]:
# One weight per generic piracy keyword (they sum to 1). They are applied positionally to
# the frame's columns once the title column is dropped - presumably in the order
# 123movies, torrent, stream, online free; TODO confirm the downloaded column order.
weights = [0.15,0.25,0.25,0.35]

In [12]:
# Weighted sum of the generic piracy keywords (every downloaded column except the title).
# The columns are taken from df_raw so the selection is unaffected by columns added to
# df later on, which keeps this cell safe to re-run; the dot product is vectorised
# instead of being evaluated row by row.
piracy_keywords = [col for col in df_raw.columns if col != 'gangs of london season 2']
df['Piracy Demand Index'] = df[piracy_keywords].dot(weights)

## 3.2 Z-Score

In [13]:
# Standardise the piracy index and the title's own interest over the whole window.
piracy_z = zscore(df['Piracy Demand Index'])
gangs_z = zscore(df['gangs of london season 2'])
df['Piracy Score'] = piracy_z
df['Gangs Score'] = gangs_z
# Blend the two scores: 30% piracy demand, 70% title demand.
df['Final Score'] = 0.3*df['Piracy Score'] + 0.7*df['Gangs Score']
# Min-max scaled copy of the final score (used as marker size in the scatter plots).
final_as_column = df[['Final Score']].to_numpy()
df['Final Score Scaled'] = scaler.fit_transform(final_as_column)

## 3.3 Extract date time info

In [14]:
# Split the datetime index into a calendar-day column and an hour-of-day column.
timestamps = df.index
df['Day'] = timestamps.date
df['Hour'] = timestamps.hour

In [15]:
# Bucket each hour into one of four day parts:
#   Morning 06-12, Afternoon 13-17, Evening 18-21, Night 22-05.
# Hours 22 and 23 used to fall through every mask and were left labelled 'Morning';
# they now land in 'Night' together with 00-05.
hour_of_day = df['Hour']
df['Hour Range'] = np.select(
    [hour_of_day.between(6, 12), hour_of_day.between(13, 17), hour_of_day.between(18, 21)],
    ['Morning', 'Afternoon', 'Evening'],
    default='Night')

In [16]:
# Move the datetime index into a regular column (later cells read it as 'date' -
# presumably the index name set by pytrends; TODO confirm).
# NOTE(review): not idempotent - running this cell twice adds another index column and
# shifts the positional slice used by principal_component_analysis.
df = df.reset_index()

# 4.0 Modeling
## 4.1 Bar Chart

In [17]:
# Daily mean of each score. The columns are selected with a list: selecting them with a
# bare tuple of keys is deprecated (see the warning this cell used to emit).
gr = df.groupby('Day')[['Piracy Score','Gangs Score','Final Score']].mean()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [18]:
# Grouped bars: one group per day, one bar per score column of `gr`.
bar_title = "When did consumer demand increase (or decrease) during the last week?<br><sup>Final Score: weighted average of Piracy and Gangs Score - Score related to the week under examination - Weekly mean centered at Zero</sup>"
fig = px.bar(gr, x=gr.index, y=gr.columns, title=bar_title)
# Centre the title, label the axes and place the bars side by side in a single layout update.
fig.update_layout(
    title_x=0.5,
    xaxis_title="Date",
    yaxis_title="Score",
    barmode='group',
)
fig.show()

## 4.2 Multicriteria 
### 4.2.1 Multicriteria Day by Day

In [19]:
# String version of the day, used as the animation frame / legend key in the scatter plots.
df['date_str'] = df['Day'].map(str)

In [20]:
# Axis limits: the observed score range padded by 0.5 on each side.
piracy_lo = df['Piracy Score'].min()-0.5
piracy_hi = df['Piracy Score'].max()+0.5
gangs_lo = df['Gangs Score'].min()-0.5
gangs_hi = df['Gangs Score'].max()+0.5

# One frame per day; every marker is one hour, coloured by day part and sized by the scaled final score.
fig = px.scatter(
    df,
    x='Piracy Score',
    y='Gangs Score',
    color='Hour Range',
    custom_data=['date'],
    size='Final Score Scaled',
    animation_frame='date_str',
    range_x=[piracy_lo, piracy_hi],
    range_y=[gangs_lo, gangs_hi],
    title="How did consumer demand behave during the selected time periods?<br><sup>Day by day animation - Every circle represents one hour of data gruped into four day parts (hour range).</sup>",
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
# Set the frame duration of the first animation button (the play control) to 1000 ms.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000

# Shade the positive half-planes red and the negative half-planes green.
for gangs_edge, piracy_edge, shade in ((gangs_hi, piracy_hi, "red"), (gangs_lo, piracy_lo, "green")):
    fig.add_hrect(y0=0, y1=gangs_edge, line_width=0, fillcolor=shade, opacity=0.1)
    fig.add_vrect(x0=0, x1=piracy_edge, line_width=0, fillcolor=shade, opacity=0.1)

# Zero lines split the plane into four quadrants.
fig.add_hline(y=0)
fig.add_vline(x=0)

# One caption per quadrant, attached to a rectangle spanning from x=0 to the axis edge.
quadrant_captions = (
    (piracy_hi, "Piracy increase, Gangs increase", "top right"),
    (piracy_lo, "Piracy decrease, Gangs increase", "top right"),
    (piracy_hi, "Piracy increase, Gangs decrease", "bottom right"),
    (piracy_lo, "Piracy decrease, Gangs decrease", "bottom right"),
)
for edge, caption, corner in quadrant_captions:
    fig.add_vrect(x0=0, x1=round(edge,5), annotation_text=caption, annotation_position=corner, annotation=dict(font_size=12))

# Outline the markers.
fig.update_traces(marker=dict(line=dict(width=2,color='DarkSlateGrey')),selector=dict(mode='markers'))
fig.update_layout(title_x=0.5)
fig.show()

### 4.2.2 Multicriteria All Days

In [22]:
# Axis limits: the observed score range padded by 0.5 on each side.
piracy_lo = df['Piracy Score'].min()-0.5
piracy_hi = df['Piracy Score'].max()+0.5
gangs_lo = df['Gangs Score'].min()-0.5
gangs_hi = df['Gangs Score'].max()+0.5

# All days in one figure; every marker is one hour, coloured by day and sized by the scaled final score.
fig = px.scatter(
    df,
    x='Piracy Score',
    y='Gangs Score',
    color='date_str',
    custom_data=['date'],
    size='Final Score Scaled',
    range_x=[piracy_lo, piracy_hi],
    range_y=[gangs_lo, gangs_hi],
    title="How did consumer demand behave during the selected time periods?<br><sup>Overall days - Every circle represents one hour of data - Legend interactive</sup>",
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# Shade the positive half-planes red and the negative half-planes green.
for gangs_edge, piracy_edge, shade in ((gangs_hi, piracy_hi, "red"), (gangs_lo, piracy_lo, "green")):
    fig.add_hrect(y0=0, y1=gangs_edge, line_width=0, fillcolor=shade, opacity=0.1)
    fig.add_vrect(x0=0, x1=piracy_edge, line_width=0, fillcolor=shade, opacity=0.1)

# Zero lines split the plane into four quadrants.
fig.add_hline(y=0)
fig.add_vline(x=0)

# One caption per quadrant, attached to a rectangle spanning from x=0 to the axis edge.
quadrant_captions = (
    (piracy_hi, "Piracy increase, Gangs increase", "top right"),
    (piracy_lo, "Piracy decrease, Gangs increase", "top right"),
    (piracy_hi, "Piracy increase, Gangs decrease", "bottom right"),
    (piracy_lo, "Piracy decrease, Gangs decrease", "bottom right"),
)
for edge, caption, corner in quadrant_captions:
    fig.add_vrect(x0=0, x1=round(edge,5), annotation_text=caption, annotation_position=corner, annotation=dict(font_size=10))

# Outline the markers.
fig.update_traces(marker=dict(line=dict(width=2,color='DarkSlateGrey')),selector=dict(mode='markers'))
fig.update_layout(title_x=0.5)
# Hover text: both scores plus the timestamp carried in custom_data.
hover_rows = (
    "Pirate Score: %{x}",
    "Gangs Score: %{y}",
    "Date & Hour: %{customdata[0]}",
)
fig.update_traces(hovertemplate="<br>".join(hover_rows))
fig.show()

## 4.3 PCA

Note: change name detection for PCA

SparsePCA and TruncatedSVD were also tested to validate the PCA; all of them give more or less the same results.

In [23]:
def principal_component_analysis(df, keywords=None):
    """
    Run a PCA on the keyword columns and plot each keyword's loading on the
    first principal component.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per keyword.
    keywords : list of str, optional
        Names of the columns to analyse. When omitted, the columns at
        positions 1 to 5 are used (the original behaviour, which assumes the
        timestamp sits in column 0 followed by the five keyword columns).

    Returns
    -------
    tuple
        ``(var1, sv1)``: explained-variance ratio and singular value of the
        first principal component.
    """
    # Selecting by name is safer than by position when columns were added or reordered.
    features = df.iloc[:,1:6] if keywords is None else df[list(keywords)]
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(features.values),
        index=features.index,
        columns=features.columns,
    )
    # A float n_components keeps as many components as needed to explain that share of variance.
    pca = PCA(n_components=0.9,random_state=1)
    pca.fit(scaled)
    var1 = pca.explained_variance_ratio_[0]
    sv1 = pca.singular_values_[0]
    # One row per keyword, one column per component. The keyword axis is named
    # explicitly rather than relying on an auto-generated 'level_0' column.
    comp = (
        pd.DataFrame(pca.components_, columns=scaled.columns).T
        .rename_axis('Keyword')
        .reset_index()
        .sort_values(by=0, ascending=False)
        .rename(columns={0: 'Score1'})
    )
    fig = px.bar(comp, x='Keyword', y='Score1', color='Keyword')
    fig.update_layout(
        title="Keywords Score Importance of the week",
        xaxis_title="Keywords searched",
        yaxis_title="Score",
        title_x=0.5)
    fig.show()
    return var1, sv1

In [24]:
# Shows the loading bar chart and keeps the first component's explained-variance
# ratio (var1) and singular value (sv1).
var1, sv1 = principal_component_analysis(df)

eigenvector?

In [27]:
# eig = pd.DataFrame(pca.transform(x3),index=df['date'])

In [28]:
# plt.figure(figsize=(10,10))
# plt.plot(eig[0])

## 4.4 CDS

In [29]:
def pds_static(df):
    """
    Print a demand score for the 'Piracy Demand Index' and
    'gangs of london season 2' columns of ``df``.

    For each of those columns found in ``df`` (in column order) the column
    name is printed, followed by a score that blends the column's mean (40%),
    standard deviation (20%) and 99th percentile (40%), rescaled against the
    fixed bounds 0 and 40 and multiplied by 100. Nothing is returned.
    """
    tracked = ('Piracy Demand Index', 'gangs of london season 2')
    upper, lower = 40, 0
    for column in df.columns:
        if column not in tracked:
            continue
        print(column)
        series = df[column]
        level = series.mean()           # the higher the mean, the higher the activity
        spread = series.std()           # the higher the std, the larger the movements
        extreme = series.quantile(0.99) # captures extreme values
        blended = 0.4*level + 0.2*spread + 0.4*extreme
        print(100*((1-0)*(blended-lower)/(upper-lower)+lower))

In [30]:
# Prints one score per tracked column; the function returns nothing.
pds_static(df)

gangs of london season 2
40.263460402972896
Piracy Demand Index
68.04785343562413
