# Time Analysis of the Website's Attractiveness to the Different Classes of Raters

In [1]:
import pandas as pd
import numpy as np

import ingestion as ing

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

import plotly.graph_objects as go

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Loading the data

In [2]:
# Build the path of the "users_with_scores" parquet file under the refined
# data folder, then open it in "lazy" mode (it is materialised later on with
# `.compute()`).
users_parquet_path = ing.build_path(
    folderind="ba",
    filename="users_with_scores",
    ext=".parquet",
    basepath=ing.REFINED_PATH,
)
users_ddf = ing.read_parquet(
    path=users_parquet_path,
    keepcols=None,
    assume_missing=False,
    mode="lazy",
)

In [3]:
# compute dataframe: materialise the lazily-read users table in memory,
# then display its column names to check what is available downstream.
users_df = users_ddf.compute()
users_df.columns

Index(['n_ratings', 'n_reviews', 'uid', 'username', 'joined', 'location',
       'country', 'cfm_score', 'exp_score', 'xpl_score', 'adv_score', 'is_cfm',
       'is_exp', 'is_xpl', 'is_adv'],
      dtype='object')

In [4]:
# Add a "classified" column: 1 when the is_* class indicators of a user sum to
# a non-zero value, 0 otherwise. It makes classified / unclassified users easy
# to select later on.
class_indicators = ["is_cfm", "is_exp", "is_xpl", "is_adv"]
indicator_total = users_df[class_indicators].sum(axis=1)
users_df = users_df.assign(classified=indicator_total.ne(0).astype(int))
users_df.columns

Index(['n_ratings', 'n_reviews', 'uid', 'username', 'joined', 'location',
       'country', 'cfm_score', 'exp_score', 'xpl_score', 'adv_score', 'is_cfm',
       'is_exp', 'is_xpl', 'is_adv', 'classified'],
      dtype='object')

In [5]:
# sanity check, should be TRUE: the unclassified users plus the users whose
# class indicators sum to a positive value must account for every row.
n_unclassified = len(users_df.query("classified == 0"))
n_in_some_class = len(users_df.query("+".join(class_indicators) + " > 0"))
# Compare against the in-memory frame. `users_df` was computed from `users_ddf`,
# so the lengths are identical, and calling len() on the lazy frame would
# presumably trigger a second computation of the parquet-backed data.
n_unclassified + n_in_some_class == len(users_df)

True

## Time Analysis

In [6]:
# Group every user (classified or not) by the year in which they joined.
join_year = users_df["joined"].dt.year
by_year = users_df.groupby(by=join_year)

In [7]:
# Conformists: rows flagged `is_cfm == 1`, grouped by joining year, with the
# per-year group sizes stored in a one-column ("size") frame.
cfms = users_df.loc[users_df["is_cfm"] == 1]
cfms_by_year = cfms.groupby(by=cfms["joined"].dt.year)
n_cfms_by_year = cfms_by_year.size().to_frame(name="size")

In [8]:
# Expert-like users: rows flagged `is_exp == 1`, grouped by joining year, with
# the per-year group sizes stored in a one-column ("size") frame.
exps = users_df.loc[users_df["is_exp"] == 1]
exps_by_year = exps.groupby(by=exps["joined"].dt.year)
n_exps_by_year = exps_by_year.size().to_frame(name="size")

In [9]:
# Adventurers: rows flagged `is_adv == 1`, grouped by joining year, with the
# per-year group sizes stored in a one-column ("size") frame.
advs = users_df.loc[users_df["is_adv"] == 1]
advs_by_year = advs.groupby(by=advs["joined"].dt.year)
n_advs_by_year = advs_by_year.size().to_frame(name="size")

In [10]:
# Explorers: rows flagged `is_xpl == 1`, grouped by joining year, with the
# per-year group sizes stored in a one-column ("size") frame.
xpls = users_df.loc[users_df["is_xpl"] == 1]
xpls_by_year = xpls.groupby(by=xpls["joined"].dt.year)
n_xpls_by_year = xpls_by_year.size().to_frame(name="size")

In [11]:
# All classified users: rows with `classified == 1`, grouped by joining year,
# with the per-year group sizes stored in a one-column ("size") frame.
classified_users = users_df.loc[users_df["classified"] == 1]
classified_by_year = classified_users.groupby(by=classified_users["joined"].dt.year)
n_classified_by_year = classified_by_year.size().to_frame(name="size")

In [12]:
# Per-category lookups. Both dictionaries use the same category labels as the
# CMAP colour table ("CFM", "XPL", "EXP", "ADV"), so a label can index any of
# them.

# Category label -> users of that category grouped by joining year.
gpby_year = {
    "CFM": cfms_by_year,
    "XPL": xpls_by_year,
    "EXP": exps_by_year,  # was "EXPL", which matched neither n_by_year nor CMAP
    "ADV": advs_by_year
}
# Category label -> one-column ("size") frame of yearly new-user counts.
n_by_year = {
    "CFM": n_cfms_by_year,
    "XPL": n_xpls_by_year,
    "ADV": n_advs_by_year,
    "EXP": n_exps_by_year
    }

In [13]:
# Colour (hex string) used for each user category in the figures below.
CMAP = dict(
    CFM="#4477AA",
    XPL="#228833",
    EXP="#CCBB44",
    ADV="#AA3377",
)

In [14]:
# Sorted array of the distinct years in which users joined.
distinct_join_years = users_df["joined"].dt.year.unique()
all_years = np.sort(distinct_join_years)
all_years

array([1996, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017])

### What User Categories Does the Website Attract Over the Years?

In [15]:
def fill_gap_years(df, years=None):
    """Return a copy of ``df`` with a zero-filled row for every missing year.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by year (unique labels), e.g. the one-column "size"
        frames of yearly counts built above.
    years : iterable of int, optional
        Years that must be present in the result. Defaults to the notebook
        global ``all_years``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` in their original order, followed by one row per
        year of ``years`` that was absent from ``df.index``. Every column of an
        added row holds 0. The input frame is left untouched and the result is
        not sorted; call ``.sort_index()`` if chronological order is needed.
    """
    target_years = all_years if years is None else years
    # dict.fromkeys drops duplicate years while keeping their order.
    missing = [year for year in dict.fromkeys(target_years) if year not in df.index]
    if not missing:
        return df.copy()
    # A single reindex (instead of one `.loc` assignment per year) adds all the
    # rows at once and keeps integer count columns as integers.
    full_index = df.index.append(pd.Index(missing, name=df.index.name))
    return df.reindex(full_index, fill_value=0)

In [33]:
# fig1: yearly number of new users per category.
# fig2: yearly share of each category among the new classified users.
fig1 = go.Figure()
fig2 = go.Figure()

for catname, counts in n_by_year.items():
    # Make sure every year is present (0 when nobody joined) and in
    # chronological order before drawing the line.
    counts = fill_gap_years(counts).sort_index()
    fig1.add_scatter(
        x=counts.index,
        y=counts["size"].values,
        name=catname,
        mode="lines",
        marker_color=CMAP[catname])
    # Series / Series division aligned on the year index. A year without any
    # classified user yields NaN, which shows up as a gap in the line.
    fraction = counts["size"] / n_classified_by_year["size"].reindex(counts.index)
    fig2.add_scatter(
        x=counts.index,
        y=fraction.values,
        name=catname,
        mode="lines",
        marker_color=CMAP[catname])

fig1.update_layout(
    title_text="Yearly Count of the Number of New Users By Category",
    yaxis_title="Number of new users",
    )

fig2.update_layout(
    title_text="Yearly Percentage Contribution of Each Category to the Number of New Classified Users",
    yaxis_title="Percentage",
    # The plotted values are fractions in [0, 1]; render the ticks as percentages.
    yaxis_tickformat=".0%",
    )

# Settings shared by both figures. The x axis is numeric and the traces are
# lines, so no category ordering or bar mode is needed.
for fig in (fig1, fig2):
    fig.update_layout(
        xaxis_title="Year",
        width=900, height=400,
        margin=dict(l=20, r=20, t=40, b=40),
        )
    # Highlight the 2013-2014 period.
    fig.add_vline(x=2013, line_width=1, line_dash="dash", line_color="navy")
    fig.add_vline(x=2014, line_width=1, line_dash="dash", line_color="navy")
    fig.add_vrect(x0=2013, x1=2014, line_width=1, line_dash="dash", fillcolor="navy", opacity=0.2)

fig1.show()
fig2.show()

### What Caused the Sudden Explosion in the Number of Users Between 2013 and 2014?

In [17]:
# Conformists grouped by (joining year, joining month), with the group sizes
# stored in a one-column ("size") frame.
cfm_joined = cfms["joined"].dt
cfms_by_month = cfms.groupby(by=[cfm_joined.year, cfm_joined.month])
n_cfms_by_month = cfms_by_month.size().to_frame(name="size")

In [18]:
# Expert-like users grouped by (joining year, joining month), with the group
# sizes stored in a one-column ("size") frame.
exp_joined = exps["joined"].dt
exps_by_month = exps.groupby(by=[exp_joined.year, exp_joined.month])
n_exps_by_month = exps_by_month.size().to_frame(name="size")

In [19]:
# Explorers grouped by (joining year, joining month), with the group sizes
# stored in a one-column ("size") frame.
xpl_joined = xpls["joined"].dt
xpls_by_month = xpls.groupby(by=[xpl_joined.year, xpl_joined.month])
n_xpls_by_month = xpls_by_month.size().to_frame(name="size")

In [20]:
# Adventurers grouped by (joining year, joining month), with the group sizes
# stored in a one-column ("size") frame.
adv_joined = advs["joined"].dt
advs_by_month = advs.groupby(by=[adv_joined.year, adv_joined.month])
n_advs_by_month = advs_by_month.size().to_frame(name="size")

In [21]:
# Category label -> one-column ("size") frame of new-user counts per
# (year, month). Traces are added to the monthly figure in this order.
n_by_month = dict(
    XPL=n_xpls_by_month,
    CFM=n_cfms_by_month,
    ADV=n_advs_by_month,
    EXP=n_exps_by_month,
)

In [31]:
# fig3: number of new users per category for each month of `target_year`.
fig3 = go.Figure()

target_year = 2014
months = list(range(1, 13))
for catname, counts in n_by_month.items():
    # The index has two levels: joining year (level 0) and joining month
    # (level 1). Selecting with a mask instead of `.loc[target_year]` avoids a
    # KeyError when a category has no new user that year.
    in_target_year = counts.index.get_level_values(0) == target_year
    monthly = (
        counts.loc[in_target_year, "size"]
        .droplevel(0)
        # Months in which nobody of this category joined count as 0.
        .reindex(months, fill_value=0)
    )
    fig3.add_scatter(
        x=monthly.index,
        y=monthly.values,
        name=catname,
        mode="lines",
        marker_color=CMAP[catname]
    )

# The traces hold raw monthly counts (the "size" column), so the title and the
# y axis describe counts rather than percentages.
fig3.update_layout(
    title_text=f"Monthly Count of the Number of New Users By Category in {target_year}",
    yaxis_title="Number of new users",
    xaxis_title="Month",

    # One tick per month (the x axis holds month numbers, not years).
    xaxis=dict(tickmode="array", tickvals=months),

    width=900, height=400,
    margin=dict(l=20, r=20, t=40, b=40),
    )

# Highlight the May-July period.
fig3.add_vline(x=5, line_width=1, line_dash="dash", line_color="navy")
fig3.add_vline(x=7, line_width=1, line_dash="dash", line_color="navy")
fig3.add_vrect(x0=5, x1=7, line_width=1, line_dash="dash", fillcolor="navy", opacity=0.2)

fig3.show()

In [30]:
#fig1.write_html("time_analysis_new_user_count.html")
#fig2.write_html("time_analysis_cat_perc_new_users.html")
#fig3.write_html("time_analysis_cat_perc_new_users_2014.html")