## RuPaul's Drag Race: An Analysis Using Plotly Dash
**Created by: Duncan Wang**

Resources:
- https://dash.plotly.com/
- https://dash-bootstrap-components.opensource.faculty.ai/
- https://rupaulsdragrace.fandom.com/wiki/RuPaul%27s_Drag_Race_Wiki
- http://svmiller.com/blog/2019/02/dragracer-rupauls-drag-race-analysis/

In [None]:
#import dash, plotly, and dash for jupyter notebook 
import dash
import plotly.express as px
import plotly.graph_objs as go
from jupyter_dash import JupyterDash #for jupyter notebook

#import front end libraries 
import dash_core_components as dcc #core components for front end 
import dash_html_components as html #html components for front end
import dash_bootstrap_components as dbc #boostrap components for front end
from dash.dependencies import Input, Output

#other packages 
import pandas as pd
import numpy as np

#for clustering
from sklearn.cluster import KMeans

### Data Cleaning & Preprocessing

In [None]:
#load the three source tables: contestants, contestant performance, episodes
#Available from: https://cran.r-project.org/web/packages/dragracer/index.html
rpdr_c, rpdr_cp, rpdr_e = (
    pd.read_csv(csv_name)
    for csv_name in (
        'rpdr_contestants.csv',
        'rpdr_contestant_performance.csv',
        'rpdr_episodes.csv',
    )
)

In [None]:
#DATA PREPROCESSING: ALL

#episodes where no ranked competition occurred are excluded (i.e. the reunion)
to_remove_11 = rpdr_cp[(rpdr_cp['season'] == 'S11') & (rpdr_cp['episode'] == 13)]
to_remove_12 = rpdr_cp[(rpdr_cp['season'] == 'S12') & (rpdr_cp['episode'] == 13)]
rpdr_cp.drop(to_remove_11.index, axis = 0, inplace = True)
rpdr_cp.drop(to_remove_12.index, axis = 0, inplace = True)

#regroup episode rankings
#making it to the finale is considered a win
#OUT = not in episode/or eliminated both included
outcome_groups = {
    'LOST1ST ROUND': 'OUT',
    'LOST2ND ROUND': 'OUT',
    'LOST3RD ROUND': 'OUT',
    'MISSCON': 'OUT',
    'DISQ': 'OUT',
    'RTRN': 'OUT',
    'SAFE+DEPT': 'OUT',
    'BTM': 'BOTTOM',
    'TOP2': 'WIN',
    'WIN+RTRN': 'WIN',
}
#build the regrouped column in a single assignment: replace(..., inplace=True)
#called on rpdr_cp['outcome2'] works on a temporary Series under pandas
#copy-on-write and would leave the frame unchanged. Missing outcomes are
#filled with fillna, which also avoids np.NaN (an alias removed in NumPy 2.0).
rpdr_cp['outcome2'] = rpdr_cp['outcome'].replace(outcome_groups).fillna('OUT')

In [None]:
#DATA PREPROCESSING: WINNERS

#filter to winners only
#.copy() gives an independent frame, so the drop and the new columns below
#do not operate on a slice of rpdr_cp
winners = rpdr_cp[rpdr_cp['rank'] == 1].copy()
#remove index 1706, 676 for seasons where winner did not compete in first episode
#NOTE(review): these are row labels of rpdr_cp as loaded from the csv; they
#must be re-checked if the source file changes
winners = winners.drop([676, 1706], axis = 0)

#scale outcomes
#outcome_count is the number of 'participant' entries per winner; dividing by
#its per-winner sum gives every row of a winner the same share, so the shares
#of one winner add up to 1
winners['outcome_count'] = winners.groupby('contestant')['participant'].transform('count')
winners['outcome_count_scaled'] = winners['outcome_count']/winners.groupby('contestant')['outcome_count'].transform('sum')

#DATA PREPROCESSING: MISS CONGENIALITY

#rows of contestants flagged missc == 1; .copy() so the columns added below
#go onto an independent frame rather than a slice of rpdr_cp
missc = rpdr_cp[rpdr_cp['missc'] == 1].copy()
#same scaling as for the winners: every row of a contestant gets an equal
#share and the shares of one contestant add up to 1
missc['outcome_count'] = missc.groupby('contestant')['participant'].transform('count')
missc['outcome_count_scaled'] = missc['outcome_count']/missc.groupby('contestant')['outcome_count'].transform('sum')

#DATA PREPROCESSING: DUSTED OR BUSTED
#non-finale rows only; .copy() so the columns added below go onto an
#independent frame rather than a slice of rpdr_cp
db = rpdr_cp[rpdr_cp['finale'] == 0].copy()

#points per episode outcome; a missing outcome scores 0 like 'OUT'
db_points = {
    'OUT': 0,
    'BTM': 0,
    'LOW': 2.5,
    'SAFE': 5,
    'SAFE+DEPT': 5,
    'HIGH': 7.5,
    'TOP2': 7.5,
    'WIN': 10,
    'WIN+RTRN': 10,
}
#one assignment instead of repeated replace(..., inplace=True) on db['db_factor']:
#the in-place form acts on a temporary Series under pandas copy-on-write.
#fillna replaces the former np.NaN entry (np.NaN was removed in NumPy 2.0) and
#infer_objects turns the column numeric when every outcome was mapped.
db['db_factor'] = db['outcome'].replace(db_points).fillna(0).infer_objects()

#score per contestant = mean of the episode points
points_by_contestant = db.groupby('contestant')['db_factor']
db['db_sum'] = points_by_contestant.transform('sum')/points_by_contestant.transform('count')

#keep one row per contestant (db is merged into the cluster table later)
db = db.drop_duplicates(subset=['contestant'], keep = 'first')
#top 20 scores; .copy() so relabelling 'rank' for the chart leaves db untouched
db_plot = db.sort_values(by = 'db_sum', ascending = False).head(20).copy()

#ranks 1-3 become the two legend groups; any other rank keeps its value
db_plot['rank'] = db_plot['rank'].replace({1: 'WINNER', 2: 'OTHER', 3: 'OTHER'})

#DATA PREPROCESSING: CLUSTERING
#cluster by: age, season, rank, dusted and busted score
#start from contestant/age, add the season number (the 'S' prefix stripped),
#join rank and score from db on the contestant name, rename both for display
#and keep complete rows only
cluster = (
    rpdr_c[['contestant', 'age']]
    .assign(season = rpdr_c['season'].map(lambda label: label.lstrip('S')).astype('int'))
    .merge(db[['rank', 'db_sum', 'contestant']], on = 'contestant', how = 'inner')
    .rename(columns = {'db_sum': 'dusted and busted score', 'rank': 'place'})
    .dropna(axis = 0, how = 'any')
)

### Part 1: Dash App of Contestant Rankings

In [None]:
#start with a plain bar graph for a single season

import plotly.express as px
import pandas as pd

#season 5 rows only
s5 = rpdr_cp[rpdr_cp['season'] == 'S05']

#bar chart: one bar per contestant, coloured by the regrouped episode outcome
s5_bar_args = {
    'x': 'contestant',
    'color': 'outcome2',
    'labels': {
        'count': 'aggregate performance',
        'contestant': 'contestant (in order of season rank)',
        'outcome2': 'episode outcome',
    },
    'color_discrete_sequence': ['#636EFA', '#FF6692', '#00CC96', '#FFA15A', '#FECB52', 'lightgrey'],
    'category_orders': {"outcome2": ["WIN", "HIGH", "SAFE", "LOW", "BOTTOM", "OUT"]},
}
fig1 = px.bar(s5, **s5_bar_args)

#x axis follows the order in which contestants first appear in s5
fig1.update_layout(
    legend_traceorder = 'normal',
    xaxis = dict(categoryorder = 'array', categoryarray = s5['contestant'].unique()),
)
fig1.show()

In [182]:
#wrap the season bar graph in a small dash application

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

#page = title, season picker, and the (initially empty) graph that the
#update_graph callback fills in
app.layout = html.Div(children = [
    html.H6(
        'Comparison of Contestant Performance by Season',
        style = dict(textAlign = 'center', color = '#636EFA')),
    dcc.Dropdown(
        id = 'season_dropdown',
        options = [dict(label = season, value = season) for season in rpdr_cp['season'].unique()],
        value = 'S07',
        clearable = False,
        style = dict(width = '75%', margin = 'auto')
    ),
    dcc.Graph(id = 'rpdr_graph', figure = {})
])
@app.callback(
    Output('rpdr_graph', 'figure'), #figure of the graph component to redraw
    [Input('season_dropdown', 'value')]) #season currently selected in the dropdown
def update_graph(season):
    """Return the outcome bar chart for the season chosen in the dropdown.

    season is compared, as a string, against rpdr_cp['season']; the figure
    has one bar per contestant, coloured by the regrouped outcome column.
    """
    season_rows = rpdr_cp[rpdr_cp['season'] == str(season)]
    outcome_bars = px.bar(
        season_rows,
        x = 'contestant',
        color = 'outcome2',
        labels = {
            'count': 'aggregate performance',
            'contestant': 'contestant (in order of season rank)',
            'outcome2': 'episode outcome',
        },
        color_discrete_sequence = ['#636EFA', '#FF6692', '#00CC96', '#FFA15A', '#FECB52', 'lightgrey'],
        category_orders = {"outcome2": ["WIN", "HIGH", "SAFE", "LOW", "BOTTOM", "OUT"]},
    )

    #x axis follows the order in which contestants first appear in the data
    contestant_order = season_rows['contestant'].unique()
    outcome_bars.update_layout(
        legend_traceorder = 'normal',
        xaxis = {'categoryorder': 'array', 'categoryarray': contestant_order},
    )
    return outcome_bars

#if __name__ == '__main__':
    #app.run_server(mode = 'inline', port = 3306)

### Parts 2-4
- Winners 
- Miss Congenialities
- Dusted or Busted

In [None]:
#WINNERS
#stacked bar per winner; bar height comes from the precomputed
#outcome_count_scaled column, colour from the regrouped outcome
winners_bar_args = {
    'x': 'contestant',
    'y': 'outcome_count_scaled',
    'color': 'outcome2',
    'labels': {
        'outcome_count_scaled': 'relative performance',
        'contestant': 'winner (in order of season)',
        'outcome2': 'episode outcome',
    },
    'color_discrete_sequence': ['#636EFA', '#FF6692', '#00CC96', '#FFA15A', '#FECB52'],
    'category_orders': {"outcome2": ["WIN", "HIGH", "SAFE", "LOW", "BOTTOM"]},
}
fig2 = px.bar(winners, **winners_bar_args)

fig2.update_layout(legend_traceorder = 'normal')

In [None]:
#MISS CONGENIALITY
#same chart as for the winners, built from the missc table and including
#the OUT category
missc_bar_args = {
    'x': 'contestant',
    'y': 'outcome_count_scaled',
    'color': 'outcome2',
    'labels': {
        'outcome_count_scaled': 'relative performance',
        'contestant': 'miss congeniality (in order of season)',
        'outcome2': 'episode outcome',
    },
    'color_discrete_sequence': ['#636EFA', '#FF6692', '#00CC96', '#FFA15A', '#FECB52', 'lightgray'],
    'category_orders': {"outcome2": ["WIN", "HIGH", "SAFE", "LOW", "BOTTOM", "OUT"]},
}
fig3 = px.bar(missc, **missc_bar_args)

#x axis follows the order in which contestants first appear in missc
fig3.update_layout(
    legend_traceorder = 'normal',
    xaxis = dict(categoryorder = 'array', categoryarray = missc['contestant'].unique()),
)

In [None]:
#DUSTED OR BUSTED

#one bar per contestant in db_plot, height = db_sum, colour = relabelled rank
db_bar_args = {
    'x': 'contestant',
    'y': 'db_sum',
    'color': 'rank',
    'labels': {
        'db_sum': 'dusted or busted score',
        'contestant': 'contestant (in order of performance)',
        'rank': 'season placement',
    },
    'color_discrete_sequence': ['#636EFA', '#FF6692'],
}
fig4 = px.bar(db_plot, **db_bar_args)

#x axis follows the row order of db_plot
fig4.update_layout(
    legend_traceorder = 'normal',
    xaxis = dict(categoryorder = 'array', categoryarray = db_plot['contestant'].unique()),
)

### Part 5: Clustering

In [None]:
#CLUSTERING

app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])


def _variable_picker(label, component_id, default):
    """Return a labelled dropdown over every column of ``cluster`` after the first."""
    choices = [{"label": col, "value": col} for col in cluster.iloc[:, 1:].columns]
    return dbc.FormGroup(
        [
            dbc.Label(label),
            dcc.Dropdown(id=component_id, options=choices, value=default),
        ]
    )


# card with the three inputs read by the clustering callback
controls = dbc.Card(
    [
        _variable_picker("X variable", "x-variable", "age"),
        _variable_picker("Y variable", "y-variable", "place"),
        dbc.FormGroup(
            [
                dbc.Label("Number of clusters"),
                dbc.Input(id="cluster-num", type="number", value=3),
            ]
        ),
    ],
    body=True,
)

# controls on the left (4 of 12 columns), scatter plot on the right (8 of 12)
app.layout = dbc.Container(
    children=[
        html.H1("K-Means Clustering of Contestants"),
        html.Hr(),
        dbc.Row(
            children=[
                dbc.Col(controls, md=4),
                dbc.Col(dcc.Graph(id="cluster-graph"), md=8),
            ],
            align="center",
        ),
    ],
    fluid=True,
)


@app.callback(
    Output("cluster-graph", "figure"),
    [
        Input("x-variable", "value"),
        Input("y-variable", "value"),
        Input("cluster-num", "value"),
    ],
)
def make_graph(x, y, n_clusters):
    """Run k-means on two columns of ``cluster`` and plot the clusters.

    Args:
        x: column of ``cluster`` shown on the x axis (value of the x dropdown).
        y: column of ``cluster`` shown on the y axis (value of the y dropdown).
        n_clusters: requested number of clusters from the number input; it is
            None while the input is empty.

    Returns:
        A ``go.Figure`` with one marker trace per cluster plus one trace for
        the cluster centers.

    Raises:
        dash.exceptions.PreventUpdate: when a dropdown has no selection, so
            the figure already on screen is kept.
    """
    if x is None or y is None:
        raise dash.exceptions.PreventUpdate

    layout = {"xaxis": {"title": x}, "yaxis": {"title": y}}

    # the two columns are selected by name, which also works when x == y
    points = cluster[[x, y]].values
    if len(points) == 0:
        # nothing to cluster: return empty, labelled axes
        return go.Figure(layout=layout)

    # an empty or non-integer input falls back to one cluster; the count is
    # kept between 1 and the number of points, which k-means needs
    try:
        k = int(n_clusters)
    except (TypeError, ValueError):
        k = 1
    k = min(max(k, 1), len(points))

    km = KMeans(n_clusters=k)
    km.fit(points)
    labels = km.labels_
    centers = km.cluster_centers_

    # iterate over the clamped k so the traces match the clusters that were fitted
    data = [
        go.Scatter(
            x=cluster.loc[labels == c, x],
            y=cluster.loc[labels == c, y],
            mode="markers",
            marker={"size": 8},
            name="Cluster {}".format(c),
        )
        for c in range(k)
    ]

    data.append(
        go.Scatter(
            x=centers[:, 0],
            y=centers[:, 1],
            mode="markers",
            marker={"color": "#000", "size": 12, "symbol": "diamond"},
            name="Cluster centers"
        )
    )

    return go.Figure(data=data, layout=layout)


# make sure that x and y values can't be the same variable
def filter_options(v):
    """Return the dropdown options with the one equal to *v* disabled.

    Every column of ``cluster`` except 'contestant' is offered. That column
    holds the contestant names, is not among the options the dropdowns start
    with, and cannot be used as a clustering axis.
    """
    return [
        {"label": col, "value": col, "disabled": col == v}
        for col in cluster.columns
        if col != "contestant"
    ]


# both dropdowns share filter_options: the options of each one are recomputed
# from the value currently selected in the other
for _target_id, _source_id in (("x-variable", "y-variable"), ("y-variable", "x-variable")):
    app.callback(Output(_target_id, "options"), [Input(_source_id, "value")])(filter_options)



if __name__ == '__main__':
    app.run_server('localhost', port=3306)

### FINAL DASHBOARD
- Combine parts 1-5 into one Dash Application

In [None]:
#initialize dash app
#the pages are swapped into "page-content" by a callback, so the components
#that the graph and cluster callbacks refer to are not part of the initial
#layout; suppress_callback_exceptions tells Dash not to report those ids as
#missing
app = dash.Dash(
    external_stylesheets=[dbc.themes.BOOTSTRAP],
    suppress_callback_exceptions=True,
)

# values shared by both style dicts: the sidebar width doubles as the left
# margin of the main content, and both boxes use the same padding
_SIDEBAR_WIDTH = "32rem"
_BOX_PADDING = "2rem 1rem"

# sidebar: pinned to the left edge over the full height of the window
SIDEBAR_STYLE = {
    "position": "fixed",
    "top": 0,
    "left": 0,
    "bottom": 0,
    "width": _SIDEBAR_WIDTH,
    "padding": _BOX_PADDING,
    "background-color": "#f8f9fa",
}

# main content: starts to the right of the sidebar, with some padding
CONTENT_STYLE = {
    "margin-left": _SIDEBAR_WIDTH,
    "margin-right": "2rem",
    "padding": _BOX_PADDING,
}

#define sidebar section of HTML
#(label, href) of every page, in the order shown in the navigation
nav_pages = [
    ("Season Rankings", "/"),
    ("Winners", "/page-1"),
    ("Miss Congenialities", "/page-2"),
    ("Dusted or Busted Rankings", "/page-3"),
    ("Cluster Analysis", "/page-4"),
]

sidebar = html.Div(
    children=[
        #html.H2("Drag Race: Plotly Dashboard", className="display-4"),
        html.Img(src='http://outtv.ca/content/uploads/2018/03/RPDR_S10_top_logo.png'),
        html.Hr(),
        html.P(
            "A Comparative Analysis of RuPaul's Drag Race Contestants", className="lead"
        ),
        dbc.Nav(
            [dbc.NavLink(label, href=href, active="exact") for label, href in nav_pages],
            vertical=True,
            pills=True,
        ),
    ],
    style=SIDEBAR_STYLE,
)

#Define content section of HTML; the page callback writes into this Div
content = html.Div(style=CONTENT_STYLE, id="page-content")

#Define a container for cluster dropdown options
def _variable_picker(label, component_id, default):
    """Return a labelled dropdown over every column of ``cluster`` after the first."""
    choices = [{"label": col, "value": col} for col in cluster.iloc[:, 1:].columns]
    return dbc.FormGroup(
        [
            dbc.Label(label),
            dcc.Dropdown(id=component_id, options=choices, value=default),
        ]
    )


#card with the three inputs read by the cluster callback
controls = dbc.Card(
    [
        _variable_picker("X variable", "x-variable", "age"),
        _variable_picker("Y variable", "y-variable", "place"),
        dbc.FormGroup(
            [
                dbc.Label("Number of clusters"),
                dbc.Input(id="cluster-num", type="number", value=3),
            ]
        ),
    ],
    body=True,
)

##############################################################################################################

#Define layouts for each page
def _page_heading(text):
    """Return the centered H3 title placed at the top of a page."""
    return html.H3(children = text, style = {'textAlign': 'center', 'color': '#636EFA'})


#page 1: season picker plus a graph filled in by the update_graph callback
layout1 = html.Div([
    _page_heading('Contestant Performance by Season (S01-12)'),
    dcc.Dropdown(
        id = 'season_dropdown',
        options = [dict(label = season, value = season) for season in rpdr_cp['season'].unique()],
        value = 'S12',
        clearable = False,
        style = dict(width = '75%', margin = 'auto')
    ),
    dcc.Graph(id = 'rpdr_graph1', figure = {})
])

#pages 2-4: a title above one of the prebuilt figures
layout2 = html.Div([
    _page_heading('Relative Comparison of Winner Performances (S01-12)'),
    dcc.Graph(id = 'rpdr_graph2', figure = fig2)
])

layout3 = html.Div([
    _page_heading('Comparison of Miss Congeniality Performances (S01-12)'),
    dcc.Graph(id = 'rpdr_graph3', figure = fig3)
])

layout4 = html.Div([
    _page_heading('Top 20 Contestants by Dusted or Busted Score (S01-12)'),
    dcc.Graph(id = 'rpdr_graph4', figure = fig4)
])

#page 5: cluster controls on the left, scatter plot on the right
layout5 = dbc.Container(
    children = [
        _page_heading("K-Means Clustering of Contestants"),
        html.Hr(),
        dbc.Row(
            children = [
                dbc.Col(controls, md=4),
                dbc.Col(dcc.Graph(id="cluster-graph"), md=8),
            ],
            align="center",
        ),
    ],
    fluid=True,
)

##############################################################################################################

#INDEX LAYOUT
#url tracker, fixed sidebar, and the Div that receives the selected page
app.layout = html.Div(children = [dcc.Location(id="url"), sidebar, content])

#INDEX CALLBACKS 
@app.callback(Output("page-content", "children"),
              [Input("url", "pathname")])
def render_page_content(pathname):
    """Return the page layout that belongs to the current URL path.

    Args:
        pathname: path reported by the ``url`` location component.

    Returns:
        One of layout1..layout5 for the five known paths, otherwise a
        "404" jumbotron that names the unknown path.
    """
    pages = {
        "/": layout1,
        "/page-1": layout2,
        "/page-2": layout3,
        "/page-3": layout4,
        "/page-4": layout5,
    }
    page = pages.get(pathname)
    if page is not None:
        return page

    # If the user tries to reach a different page, return a 404 message
    return dbc.Jumbotron(
        [
            html.H1("404: Not found", className="text-danger"),
            html.Hr(),
            # f-string, so the message shows the actual path instead of the
            # literal text "{pathname}"
            html.P(f"The pathname {pathname} was not recognised..."),
        ]
    )

##############################################################################################################

#OTHER CALLBACKS

#Callback for graph 1 dropdown options 
@app.callback(
    Output('rpdr_graph1', 'figure'), #figure of the page-1 graph to redraw
    [Input('season_dropdown', 'value')]) #season currently selected in the dropdown
def update_graph(season):
    """Return the outcome bar chart for the season chosen in the dropdown.

    season is compared, as a string, against rpdr_cp['season']; the figure
    has one bar per contestant, coloured by the regrouped outcome column.
    """
    season_rows = rpdr_cp[rpdr_cp['season'] == str(season)]
    outcome_bars = px.bar(
        season_rows,
        x = 'contestant',
        color = 'outcome2',
        labels = {
            'count': 'aggregate performance',
            'contestant': 'contestant (in order of season rank)',
            'outcome2': 'episode outcome',
        },
        color_discrete_sequence = ['#636EFA', '#FF6692', '#00CC96', '#FFA15A', '#FECB52', 'lightgrey'],
        category_orders = {"outcome2": ["WIN", "HIGH", "SAFE", "LOW", "BOTTOM", "OUT"]},
    )
    #x axis follows the order in which contestants first appear in the data
    contestant_order = season_rows['contestant'].unique()
    outcome_bars.update_layout(
        legend_traceorder = 'normal',
        xaxis = {'categoryorder': 'array', 'categoryarray': contestant_order},
    )
    return outcome_bars

##############################################################################################################

#Callback for cluster 
@app.callback(
    Output("cluster-graph", "figure"),
    [
        Input("x-variable", "value"),
        Input("y-variable", "value"),
        Input("cluster-num", "value"),
    ],
)
def make_graph(x, y, n_clusters):
    """Run k-means on two columns of ``cluster`` and plot the clusters.

    Args:
        x: column of ``cluster`` shown on the x axis (value of the x dropdown).
        y: column of ``cluster`` shown on the y axis (value of the y dropdown).
        n_clusters: requested number of clusters from the number input; it is
            None while the input is empty.

    Returns:
        A ``go.Figure`` with one marker trace per cluster plus one trace for
        the cluster centers.

    Raises:
        dash.exceptions.PreventUpdate: when a dropdown has no selection, so
            the figure already on screen is kept.
    """
    if x is None or y is None:
        raise dash.exceptions.PreventUpdate

    layout = {"xaxis": {"title": x}, "yaxis": {"title": y}}

    # the two columns are selected by name, which also works when x == y
    points = cluster[[x, y]].values
    if len(points) == 0:
        # nothing to cluster: return empty, labelled axes
        return go.Figure(layout=layout)

    # an empty or non-integer input falls back to one cluster; the count is
    # kept between 1 and the number of points, which k-means needs
    try:
        k = int(n_clusters)
    except (TypeError, ValueError):
        k = 1
    k = min(max(k, 1), len(points))

    km = KMeans(n_clusters=k)
    km.fit(points)
    labels = km.labels_
    centers = km.cluster_centers_

    # iterate over the clamped k so the traces match the clusters that were fitted
    data = [
        go.Scatter(
            x=cluster.loc[labels == c, x],
            y=cluster.loc[labels == c, y],
            mode="markers",
            marker={"size": 8},
            name="Cluster {}".format(c),
        )
        for c in range(k)
    ]

    data.append(
        go.Scatter(
            x=centers[:, 0],
            y=centers[:, 1],
            mode="markers",
            marker={"color": "#000", "size": 12, "symbol": "diamond"},
            name="Cluster centers"
        )
    )

    return go.Figure(data=data, layout=layout)


# make sure that x and y values can't be the same variable
def filter_options(v):
    """Return the dropdown options with the one equal to *v* disabled.

    Every column of ``cluster`` except 'contestant' is offered. That column
    holds the contestant names, is not among the options the dropdowns start
    with, and cannot be used as a clustering axis.
    """
    return [
        {"label": col, "value": col, "disabled": col == v}
        for col in cluster.columns
        if col != "contestant"
    ]


# both dropdowns share filter_options: the options of each one are recomputed
# from the value currently selected in the other
for _target_id, _source_id in (("x-variable", "y-variable"), ("y-variable", "x-variable")):
    app.callback(Output(_target_id, "options"), [Input(_source_id, "value")])(filter_options)

##############################################################################################################

#run dash locally 

#if __name__ == '__main__':
    #app.run_server('localhost', port = 3306)