In [1]:
from bokeh.layouts import column
from bokeh.models import Select, ColumnDataSource, CustomJS
from bokeh.plotting import Figure, show, output_notebook
import bokeh.palettes as bp
import pandas as pd
import numpy as np
import collections
import math
import itertools
import operator

In [2]:
# Tell Bokeh to render plots inline in this notebook's output cells, so the
# later show() call displays there.
output_notebook()

In [3]:
# Load the guesses table: columns are movies, rows are users.
df = pd.read_csv('all_data.txt')

# Make the headers safe to use as pandas column names in a single pass:
# spaces become underscores; parentheses and colons are dropped.
header_cleanup = str.maketrans(' ', '_', '():')
df.columns = [header.translate(header_cleanup) for header in df.columns]

# Every column after the first one is treated as a movie column.
movies_with_rt = df.columns[1:]

In [4]:
# Reshape the wide guesses table (one column per movie) into a long table with
# one row per (user, movie) guess.
data = collections.defaultdict(list)
movies_scores = []  # (movie name, RT score) pairs; sorted below so colors follow score order
for movie in movies_with_rt:
    # The last '_'-separated token of the header is the RT score; the tokens
    # before it are the movie title.
    *name_parts, score_part = movie.split('_')
    movie_name = ' '.join(name_parts)
    rt_score = int(score_part)
    movies_scores.append((movie_name, rt_score))

    # Select the users who actually guessed this movie with one vectorized
    # mask instead of walking df.iterrows() again for every movie column.
    guessed = df.loc[df[movie].notna(), ['user', movie]]
    data['user'].extend(guessed['user'].tolist())
    data['guess'].extend(guessed[movie].tolist())
    data['movie'].extend([movie_name] * len(guessed))
    data['rt_score'].extend([rt_score] * len(guessed))

# Name the columns explicitly so the frame keeps its schema even when nobody
# has guessed yet (an empty dict would otherwise give a column-less frame and
# the 'movie' lookup below would raise KeyError).
points = pd.DataFrame(data, columns=['user', 'guess', 'movie', 'rt_score'])

# get list of movies sorted by rt_value so can color them differently
sorted_movies = [name for name, _ in sorted(movies_scores, key=operator.itemgetter(1))]

# get a color for each movie, cycling through a list of colors
colors = list(itertools.islice(itertools.cycle(bp.d3['Category10'][10]), 0, len(movies_scores)))
movie_to_color = dict(zip(sorted_movies, colors))

# apply movie_to_color dict to get new column in dataframe
points['color'] = points['movie'].map(movie_to_color)
points['line_width'] = 1  # later on user can change line_width by selection

In [5]:
# calculate the leaderboard based on those who have made minimum # of guesses
min_guesses = np.ceil(len(movies_with_rt) / 2)
leaderboard = collections.defaultdict(list)
# groupby hands over each user's guesses directly, instead of re-filtering the
# whole points table once per user; users with no guesses simply do not appear.
for user, user_df in points.groupby('user'):
    n_guesses = user_df.shape[0]
    if n_guesses < min_guesses:
        continue
    # absolute error of each guess against the actual RT score
    deltas = (user_df.guess - user_df.rt_score).abs()
    leaderboard['mean_delta'].append(deltas.mean())
    leaderboard['median_delta'].append(deltas.median())
    leaderboard['best_delta'].append(int(deltas.min()))
    leaderboard['guesses'].append(n_guesses)
    leaderboard['user'].append(user)

# Explicit columns keep the schema when no user qualifies, so the sort below
# does not raise KeyError on an empty leaderboard.
leader = pd.DataFrame(
    leaderboard,
    columns=['mean_delta', 'median_delta', 'best_delta', 'guesses', 'user'],
)

# sort the dataframe from lowest->highest median, with mean as 1st tie-breaker
leader = leader.sort_values(['median_delta', 'mean_delta', 'best_delta', 'guesses', 'user'],
                            ascending=[True, True, True, False, True])

# generate strings for displaying in the Select menu
# NOTE: each label must start with the username followed by a space; the
# plot's JS callback recovers the user by taking the first space-separated token.
leaders = [
    f'{row.user} (median={row.median_delta:.1f} mean={row.mean_delta:.1f} '
    f'best={row.best_delta} guesses={row.guesses})'
    for row in leader.itertuples(index=False)
]

In [6]:
# Data source backing the scatter plot. The JS callback below rewrites its
# 'line_width' column in the browser to highlight one user's guesses.
source = ColumnDataSource(data=points)

# Hover tooltip rows: (label, '@column' looked up in `source`).
TOOLTIPS = [
    ('user', '@user'),
    ('movie', '@movie'),
    ('RT', '@rt_score'),
    ('guess', '@guess')
]

p = Figure(plot_width=800, plot_height=800, tooltips=TOOLTIPS,
           title="How well can Redditors predict Rotten Tomato scores")
p.xaxis.axis_label = 'Guess'
p.yaxis.axis_label = 'Actual'
# One circle per guess; color and outline width are read per-point from `source`.
p.circle('guess', 'rt_score', color='color', line_width='line_width', size=10,
         source=source, fill_alpha=0.2)
# Diagonal reference line where guess == actual score.
p.line([0, 100], [0, 100])

# Runs in the browser when the Select value changes: the username is the first
# space-separated token of the selected label; that user's points get a thick
# outline (10) and every other point is reset to 1.
callback = CustomJS(args=dict(source=source), code="""
    var data = source.data;
    var f = cb_obj.value
    f = f.split(" ")[0];
    var x = data['user']
    var y = data['line_width']
    for (var i=0; i<x.length; i++) {
        if (x[i] == f) {
            y[i] = 10
        } else {
            y[i] = 1
        }
    }
    source.change.emit();
""")

# The leading blank entry means "no user highlighted". The widget starts on it,
# so its initial value is one of its own options.
options = [''] + leaders
select = Select(title="Redditor:", value='', options=options)
select.js_on_change('value', callback)

layout = column(select, p)
show(layout)

In [7]:
from IPython.display import HTML

In [8]:
# Emit an HTML/JS snippet as this cell's output so the rendered notebook hides
# its code by default. On document ready, code_toggle() runs once and hides
# every `div.input` element; the link in the trailing text calls it again to
# show or hide them.
# NOTE(review): the script uses `$`, so it presumably relies on jQuery being
# loaded by the notebook front end — confirm for the target viewer.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')