In [None]:
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.io import output_notebook
import bokeh.palettes as bp
import pandas as pd
import collections
import math
import itertools
import operator

In [None]:
# Configure Bokeh so that later show() calls render inline in the notebook.
output_notebook()

In [None]:
# Load the raw guesses table; the path is relative to the working directory.
df = pd.read_csv('all_data.txt')
# columns are movies, rows are users
# Later cells read a `user` column and treat every column after the first as a movie
# whose label ends in an integer score (presumably the Rotten Tomatoes value; confirm against the data file).

In [None]:
# Clean the column labels in one pass: spaces become underscores, while
# parentheses and colons are removed because they cause issues with pandas
# column names.
_label_cleanup = str.maketrans(' ', '_', '():')
df.columns = [label.translate(_label_cleanup) for label in df.columns]
# Everything after the first column is treated as a movie column.
movies_with_rt = df.columns[1:]

In [None]:
# Lightweight records: one Point per (movie column, user) guess, and one
# Movie per column holding its display name and score.
Point = collections.namedtuple('Point', 'movie user guess')
Movie = collections.namedtuple('Movie', 'name rt_value')

# Walk the frame row by row and keep every movie cell that is not NaN,
# i.e. every guess a user actually made.
points = [
    Point(movie=column, user=record['user'], guess=record[column])
    for _, record in df.iterrows()
    for column in movies_with_rt
    if not math.isnan(record[column])
]
            
# Map each movie column label to a Movie record. The text after the last
# underscore is parsed as the integer score; the text before it becomes the
# display name, with the remaining underscores turned back into spaces.
movie_info = {}
for column in movies_with_rt:
    title_part, sep, score_part = column.rpartition('_')
    movie_info[column] = Movie(name=title_part.replace('_', ' '),
                               rt_value=int(score_part))
    
# Order the movie columns by ascending score so colors can be assigned in
# that order. The sort is stable, so ties keep their column order.
movie_rt_value = [(column, movie_info[column].rt_value) for column in movie_info]
sorted_movies = [column for column, _ in sorted(movie_rt_value, key=lambda pair: pair[1])]

# Assign one palette color per movie, wrapping around to the start of the
# 10-color palette whenever there are more movies than colors.
palette = bp.d3['Category10'][10]
colors = [palette[position % len(palette)] for position in range(len(movies_with_rt))]
movie_to_color = dict(zip(sorted_movies, colors))

In [None]:
# Flatten the Point records into parallel lists (one entry per guess) in a
# single pass. Note that `colors` is rebound here: it now holds one color
# per point instead of one color per movie.
rt_values, guesses, movies, users, colors = [], [], [], [], []
for pt in points:
    info = movie_info[pt.movie]
    rt_values.append(info.rt_value)
    guesses.append(pt.guess)
    movies.append(info.name)
    users.append(pt.user)
    colors.append(movie_to_color[pt.movie])

In [None]:
# Column-oriented data behind the plot: x is the guess and y is the actual
# score. The remaining columns feed the hover tooltips and the marker color.
plot_columns = {
    'x': guesses,
    'y': rt_values,
    'movies': movies,
    'users': users,
    'color': colors,
}
source = ColumnDataSource(data=plot_columns)

# Hover tooltip rows as (label, field) pairs; each '@name' field refers to
# the column of that name in `source`.
TOOLTIPS = list(zip(
    ('user', 'movie', 'RT', 'guess'),
    ('@users', '@movies', '@y', '@x'),
))

# Scatter plot of each guess (x) against the actual score (y), one marker per
# row in `source`, colored via the `color` column.
# `width`/`height` are used because `plot_width`/`plot_height` were deprecated
# in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=800, height=800, tooltips=TOOLTIPS,
           title="How well can redditors predict Rotten Tomato scores")
p.xaxis.axis_label = 'Guess'
p.yaxis.axis_label = 'Actual'
# scatter() draws circle markers by default and takes a screen-unit `size`;
# circle(size=...) is deprecated as of Bokeh 3.4.
p.scatter('x', 'y', color='color', size=10, source=source, fill_alpha=0.2)
# Reference diagonal y = x: a point on this line is an exactly correct guess.
p.line([0, 100], [0, 100])

In [None]:
# Render the finished figure.
show(p)