Team: Pandas

Group members: Francesca-Zhoufan Li, Elena Sorina Lupu, Nikhil Ranganathan

# Install and Import Packages

In [None]:
!pip install iqplot

In [None]:
import ssl
# NOTE(review): this swaps the factory for the default HTTPS context with the
# "unverified" one, which turns off TLS certificate verification for code that
# relies on the default context. Presumably added so the raw.githubusercontent.com
# downloads in load_data work on machines with missing root certificates --
# confirm it is still needed, since it weakens security for the whole kernel.
ssl._create_default_https_context = ssl._create_unverified_context

from math import pi
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import seaborn as sns
sns.set_theme(style="white", context="talk")

import iqplot
import bokeh.io
from bokeh.layouts import column, gridplot
from bokeh.models import ColorBar, ColorMapper, LinearColorMapper, Ticker
bokeh.io.output_notebook()

# Load and Clean Up Data

In [None]:
def load_data(data_url=None, movies_url=None):
    """Load the rating table and the movie/genre table.

    Parameters
    ----------
    data_url : str or path-like, optional
        Location of the tab-separated rating file (no header row, three
        columns). Defaults to the CS155 project-2 ``data.txt`` on GitHub.
    movies_url : str or path-like, optional
        Location of the tab-separated, latin-1 encoded movie file (no header
        row, 21 columns). Defaults to the CS155 project-2 ``movies.txt``.

    Returns
    -------
    data : pandas.DataFrame
        Columns ``USER``, ``MOVIE``, ``RATING``.
    movies : pandas.DataFrame
        Columns ``MOVIE_ID``, ``TITLE`` followed by one column per genre.
        Rows whose title is the literal string ``"unknown"`` get the title
        ``"MOVIE_ID: <id>"`` instead so that every title is distinguishable.
    """
    base_url = (
        "https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/"
        "main/projects/project2/data/"
    )
    if data_url is None:
        data_url = base_url + "data.txt"
    if movies_url is None:
        movies_url = base_url + "movies.txt"

    genre_columns = [
        "UNKNOWN",
        "ACTION",
        "ADVENTURE",
        "ANIMATION",
        "CHILDREN",
        "COMEDY",
        "CRIME",
        "DOCUMENTARY",
        "DRAMA",
        "FANTASY",
        "FILM-NOIR",
        "HORROR",
        "MUSICAL",
        "MYSTERY",
        "ROMANCE",
        "SCI-FI",
        "THRILLER",
        "WAR",
        "WESTERN",
    ]

    data = pd.read_csv(
        data_url,
        sep="\t",
        header=None,
        names=["USER", "MOVIE", "RATING"],
    )
    movies = pd.read_csv(
        movies_url,
        encoding="latin-1",
        sep="\t",
        header=None,
        names=["MOVIE_ID", "TITLE"] + genre_columns,
    )

    # Compute the mask once and reuse it on both sides of the assignment.
    is_unknown = movies.TITLE == "unknown"
    movies.loc[is_unknown, "TITLE"] = "MOVIE_ID: " + movies.loc[
        is_unknown, "MOVIE_ID"
    ].astype("str")

    return data, movies

In [None]:
# Fetch both tables once; later cells read the notebook-level `data` (ratings)
# and `movies` (titles + genre flags) frames created here.
data, movies = load_data()

# Basic Visualization

## All MovieLens Dataset

In [None]:
# Column-wise totals of every genre flag (the id and title are moved into the
# index so they are not summed), listed from the largest total to the smallest.
genre_totals = movies.set_index(["MOVIE_ID", "TITLE"]).sum(axis=0)
sum_df = pd.DataFrame(
    {"Genres": genre_totals.index, "Counts": genre_totals.values}
)
sum_df = sum_df.sort_values(["Counts"], ascending=False).reset_index(drop=True)
sum_df

In [None]:
# Bar chart of the per-genre totals from `sum_df`, with each total written at
# the top of its bar. An explicit figure/axes pair is used instead of the
# implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(15, 8))
x = np.array(list(sum_df.Genres))
y1 = sum_df.Counts.values
sum_bar = sns.barplot(x=x, y=y1, palette="crest_r", ax=ax)

# Bars are drawn at integer positions 0..n-1 in the order of `x`, so the
# position of a bar doubles as the x coordinate of its label.
for bar_pos, count in enumerate(y1):
    sum_bar.text(bar_pos, count, count, color="black", ha="center")

sum_bar.set(xlabel="Genres", ylabel="Counts",
            title="Summary of genre counts")
sum_bar.set_xticklabels(sum_bar.get_xticklabels(), rotation=90);

In [None]:
# Movie-by-genre membership map: one row per movie (row labels hidden), one
# column per genre flag, drawn with a two-colour map (white for 0, blue for 1).
fig, ax = plt.subplots(figsize=(10, 40))
movie_heat = sns.heatmap(
    movies.iloc[:, 1:].set_index("TITLE"),  # drop MOVIE_ID, index rows by title
    yticklabels=False,
    cmap=[(1, 1, 1), (0.14573579, 0.29354139, 0.49847009)],
    cbar_kws=dict(
        use_gridspec=False,
        shrink=0.2,
        ticks=[0, 1],
        label="if the movie belongs to the genre",
    ),
    ax=ax,
)

movie_heat.set(xlabel="Genres", ylabel="Movies", title="Summary of genre for each movie");

## All ratings in the MovieLens Dataset

In [None]:
# Histogram of the RATING column over every row of the rating table.
data_all = iqplot.histogram(data=data, q="RATING", title="Rating for all movies")
bokeh.io.show(data_all)

In [None]:
def plot_heat_rate(
    df, title_details, x_tick_scale=None, y_tick_scale=None, ifreorder=False
):
    """Plot a movie-by-user heatmap coloured by rating.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format ratings with ``USER`` and ``RATING`` columns plus one
        movie identifier column. The first of ``TITLE``, ``MOVIE_ID``,
        ``MOVIE`` that is present is used for the rows of the heatmap.
    title_details : str
        Text inserted into the figure title ("Summary of <title_details>").
    x_tick_scale, y_tick_scale : number, optional
        Spacing between major ticks on the user / movie axis. ``None`` keeps
        seaborn's default ticks. ``x_tick_scale`` is forced to 50 when there
        are more than 50 users and ``y_tick_scale`` is forced to 50 when there
        are more than 1000 movies.
    ifreorder : bool, default False
        If true, order the rows by first appearance of each movie in ``df``
        instead of the sorted order produced by the pivot.

    Returns
    -------
    matplotlib.axes.Axes
        The axes returned by ``seaborn.heatmap``.

    Raises
    ------
    ValueError
        If ``df`` has none of the ``TITLE``, ``MOVIE_ID``, ``MOVIE`` columns.
    """
    row_key = next(
        (col for col in ("TITLE", "MOVIE_ID", "MOVIE") if col in df.columns), None
    )
    if row_key is None:
        raise ValueError(
            "df needs a TITLE, MOVIE_ID or MOVIE column to build the rating heatmap"
        )
    nrow = len(df[row_key].unique())

    try:
        # Keyword arguments: pandas >= 2.0 no longer accepts these positionally.
        df_heat = df.pivot(index=row_key, columns="USER", values="RATING")
    except ValueError:
        # pivot refuses repeated (movie, user) pairs; this can happen when
        # rows are keyed by a label that is shared by several entries.
        # Collapse exact duplicates, then average whatever is left.
        print("Resolving duplicating issue")
        df_heat = (
            df[[row_key, "USER", "RATING"]]
            .drop_duplicates()
            .pivot_table(
                index=row_key, columns="USER", values="RATING", aggfunc="mean"
            )
        )

    # Figure height in inches grows with the number of rows; very small tables
    # get tall rows and very large ones get thin rows plus sparse y ticks.
    if nrow < 4:
        nscale = 4
    elif nrow > 1000:
        nscale = 0.01
        y_tick_scale = 50
    else:
        nscale = 0.6
    p_height = nrow * nscale

    if len(df.USER.unique()) > 50:
        x_tick_scale = 50

    if ifreorder:
        # Use the same column the rows were keyed on, not a hard-coded TITLE.
        df_heat = df_heat.reindex(df[row_key].unique())

    plt.figure(figsize=(20, p_height))
    rating_heat = sns.heatmap(
        df_heat,
        cmap="crest",
        cbar_kws=dict(
            use_gridspec=False,
            shrink=0.2,
            ticks=list(range(1, 6)),
            label="rating",
            location="top",
        ),
    )

    if x_tick_scale is not None:
        rating_heat.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_scale))
        rating_heat.xaxis.set_major_formatter(ticker.ScalarFormatter())

    if y_tick_scale is not None:
        rating_heat.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_scale))
        rating_heat.yaxis.set_major_formatter(ticker.ScalarFormatter())

    rating_heat.set(xlabel="USER", title=f"Summary of {title_details}")
    plt.show()

    return rating_heat

In [None]:
def plot_rating_cat(df, cat):
    """Show a strip-box plot of RATING with one category per value of ``cat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table containing a ``RATING`` column and the column ``cat``.
    cat : str
        Name of the column used as the categorical axis (e.g. ``"USER"`` or
        ``"MOVIE"``).

    The figure is displayed with ``bokeh.io.show``; nothing is returned.
    """
    n_cats = len(df[cat].unique())
    # Every category is drawn in the same colour (the last of five "crest"
    # shades), so one palette entry per category is enough -- the palette does
    # not need one entry per rating row.
    single_color = sns.color_palette("crest", 5).as_hex()[-1]

    p = iqplot.stripbox(
        data=df.sort_values(cat),
        q="RATING",
        cats=cat,
        plot_width=n_cats * 10,  # 10 px per category
        palette=[single_color] * n_cats,
        jitter=True,
        marker_kwargs=dict(alpha=0.05),
        q_axis="y",
        title="Rating per " + cat,
    )
    # Vertical category labels so that many categories stay readable.
    p.xaxis.major_label_orientation = pi/2
    bokeh.io.show(p)

In [None]:
# Heatmap of the full rating table, requesting a major tick every 50 units on
# both the user axis and the movie axis.
rating_heat_all = plot_heat_rate(data,
                                 "full user movie rating",
                                 x_tick_scale=50, y_tick_scale=50)

In [None]:
# Rating distribution with one category per USER value.
plot_rating_cat(data, "USER")

In [None]:
# Rating distribution with one category per MOVIE value.
plot_rating_cat(data, "MOVIE")

## All ratings of the ten most popular (rated) movies

In [None]:
def get_pop_movie(data, movies):
    """Count how many ratings each movie received.

    Returns a frame with columns ``MOVIE_ID`` and ``RATING_COUNTS``, ordered
    from the most-rated movie to the least-rated one. ``movies`` is accepted
    but not used.
    """
    ratings_per_movie = data.MOVIE.value_counts()
    return pd.DataFrame(
        {
            "MOVIE_ID": ratings_per_movie.index,
            "RATING_COUNTS": ratings_per_movie.values,
        }
    )

In [None]:
def merge_title(df, movies, ifgenres=False):
    """Attach movie titles (and optionally genre flags) to a ratings table.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings keyed by ``MOVIE_ID``; a ``MOVIE`` column is accepted and is
        renamed to ``MOVIE_ID`` in the returned copy (the input is not
        modified).
    movies : pandas.DataFrame
        Movie table with at least ``MOVIE_ID`` and ``TITLE`` columns.
    ifgenres : bool, default False
        If true, merge every column of ``movies``; otherwise only ``TITLE``.

    Returns
    -------
    pandas.DataFrame
        Left join of ``df`` with the movie table. Rows whose id has no match
        get the placeholder title ``"MOVIE_ID: <id>"``.
    """
    if "MOVIE" in df.columns:
        df = df.rename(columns={"MOVIE": "MOVIE_ID"})

    movie_comb = movies if ifgenres else movies[["MOVIE_ID", "TITLE"]]

    df = df.merge(movie_comb, on="MOVIE_ID", how="left")

    # The ids are numeric, so they must be cast to text before being
    # concatenated with the prefix (str + int Series raises TypeError).
    no_title = df.TITLE.isnull()
    df.loc[no_title, "TITLE"] = "MOVIE_ID: " + df.loc[no_title, "MOVIE_ID"].astype(str)

    return df

In [None]:
def get_top_pop_movie_data(data, movies, topn):
    """Collect every rating of the ``topn`` movies that have the most ratings.

    The rows are grouped movie by movie, most-rated movie first, and the
    result is passed through ``merge_title`` so it carries a ``TITLE`` column.
    """
    most_rated_ids = get_pop_movie(data, movies).MOVIE_ID[:topn]
    per_movie_ratings = [data[data.MOVIE == movie_id] for movie_id in most_rated_ids]
    return merge_title(pd.concat(per_movie_ratings), movies)

In [None]:
def plot_topn_pop_rating(df, title_details, topn):
    """Show a strip-box plot of ratings, one category per movie title.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with ``TITLE`` and ``RATING`` columns.
    title_details : str
        Text inserted into the figure title.
    topn : int or None
        Number of movies being shown; sets the number of palette colours and
        the upper end of the colour bar. ``None`` means "all movies in
        ``df``" and is resolved to the number of distinct titles.

    Returns
    -------
    The bokeh figure produced by ``iqplot.stripbox`` (also displayed).
    """
    ncols = len(df.TITLE.unique())

    if topn is None:
        # "all" refers to movies, so size the palette by the number of
        # distinct titles, not by the number of rating rows in df.
        topn = ncols
        topn_str = "all"
    else:
        topn_str = topn

    # Narrower columns once there are many movies; never below 500 px overall.
    col_w = 50 if ncols < 100 else 20
    p_w = max(ncols * col_w, 500)

    topn_pop = iqplot.stripbox(
        data=df,
        q="RATING",
        cats="TITLE",
        palette=list(sns.color_palette("crest_r", topn).as_hex()),
        jitter=True,
        top_level="box",
        q_axis="y",
        plot_width=p_w,
        marker_kwargs=dict(alpha=0.05),
        title=f"Rating for the {topn_str} {title_details} movies",
    )

    # Colour bar spanning 1..topn, built from the non-reversed "crest" palette.
    mapper = LinearColorMapper(
        palette=list(sns.color_palette("crest", topn).as_hex()), low=1, high=topn,
    )
    color_bar = ColorBar(
        color_mapper=mapper,
        padding=0,
        location=(0, 0),
        title="Most rated",
        title_standoff=10,
    )

    topn_pop.add_layout(color_bar, "right")
    topn_pop.xaxis.major_label_orientation = pi / 2
    topn_pop.xaxis.axis_label = "MOVIE"
    bokeh.io.show(topn_pop)
    return topn_pop

In [None]:
def plot_hist_list(df):
    """Draw one small rating histogram per distinct title in ``df``.

    The histograms are laid out in a grid with ten columns, displayed, and
    returned as a list in order of first appearance of each title.
    """
    hists = [
        iqplot.histogram(
            data=df[df.TITLE == movie_title],
            bins="exact",
            q="RATING",
            title=movie_title,
            plot_width=200,
            plot_height=150,
        )
        for movie_title in df.TITLE.unique()
    ]

    bokeh.io.show(column(gridplot(hists, ncols=10)))
    return hists

In [None]:
def plot_plots(
    df,
    topn,
    title_details,
    x_tick_scale=None,
    y_tick_scale=None,
    ifhist=True,
    ifreorder=True,
):
    """Generate the strip-box (or strip) plot, the heatmap and the histograms.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with ``TITLE``, ``USER`` and ``RATING`` columns.
    topn : int or None
        Number of movies shown; ``None`` is rendered as "all" in titles.
    title_details : str
        Text inserted into the figure titles.
    x_tick_scale, y_tick_scale : number, optional
        Forwarded to ``plot_heat_rate``.
    ifhist : bool, default True
        Whether to also draw the per-movie histograms.
    ifreorder : bool, default True
        Forwarded to ``plot_heat_rate``.

    Returns
    -------
    tuple
        ``(plot, heatmap, histograms)`` when ``ifhist`` is true, otherwise
        ``(plot, heatmap)``.
    """
    topn_str = "all" if topn is None else topn

    try:
        poporrate = plot_topn_pop_rating(df, title_details, topn)
    except Exception as err:
        # Best-effort fallback to the plain strip plot. Only ordinary errors
        # are caught (so Ctrl-C still interrupts) and the reason is reported
        # instead of being swallowed.
        print(
            f"Strip-box plot failed ({type(err).__name__}: {err}); "
            "falling back to strip plot"
        )
        poporrate = plot_top_rate(df, title_details, topn)

    heat = plot_heat_rate(
        df,
        f"{title_details} {topn_str} movies",
        x_tick_scale=x_tick_scale,
        y_tick_scale=y_tick_scale,
        ifreorder=ifreorder,
    )

    if ifhist:
        hists = plot_hist_list(df)
        return poporrate, heat, hists
    return poporrate, heat

In [None]:
# Keep only the ratings of the `topn` movies that have the most ratings, then
# draw the rating plot, the movie-by-user heatmap and one histogram per movie.
topn = 10
topn_pop_df = get_top_pop_movie_data(data, movies, topn)
pop, pop_heat, pop_hist = plot_plots(topn_pop_df, topn, "top rated", x_tick_scale=50)

## All ratings of the ten best movies with the highest average ratings

In [None]:
def get_top_rate(data, movies, topn):
    """Collect every rating of the ``topn`` movies with the highest mean rating.

    Movies are ranked by the mean of their ``RATING`` values (highest first);
    the ratings of the first ``topn`` movies are stacked in that order and
    passed through ``merge_title`` so the result carries a ``TITLE`` column.
    """
    mean_by_movie = data.set_index("MOVIE").groupby("MOVIE").mean()
    ranked = mean_by_movie.sort_values(["RATING"], ascending=False)

    best_ids = ranked.index[:topn]
    per_movie_ratings = [data[data.MOVIE == movie_id] for movie_id in best_ids]

    return merge_title(pd.concat(per_movie_ratings), movies)

In [None]:
def plot_top_rate(df, title_details, topn):
    """Show a jittered strip plot of ratings, one category per movie title.

    Points are coloured by their ``RATING`` value using one reversed "crest"
    shade per distinct rating. ``topn`` only affects the title, where ``None``
    is rendered as "all". The bokeh figure is displayed and returned.
    """
    topn_str = "all" if topn is None else topn

    # 50 px per movie for short lists, 10 px otherwise; at least 200 px overall.
    n_titles = len(df.TITLE.unique())
    px_per_title = 50 if n_titles < 20 else 10
    p_w = max(n_titles * px_per_title, 200)

    n_rating_levels = len(df.RATING.unique())
    rating_palette = list(sns.color_palette("crest_r", n_rating_levels).as_hex())

    topn_rate = iqplot.strip(
        data=df,
        q="RATING",
        cats="TITLE",
        palette=rating_palette,
        jitter=True,
        q_axis="y",
        plot_width=p_w,
        marker_kwargs=dict(alpha=0.5),
        color_column="RATING",
        title=f"Rating for the {title_details} {topn_str} movies",
    )

    topn_rate.xaxis.major_label_orientation = pi / 2
    topn_rate.xaxis.axis_label = "MOVIE"
    bokeh.io.show(topn_rate)
    return topn_rate

In [None]:
# Keep only the ratings of the `topn` movies with the highest mean rating, then
# draw the rating plot, the movie-by-user heatmap and one histogram per movie.
topn = 10
topn_rate_df = get_top_rate(data, movies, topn)
rate, rate_heat, rate_hist = plot_plots(topn_rate_df, topn, "top average rating")

## All ratings of movies from different genres of choice

In [None]:
# One rating plot + heatmap per genre, using every movie flagged with that genre.
# topn=None makes the plot titles say "all".
topn = None
genre_df = merge_title(data, movies, ifgenres=True)
# Every column after the first two (MOVIE_ID, TITLE in load_data's layout).
genres = movies.columns[2:]
for g in genres:
    # Ratings whose movie has this genre flag set to 1.
    g_df = genre_df[genre_df[g]==1]
    # `rate` and `rate_heat` are overwritten on every pass, so after the loop
    # they refer to the plots of the last genre only.
    rate, rate_heat = plot_plots(
        g_df, topn, g, ifhist=False, ifreorder=False
    )