In [1]:
import pandas as pd
import numpy as np
import png
import time

In [4]:
# Load the full ratings table; the preview below shows one row per
# (movie_id, user_id, rating) record.
df=pd.read_csv("../all_ratings.csv")
df.head()

Unnamed: 0,movie_id,user_id,rating
0,4500,2532865,4
1,4500,573364,3
2,4500,1696725,3
3,4500,1253431,3
4,4500,1265574,2


In [3]:
# The titles file is read without a header row: the first three fields are
# named explicitly and decoded as ISO-8859-1.
titles_df = pd.read_csv(
    "../flask/static/movie_titles.csv",
    header=None,
    names=["movie_id", "year", "title"],
    usecols=[0, 1, 2],
    encoding="ISO-8859-1",
)
titles_df.head()

Unnamed: 0,movie_id,year,title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [4]:
# Number of ratings per movie_id (value_counts orders the most-rated first).
mvc=df["movie_id"].value_counts()

In [5]:
# Ids of the 1,000 movies with the most ratings, as a plain Python list.
top_1k_ids = mvc.index[:1000].tolist()

In [6]:
def get_title(df, movie_id):
    """Return the ``title`` of the first row of ``df`` whose ``movie_id`` matches.

    Raises IndexError when no row has the requested ``movie_id``.
    """
    matching_rows = df[df["movie_id"] == movie_id]
    first_match = matching_rows.iloc[0]
    return first_match["title"]

In [7]:
# Titles of the most-rated movies, in the same order as top_1k_ids.
[get_title(titles_df, movie_id) for movie_id in top_1k_ids]

['Miss Congeniality',
 'Independence Day',
 'The Patriot',
 'The Day After Tomorrow',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'Pretty Woman',
 'Forrest Gump',
 'The Green Mile',
 'Con Air',
 'Twister',
 'Sweet Home Alabama',
 'Pearl Harbor',
 'Armageddon',
 'The Rock',
 'What Women Want',
 'Bruce Almighty',
 "Ocean's Eleven",
 'The Bourne Identity',
 'The Italian Job',
 'I',
 'American Beauty',
 'How to Lose a Guy in 10 Days',
 'Lethal Weapon 4',
 'Shrek 2',
 'Lost in Translation',
 'Top Gun',
 'Pulp Fiction',
 'Gone in 60 Seconds',
 'The Sixth Sense',
 'Lord of the Rings: The Two Towers',
 'Men of Honor',
 'Gladiator',
 'Lord of the Rings: The Fellowship of the Ring',
 'Sister Act',
 'Double Jeopardy',
 'Two Weeks Notice',
 'The Royal Tenenbaums',
 'Troy',
 'National Treasure',
 '50 First Dates',
 'Indiana Jones and the Last Crusade',
 'My Big Fat Greek Wedding',
 'Mystic River',
 'Titanic',
 'Dirty Dancing',
 'Catch Me If You Can',
 'Finding Nemo (Widescreen)',
 '

In [8]:
# Rows of titles_df for the most-rated movies. A boolean isin() mask selects
# the same rows as the former query string without formatting the whole id
# list into an expression that pandas then has to parse.
titles_df[titles_df["movie_id"].isin(top_1k_ids)]

Unnamed: 0,movie_id,year,title
27,28,2002.0,Lilo and Stitch
29,30,2003.0,Something's Gotta Give
110,111,2003.0,Duplex (Widescreen)
142,143,1997.0,The Game
174,175,1992.0,Reservoir Dogs
190,191,2003.0,X2: X-Men United
196,197,2004.0,Taking Lives
198,199,1978.0,The Deer Hunter
240,241,1959.0,North by Northwest
256,257,1973.0,Charlotte's Web


In [12]:
# Save (movie_id, title) for the most-rated movies. The isin() mask replaces
# an f-string query that embedded the full id list in the expression text.
titles_df.loc[
    titles_df["movie_id"].isin(top_1k_ids), ["movie_id", "title"]
].to_csv("top_1k_titles.csv", index=False)

In [2]:
# Reload the (movie_id, title) table that was written to top_1k_titles.csv.
# NOTE(review): the execution counts restart at this cell, yet later cells
# still use `df` — confirm the ratings table is reloaded before running them.
titles_1k = pd.read_csv("top_1k_titles.csv")

In [3]:
# Movie ids from the saved table, as a plain Python list.
ids_1k = titles_1k["movie_id"].tolist()

In [5]:
# Map every selected movie id to its ratings, indexed by user_id (the
# movie_id column is dropped because it is constant within each entry).
# NOTE(review): each entry filters the whole of df again; a single groupby
# pass might be faster if this cell is slow — confirm before changing.
movies = {
    movie_id: df[df["movie_id"] == movie_id].drop(["movie_id"], axis=1).set_index("user_id")
    for movie_id in ids_1k
}

In [6]:
def rough_means_v1(df, id1, id2, movie_dict=None, col_norm=50, low_clip=0, sym=False):
    """Compute the three-element "rough means" score for a pair of movies.

    The ratings of the two movies are inner-joined on user, so only users who
    rated both contribute, and the joined frame is handed to
    ``rough_means_df``.

    Parameters
    ----------
    df : DataFrame with ``movie_id``, ``user_id`` and ``rating`` columns.
        Only used when ``movie_dict`` is None.
    id1, id2 : movie ids to compare.
    movie_dict : optional mapping of movie id -> ratings frame indexed by
        ``user_id`` (as built in the ``movies`` cell). When given, ``df`` is
        ignored.
    col_norm, low_clip : forwarded unchanged to ``rough_means_df``.
    sym : when True, return a pair ``(id1 -> id2, id2 -> id1)`` instead of
        only the ``id1 -> id2`` score.

    Returns
    -------
    A list of three scores, or a tuple of two such lists when ``sym`` is True.
    A movie compared with itself yields all zeros.
    """
    if id1 == id2:
        return ([0, 0, 0], [0, 0, 0]) if sym else [0, 0, 0]

    if movie_dict is None:
        # Index both frames by user_id so the join pairs ratings made by the
        # same user. Without this the join ran on the original row labels,
        # which never coincide for two different movies, so the result was
        # always empty.
        df1 = df[df["movie_id"] == id1].drop(["movie_id"], axis=1).set_index("user_id")
        df2 = df[df["movie_id"] == id2].drop(["movie_id"], axis=1).set_index("user_id")
    else:
        df1 = movie_dict[id1]
        df2 = movie_dict[id2]

    # Overlapping column names get the suffixes that rough_means_df expects.
    df_join = df1.join(df2, how="inner", lsuffix="_x", rsuffix="_y")

    forward = rough_means_df(df_join, True, col_norm, low_clip)
    if not sym:
        return forward
    return forward, rough_means_df(df_join, False, col_norm, low_clip)

In [7]:
def rough_means_df(df_join, col_x=True, col_norm=50, low_clip=0):
    """Score a joined ratings frame as three values: low, medium, high.

    Rows are split by the "bucket" rating (``rating_x`` when ``col_x`` is
    True, otherwise ``rating_y``) into below 3, exactly 3 and above 3. For
    each bucket the mean of the *other* rating is mapped from [1, 5] onto
    [0, 1] and multiplied by a support factor that grows linearly with the
    bucket size and saturates at ``col_norm`` rows for the medium bucket and
    ``2 * col_norm`` rows for the low and high buckets.

    Returns ``[0, 0, 0]`` when the frame is empty or has fewer than
    ``low_clip`` rows; an empty bucket contributes 0.
    """
    n_pairs = len(df_join)
    if n_pairs == 0 or n_pairs < low_clip:
        return [0, 0, 0]

    if col_x:
        bucket_col, other_col = "rating_x", "rating_y"
    else:
        bucket_col, other_col = "rating_y", "rating_x"
    bucket = df_join[bucket_col]
    other = df_join[other_col]

    # (row mask, multiplier applied to col_norm) for low / medium / high.
    groups = [(bucket < 3, 2), (bucket == 3, 1), (bucket > 3, 2)]

    scores = []
    for mask, norm_factor in groups:
        selected = other[mask]
        if len(selected) == 0:
            scores.append(0)
            continue
        mean_part = np.interp(selected.mean(), [1, 5], [0, 1])
        support_part = np.interp(len(selected), [0, col_norm * norm_factor], [0, 1])
        scores.append(mean_part * support_part)

    return scores

In [13]:
def rough_means_matrix_sparse(id_list, movie_dict, col_norm=50, low_clip=0, progress=False, prog_interval=100):
    """Build the square matrix of pairwise scores for ``id_list``.

    Each unordered pair ``(i, j)`` with ``i < j`` is evaluated once through
    ``rough_means_v1(..., sym=True)``; the first result is stored at
    ``[i][j]`` and the second at ``[j][i]``. Diagonal cells stay ``[0, 0, 0]``.

    When ``progress`` is True, a line with the number of pairs processed so
    far and the elapsed time is printed every ``prog_interval`` pairs.

    Returns a list of ``len(id_list)`` rows, each a list of three-element
    scores.
    """
    size = len(id_list)

    matrix = [[[0, 0, 0]] * size for _ in range(size)]

    pairs_seen = 0
    started = time.time()
    for row in range(size - 1):
        for col in range(row + 1, size):
            if progress and pairs_seen % prog_interval == 0:
                elapsed = round(time.time() - started, 3)
                print(f"{pairs_seen} records complete... ({elapsed} s elapsed)")
            pairs_seen += 1

            # One symmetric call yields both directions of the pair.
            forward, backward = rough_means_v1(
                None, id_list[row], id_list[col], movie_dict, col_norm, low_clip, True
            )
            matrix[row][col] = forward
            matrix[col][row] = backward

    return matrix

In [15]:
def matrix_to_png(matrix, range_=(0,1), arraylike=True):
    """Convert a 2-D matrix of values into an 8-bit RGB image object.

    Parameters
    ----------
    matrix : sequence of rows. With ``arraylike=True`` each cell is a
        sequence of channel values; up to three are used as R, G, B and any
        missing channel is left at 0. With ``arraylike=False`` each cell is a
        single number that is written to all three channels (grey).
    range_ : ``(low, high)`` pair mapped linearly onto 0..255; ``np.interp``
        clamps values outside that interval.
    arraylike : selects between the two cell layouts described above.

    Returns
    -------
    The object returned by ``png.from_array(pixels, 'RGB')``, where
    ``pixels`` is a uint8 array with one row per matrix row and three
    consecutive entries per cell. Rows shorter than the widest row are padded
    with black.

    Raises
    ------
    IndexError
        If ``matrix`` has no rows.
    """
    num_rows = len(matrix)
    if num_rows == 0:
        raise IndexError("matrix_to_png() requires at least one row")

    # Size the canvas from the widest row, not only the first and last rows,
    # so ragged input cannot write past the end of a line.
    num_cols = max(len(row) for row in matrix)

    # Allocate uint8 directly; the previous np.vstack(map(...)) conversion
    # relied on passing a non-sequence iterable, which NumPy has deprecated.
    img_array = np.zeros((num_rows, num_cols * 3), dtype=np.uint8)

    def to_byte(value):
        # Truncate rather than round, matching integer-array assignment.
        return int(np.interp(value, range_, [0, 255]))

    for i, row in enumerate(matrix):
        for j, cell in enumerate(row):
            base = 3 * j
            if arraylike:
                # At most three channels per pixel: extra channels are
                # ignored instead of spilling into the next pixel, and short
                # cells no longer raise IndexError.
                for k in range(min(len(cell), 3)):
                    img_array[i, base + k] = to_byte(cell[k])
            else:
                img_array[i, base:base + 3] = to_byte(cell)

    return png.from_array(img_array, 'RGB')

In [14]:
# Pairwise score matrix for the first 100 movie ids, with progress logging.
means_matrix_100 = rough_means_matrix_sparse(
    id_list=ids_1k[:100],
    movie_dict=movies,
    progress=True,
)

0 records complete... (0.0 s elapsed)
100 records complete... (1.83 s elapsed)
200 records complete... (4.31 s elapsed)
300 records complete... (5.79 s elapsed)
400 records complete... (7.64 s elapsed)
500 records complete... (10.1 s elapsed)
600 records complete... (12.33 s elapsed)
700 records complete... (14.31 s elapsed)
800 records complete... (16.03 s elapsed)
900 records complete... (17.57 s elapsed)
1000 records complete... (19.1 s elapsed)
1100 records complete... (20.72 s elapsed)
1200 records complete... (22.39 s elapsed)
1300 records complete... (23.97 s elapsed)
1400 records complete... (26.08 s elapsed)
1500 records complete... (28.06 s elapsed)
1600 records complete... (30.11 s elapsed)
1700 records complete... (31.83 s elapsed)
1800 records complete... (33.51 s elapsed)
1900 records complete... (35.13 s elapsed)
2000 records complete... (37.19 s elapsed)
2100 records complete... (38.89 s elapsed)
2200 records complete... (40.53 s elapsed)
2300 records complete... (42.7 

In [17]:
# Render the score matrix as an RGB image and write it to sparse_output_100.png.
matrix_to_png(means_matrix_100).save("sparse_output_100.png")