In [1]:
import math
import pandas as pd

## Reading data files 

In [2]:
file = open('hw8_ids.txt', "r")
ids = []
for line in file:
    ids.append(line.strip())

In [3]:
file = open('hw8_movies.txt', "r")
movies = []
for line in file:
    movies.append(line.strip())

In [4]:
file = open('hw8_probR_init.txt', "r")
probR = []
for line in file:
    p = line.strip().split()
    p = [float(i) for i in p]
    probR.append(p)

In [5]:
file = open('hw8_probZ_init.txt', "r")
probZ = []
for line in file:
    p = line.strip()
    p = float(p)
    probZ.append(p)

In [6]:
file = open('hw8_ratings.txt', "r")
ratings = []
for line in file:
    p = line.strip().split()
    ratings.append(p)

In [7]:
M = len(movies)
T = len(ids)
K = 4

## (a) Mean popularity rating

In [8]:
MPR = []
for i in range(M):
    seen = 0
    rec = 0
    for j in range(T):
        r = ratings[j][i]
        if(r == '0'):
            seen += 1
        if(r == '1'):
            seen += 1
            rec += 1
    MPR.append((movies[i],rec/seen))

In [9]:
MPR = sorted(MPR,key=lambda x:x[1])

### Movies sorted from least popular to most popular 

In [10]:
for i in range(M):
    print(MPR[i][0])

Chappaquidick
I_Feel_Pretty
Fast_&_Furious:_Hobbs_&_Shaw
Magic_Mike
Bridemaids
The_Last_Airbender
World_War_Z
Fifty_Shades_of_Grey
Star_Wars:_The_Force_Awakens
The_Hateful_Eight
Terminator:_Dark_Fate
Drive
Pitch_Perfect
Phantom_Thread
Prometheus
Pokemon_Detective_Pikachu
The_Farewell
Good_Boys
Fast_Five
The_Girl_with_the_Dragon_Tattoo
American_Hustle
Avengers:_Age_of_Ultron
The_Shape_of_Water
The_Revenant
Thor
The_Help
Man_of_Steel
The_Perks_of_Being_a_Wallflower
Hidden_Figures
Frozen
Jurassic_World
Three_Billboards_Outside_Ebbing
Mad_Max:_Fury_Road
Captain_America:_The_First_Avenger
Dunkirk
Manchester_by_the_Sea
Ex_Machina
Rocketman
Ready_Player_One
Once_Upon_a_Time_in_Hollywood
21_Jump_Street
Black_Swan
Iron_Man_2
Hustlers
Darkest_Hour
The_Hunger_Games
X-Men:_First_Class
Gone_Girl
Us
Avengers:_Endgame
La_La_Land
Avengers:_Infinity_War
Spiderman:_Far_From_Home
Midnight_in_Paris
Now_You_See_Me
12_Years_a_Slave
Django_Unchained
Room
The_Avengers
Toy_Story_3
Joker
Her
Les_Miserables
The_

## (e) EM Implementation

### Movies seen by each user

In [11]:
seen = []
for i in range(T):
    temp = []
    for j in range(M):
        if(ratings[i][j]!='?'):
            temp.append(j)
    seen.append(temp)

### Log-likelihood

In [12]:
def loglikelihood(probR,probZ):
    mle = 0
    for i in range(T):
        p = 0
        for j in range(K):
            p1 = probZ[j]
            for m in seen[i]:
                if(ratings[i][m]=='1'):
                    p1 *= probR[m][j]
                if(ratings[i][m]=='0'):
                    p1 *= (1-probR[m][j])
            p += p1
        mle += math.log(p)
    return mle/T

### E-Step (Inference) 

In [13]:
def eStep(probZ,probR):
    pit = []
    
    for i in range(T):
        p = []
        for j in range(K):
            p1 = probZ[j]
            for m in seen[i]:
                if(ratings[i][m]=='1'):
                    p1 *= probR[m][j]
                if(ratings[i][m]=='0'):
                    p1 *= (1-probR[m][j])
            p.append(p1)
        s = sum(p)
        p = [x/s for x in p]
        pit.append(p)

    return pit

### M-Step (Learning)

In [14]:
def mStep(probZ,probR,max_iter):
    ll = [[0,loglikelihood(probR,probZ)]]
    for i in range(max_iter):
        pit = eStep(probZ,probR)
        
        probZu = []
        probRu = []
        
        for j in range(K):
            tempp = 0
            for k in range(T):
                tempp += pit[k][j]
            probZu.append(tempp/T)
            
            temp = []
            for m in range(M):
                num = 0
                for k in range(T):
                    if(ratings[k][m] == '?'):
                        num += probR[m][j]*pit[k][j]
                    if(ratings[k][m] == '1'):
                        num += pit[k][j]
                temp.append(num/tempp)
            
            probRu.append(temp)
        probRu = [list(x) for x in zip(*probRu)]
        
        if i+1 in [1,2,4,8,16,32,64,128,256]:
            ll.append([i+1,loglikelihood(probRu,probZu)])
            
        probZ = probZu
        probR = probRu
    df = pd.DataFrame(ll,columns = ["Iteration","Log-likelihood"])
    return df,probZu,probRu

In [15]:
df,pz,pr = mStep(probZ,probR,256)

### Iteration vs Log-likelihood in EM Update

In [16]:
df

Unnamed: 0,Iteration,Log-likelihood
0,0,-27.624366
1,1,-18.476708
2,2,-16.794868
3,4,-15.551768
4,8,-14.980241
5,16,-14.680079
6,32,-14.567533
7,64,-14.554449
8,128,-14.552489
9,256,-14.55211


## (f) Personal Movie Recommendations

In [17]:
pit = eStep(pz,pr)
pid = ids.index("A59023831")

In [18]:
rm = []
for i in range(M):
    if(ratings[pid][i]=='?'):
        p = 0
        for j in range(K):
            p += pit[pid][j]*probR[i][j]
        rm.append((movies[i],p))

In [19]:
rm = list(reversed(sorted(rm,key=lambda x:x[1])))

### Sorted ratings of unseen movies 

In [20]:
rm

[('Her', 0.969595748319675),
 ('Manchester_by_the_Sea', 0.878831984411844),
 ('Three_Billboards_Outside_Ebbing', 0.807360528884841),
 ('The_Farewell', 0.797638803958567),
 ('Us', 0.770883910501888),
 ('Ex_Machina', 0.743825854075093),
 ('The_Perks_of_Being_a_Wallflower', 0.725997985350451),
 ('Man_of_Steel', 0.712988980382677),
 ('Avengers:_Infinity_War', 0.641566208946338),
 ('Room', 0.63946088087994),
 ('Pitch_Perfect', 0.617144913620724),
 ('Terminator:_Dark_Fate', 0.569494412745376),
 ('Ready_Player_One', 0.549547921508563),
 ('Chappaquidick', 0.505662165676897),
 ('Rocketman', 0.450873936413349),
 ('The_Social_Network', 0.345560727043048),
 ('Fast_Five', 0.315515631006063),
 ('Django_Unchained', 0.237026980243028),
 ('Phantom_Thread', 0.184010201627465),
 ('Prometheus', 0.165354197116933),
 ('Midnight_in_Paris', 0.137474704146238),
 ('The_Martian', 0.065961090684024),
 ('The_Girl_with_the_Dragon_Tattoo', 0.05336254511708)]