In [1]:
import operator
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from scipy import spatial
from surprise import Dataset
from surprise import get_dataset_dir

# Load the built-in MovieLens 100k dataset through Surprise.
# NOTE(review): `data` is not referenced again in the visible cells; the later
# cells read the raw files under get_dataset_dir() directly — presumably this
# call is what makes those files available locally; confirm.
data = Dataset.load_builtin('ml-100k')

In [2]:
# Read the raw MovieLens ratings file (tab-separated, column names supplied
# here). Only the first three fields are kept: user, movie and rating.
u_data_path = get_dataset_dir() + '/ml-100k/ml-100k/u.data'
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    u_data_path,
    sep='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
# Per-movie rating statistics: number of ratings ('size') and mean rating.
# The aggregations are given by name rather than as np.size / np.mean:
# recent pandas versions emit a FutureWarning for NumPy callables here, and the
# string forms produce the same ('rating', 'size') / ('rating', 'mean') columns.
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieProperties.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [4]:
# Min-max scale the per-movie rating counts so the least-rated movie maps to 0
# and the most-rated movie maps to 1.
movieNumRatings = movieProperties['rating'][['size']]
movieNormalizedNumRatings = (
    (movieNumRatings - movieNumRatings.min())
    / (movieNumRatings.max() - movieNumRatings.min())
)
movieNormalizedNumRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.774914
2,0.223368
3,0.152921
4,0.357388
5,0.146048


In [5]:
# Build the per-movie feature table used by the nearest-neighbour search.
# movieDict maps movie_id -> (title, genre flag array, normalized rating count,
# mean rating).
u_item_path = get_dataset_dir() + '/ml-100k/ml-100k/u.item'

# Loop-invariant column lookups, hoisted so each movie needs only one .loc
# access per statistic.
normalizedNumRatings = movieNormalizedNumRatings['size']
meanRatings = movieProperties['rating']['mean']

movieDict = {}

with open(u_item_path, encoding="ISO-8859-1") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file) instead of
        # failing on int('').
        if not line.strip():
            continue
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        # Fields 5 onward hold the 0/1 genre flags.
        genres = np.array([int(flag) for flag in fields[5:25]])
        movieDict[movieID] = (
            name,
            genres,
            normalizedNumRatings.loc[movieID],
            meanRatings.loc[movieID],
        )

In [6]:
# Inspect one entry; each value is the tuple
# (title, genre flag array, normalized rating count, mean rating).
print(movieDict[1])

('Toy Story (1995)', array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.7749140893470791, 3.8783185840707963)


In [7]:
def ComputeDistance(a, b):
    """Return the dissimilarity between two movie feature tuples.

    Parameters
    ----------
    a, b : tuple
        Entries shaped like the values of ``movieDict``: index 1 is the genre
        flag vector and index 2 is the normalized popularity score.

    Returns
    -------
    float
        Cosine distance between the genre vectors plus the absolute difference
        of the popularity scores. Smaller means more similar.
    """
    genresA, genresB = a[1], b[1]
    hasGenresA = bool(np.any(genresA))
    hasGenresB = bool(np.any(genresB))
    if hasGenresA and hasGenresB:
        genreDistance = spatial.distance.cosine(genresA, genresB)
    elif hasGenresA or hasGenresB:
        # Cosine distance is undefined (NaN) when one vector is all zeros, and
        # a NaN would make the neighbour sort unreliable. Treat a movie with no
        # genre flags as sharing nothing with one that has some.
        genreDistance = 1.0
    else:
        # Neither movie has any genre flag set: identical genre profiles.
        genreDistance = 0.0
    popularityDistance = abs(a[2] - b[2])
    return genreDistance + popularityDistance

# Spot check: distance between the entries for movie 1 and movie 4.
ComputeDistance(movieDict[1], movieDict[4])

1.084192439862543

In [8]:
def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to ``movieID``.

    Every other entry of the module-level ``movieDict`` is scored with
    ``ComputeDistance`` and the K smallest distances win. Ties keep
    ``movieDict`` iteration order because the sort is stable.

    Parameters
    ----------
    movieID : key of ``movieDict``
        The movie to find neighbours for; it is excluded from the result.
    K : int
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Up to K movie IDs, nearest first. Fewer than K are returned when
        ``movieDict`` holds fewer than K other movies (previously this raised
        IndexError).

    Raises
    ------
    KeyError
        If ``movieID`` is not in ``movieDict``.
    """
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, features))
        for movie, features in movieDict.items()
        if movie != movieID
    ]
    distances.sort(key=operator.itemgetter(1))
    # max(K, 0) keeps a non-positive K returning an empty list rather than
    # letting a negative slice bound drop items from the end.
    return [movie for movie, _dist in distances[:max(K, 0)]]

# Demo: list the K nearest neighbours of movie 1 with their mean ratings and
# accumulate those ratings into avgRating.
K = 10

print(movieDict[1], '\n')
neighbors = getNeighbors(1, K) # Toy Story (1995)

avgRating = 0
for neighbor in neighbors:
    neighborEntry = movieDict[neighbor]
    avgRating = avgRating + neighborEntry[3]
    print (neighborEntry[0] + " " + str(neighborEntry[3]))

avgRating = avgRating / K

('Toy Story (1995)', array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.7749140893470791, 3.8783185840707963) 

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Lion King, The (1994) 3.7818181818181817


In [9]:
u_user_path = get_dataset_dir() + '/ml-100k/ml-100k/u.user'

# u.user: pipe-separated user records; column names are supplied here.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_path, sep='|', names=u_cols, encoding='latin-1')

# u.data: reloaded with all four fields. This rebinds `ratings`, which an
# earlier cell loaded with only the first three columns.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(u_data_path, sep='\t', names=r_cols, encoding='latin-1')

# u.item: the remaining fields are genre flags, so usecols keeps only the
# first five descriptive columns.
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv(
    u_item_path,
    sep='|',
    names=m_cols,
    usecols=range(5),
    encoding='latin-1',
)

# One wide frame: movies joined to ratings on their only shared column
# (movie_id), then to users on user_id.
movie_ratings = movies.merge(ratings, on='movie_id')
lens = movie_ratings.merge(users, on='user_id')

In [10]:
# Sanity-check the merged frame: the 25 titles with the most rating rows.
ratings_per_title = lens.groupby('title').size()
most_rated = ratings_per_title.sort_values(ascending=False).head(25)
most_rated

title
Star Wars (1977)                             583
Contact (1997)                               509
Fargo (1996)                                 508
Return of the Jedi (1983)                    507
Liar Liar (1997)                             485
English Patient, The (1996)                  481
Scream (1996)                                478
Toy Story (1995)                             452
Air Force One (1997)                         431
Independence Day (ID4) (1996)                429
Raiders of the Lost Ark (1981)               420
Godfather, The (1972)                        413
Pulp Fiction (1994)                          394
Twelve Monkeys (1995)                        392
Silence of the Lambs, The (1991)             390
Jerry Maguire (1996)                         384
Chasing Amy (1997)                           379
Rock, The (1996)                             378
Empire Strikes Back, The (1980)              367
Star Trek: First Contact (1996)              365
Back to the Fu

In [11]:
# Display the movies frame loaded from u.item (the five descriptive columns:
# movie_id, title, release_date, video_release_date, imdb_url).
movies

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)
...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998)
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...


In [12]:
# connect to aws postgresql db
import psycopg2
import sqlalchemy
import matplotlib as plt
from sqlalchemy import create_engine

# Postgres connection settings.
# Credentials are read from the environment so that no secret is stored in the
# notebook; a missing POSTGRES_USERNAME / POSTGRES_PASSWORD raises KeyError.
# The password that was previously committed in this cell should be rotated.
POSTGRES_ADDRESS = os.environ.get(
    'POSTGRES_ADDRESS',
    'ls-5bad198e6aa4dcbede073ad80fecaadf5ac155b2.csin2y9m3fif.us-east-1.rds.amazonaws.com',
)  ## AWS POSTGRES ENDPOINT
POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5432')  ## PORT
POSTGRES_USERNAME = os.environ['POSTGRES_USERNAME']  ## USERNAME
POSTGRES_PASSWORD = os.environ['POSTGRES_PASSWORD']  ## PASSWORD
POSTGRES_DBNAME = os.environ.get('POSTGRES_DBNAME', 'MachineLearningProject_db')  ## DB NAME

# Connection URL. Username and password are URL-escaped so characters such as
# '@', ':' or '/' in a credential cannot corrupt the URL.
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
  .format(
      username=quote_plus(POSTGRES_USERNAME),
      password=quote_plus(POSTGRES_PASSWORD),
      ipaddress=POSTGRES_ADDRESS,
      port=POSTGRES_PORT,
      dbname=POSTGRES_DBNAME))

# Create the SQLAlchemy engine used by the queries below.
cnx = create_engine(postgres_str)

In [13]:
# Pull every row of the movie_choices table into a DataFrame.
movie_choices_df = pd.read_sql_query(
    sql='SELECT * FROM "movie_choices"',
    con=cnx,
)
movie_choices_df

Unnamed: 0,movie_1,movie_2,movie_3,movie_id
0,The Seven Samurai,Bonnie and Clyde,Reservoir Dogs,1
1,Airplane!,Doctor Zhivago,The Deer Hunter,2
2,Close Encounters of the Third Kind,Up,Rocky,3
3,Memento,Braveheart,Slumdog Millionaire,4
4,The Lord of the Rings: The Return of the King,Beauty and the Beast,Seven,5
5,Panther,Mr. Jones,Legal Deceit,6
6,That Darn Cat!,Squeeze,For the Moment,7


In [14]:
# Keep only the most recent form submission (the last row), still as a
# one-row DataFrame.
last_user = pd.DataFrame(movie_choices_df.iloc[-1:])
last_user

Unnamed: 0,movie_1,movie_2,movie_3,movie_id
6,That Darn Cat!,Squeeze,For the Moment,7


In [15]:
# Flip the one-row frame so each original column (movie_1..3, movie_id)
# becomes a row; the movie_id row is dropped in the next cell.
last_user = last_user.T
last_user

Unnamed: 0,6
movie_1,That Darn Cat!
movie_2,Squeeze
movie_3,For the Moment
movie_id,7


In [16]:
# Remove the movie_id row so only the three title rows remain.
last_user = last_user.drop(index='movie_id')
last_user

Unnamed: 0,6
movie_1,That Darn Cat!
movie_2,Squeeze
movie_3,For the Moment


In [17]:
# Index label of the last row of movie_choices_df. After the transpose above,
# this label is the single column name of last_user, which the next cell
# renames.
last_submission = movie_choices_df.index[-1]
last_submission

6

In [18]:
# Give the single data column a stable name, then pull the three submitted
# titles out as plain strings.
last_user = last_user.rename(columns={last_submission: 'movie_title'})

movie_1, movie_2, movie_3 = (
    str(title) for title in last_user['movie_title'].iloc[[0, 1, 2]]
)
print(movie_1, movie_2, movie_3, sep='\n')

That Darn Cat!
Squeeze
For the Moment


In [19]:
# Show the reshaped submission: one row per pick, titles in 'movie_title'.
last_user

Unnamed: 0,movie_title
movie_1,That Darn Cat!
movie_2,Squeeze
movie_3,For the Moment


In [20]:
# Load the MovieLens title list with the release year stripped from each title
# (see the displayed movie_id / title columns), read from a path relative to
# the notebook's working directory.
movies_no_year_df = pd.read_csv('csv/ml-100k-movies-no-year-2.csv')
movies_no_year_df

Unnamed: 0,movie_id,title
0,1,Toy Story
1,2,GoldenEye
2,3,Four Rooms
3,4,Get Shorty
4,5,Copycat
...,...,...
1677,1678,Mat' i syn
1678,1679,B. Monkey
1679,1680,Sliding Doors
1680,1681,You So Crazy


In [21]:
# Spot check: rows of the year-less title list whose title equals the user's
# second pick (an empty result means the title is not in MovieLens).
title_matches = movies_no_year_df['title'].eq(movie_2)
movies_no_year_df[title_matches]

Unnamed: 0,movie_id,title
1383,1384,Squeeze


In [22]:
# Resolve each submitted title to its MovieLens movie_id (0 when not found).
# Only the 'title' column is consulted. The previous membership test scanned
# every column of the frame, so a value matching a non-title column would pass
# the check and then fail with IndexError on the empty title lookup.

movie_to_check = [movie_1, movie_2, movie_3]
movie_ids = []

for movie in movie_to_check:
    matches = movies_no_year_df.loc[movies_no_year_df['title'] == movie, 'movie_id']
    if matches.empty:
        # Sentinel for "title not present in ml-100k".
        movie_ids.append(0)
    else:
        print(movie)
        # If several movies share the title, the first match is used.
        m_id = matches.values[0]
        movie_ids.append(m_id)

movie_ids

That Darn Cat!
Squeeze
For the Moment


[878, 1384, 1357]

In [23]:
# Pair each submitted title with the movie_id resolved for it.
user_input_to_movie_data = dict(title=movie_to_check, movie_id=movie_ids)
user_input_w_movieId_df = pd.DataFrame(user_input_to_movie_data)

user_input_w_movieId_df

Unnamed: 0,title,movie_id
0,That Darn Cat!,878
1,Squeeze,1384
2,For the Moment,1357


In [24]:
# Print the K nearest neighbours for each movie the user submitted.
#
# This reuses movieDict, ComputeDistance and getNeighbors from the earlier
# cells instead of re-parsing u.item and redefining both functions on every
# iteration. Titles are paired with their ids via zip, so the "not found"
# message names the right movie (the old counter was reset to 0 on every pass
# and always reported the first title).
K = 10
SEPARATOR = '-----------------------------------------------------------------'

for submitted_title, movie_id in zip(movie_to_check, movie_ids):
    # movie_id 0 is the sentinel for "title not found in ml-100k".
    if movie_id == 0:
        print(SEPARATOR, '\n')
        print('The movie ' + submitted_title + ' is not found in the database.', '\n')
        print(SEPARATOR)
        continue

    avgRating = 0

    print(SEPARATOR, '\n')
    print(movieDict[movie_id], '\n')
    print(SEPARATOR)
    neighbors = getNeighbors(movie_id, K)
    for neighbor in neighbors:
        avgRating += movieDict[neighbor][3]
        print (movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))

    # Average over the neighbours actually returned; this equals dividing by K
    # whenever K neighbours exist, and avoids dividing by zero when none do.
    if neighbors:
        avgRating /= len(neighbors)


----------------------------------------------------------------- 

('That Darn Cat! (1997)', array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]), 0.054982817869415807, 2.515151515151515) 

-----------------------------------------------------------------
That Darn Cat! (1965) 2.473684210526316
That Darn Cat! (1997) 2.25
Flintstones, The (1994) 2.064516129032258
Shaggy Dog, The (1959) 3.0
Manhattan Murder Mystery (1993) 3.6296296296296298
Angels in the Outfield (1994) 2.923076923076923
Harriet the Spy (1996) 2.7
First Kid (1996) 2.925
Air Bud (1997) 2.558139534883721
Mouse Hunt (1997) 2.477272727272727
----------------------------------------------------------------- 

('Squeeze (1996)', array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.003436426116838488, 1.6666666666666667) 

-----------------------------------------------------------------
Frisk (1995) 2.6666666666666665
Last Time I Saw Paris, The (1954) 3.6666666666666665
Quiet Room, The (1996) 3.6666