In [8]:
import pandas as pd
import numpy as np
from scipy import sparse
import csv
import os
import sys

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

sys.path.append("/Users/eric/Code/notflix/")

In [10]:
from src.data.db import common, utils, DB_HOST

## Dataset

### Dataset preparation

In [12]:
df = pd.read_json("../../../datasets/movielens/omdb.csv", lines=True)

In [13]:
movies = df[["id", "Title", "Plot", "Country", "Actors", "Director", "Production", "Genre", "Language", "Released", "imdbVotes", "imdbRating"]]

In [14]:
movies.replace("N/A", np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


### Cleaning features

In [15]:
movies["Released_year"] = movies["Released"].fillna("").str.split(" ").str[-1].replace("", 0).astype(int)
movies["Released_decade"] = pd.cut(movies["Released_year"], range(1920, 2020, 10))
movies["imdbVotes"] = movies["imdbVotes"].str.replace(",", "").fillna(0).astype(int)
movies["popularity"] = pd.cut(movies["imdbVotes"], 10)

In [19]:
movies.head()

Unnamed: 0,id,Title,Plot,Country,Actors,Director,Production,Genre,Language,Released,imdbVotes,imdbRating,Released_year,Released_decade,popularity
0,1,Toy Story,A cowboy doll is profoundly threatened and jea...,USA,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter,Buena Vista,"Animation, Adventure, Comedy, Family, Fantasy",English,22 Nov 1995,783232,8.3,1995,"(1990, 2000]","(620814.0, 827752.0]"
1,2,Jumanji,When two kids find and play a magical board ga...,USA,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Joe Johnston,Sony Pictures Home Entertainment,"Adventure, Comedy, Family, Fantasy","English, French",15 Dec 1995,266124,7.0,1995,"(1990, 2000]","(206938.0, 413876.0]"
2,3,Grumpier Old Men,John and Max resolve to save their beloved bai...,USA,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",Howard Deutch,Warner Home Video,"Comedy, Romance","English, Italian, German",22 Dec 1995,22329,6.6,1995,"(1990, 2000]","(-2069.38, 206938.0]"
3,4,Waiting to Exhale,"Based on Terry McMillan's novel, this film fol...",USA,"Whitney Houston, Angela Bassett, Loretta Devin...",Forest Whitaker,Twentieth Century Fox Home Entertainment,"Comedy, Drama, Romance",English,22 Dec 1995,8640,5.8,1995,"(1990, 2000]","(-2069.38, 206938.0]"
4,5,Father of the Bride Part II,George Banks must deal not only with the pregn...,USA,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Charles Shyer,Disney,"Comedy, Family, Romance",English,08 Dec 1995,31091,6.0,1995,"(1990, 2000]","(-2069.38, 206938.0]"


### The Model

In [20]:
country_vect = CountVectorizer()
director_vect = CountVectorizer()
genre_vect = CountVectorizer()
language_vect = CountVectorizer()
plot_vect = TfidfVectorizer(min_df=2, max_df=0.5)
title_vect = TfidfVectorizer(min_df=2, max_df=0.5)

In [21]:
X = sparse.hstack([
    country_vect.fit_transform(movies["Country"].fillna("")),
    genre_vect.fit_transform(movies["Genre"].fillna("")),
    language_vect.fit_transform(movies["Language"].fillna("")),
    director_vect.fit_transform(movies["Director"].fillna("")),
    # pd.get_dummies(movies["Released_decade"]).values,
    # plot_vect.fit_transform(movies["Plot"].fillna("")),
    # title_vect.fit_transform(movies["Title"].fillna("")),
])

In [22]:
nbrs = NearestNeighbors(n_neighbors=30, metric="cosine").fit(X)

In [23]:
%time distances, neighbors = nbrs.kneighbors(X)

CPU times: user 20.3 s, sys: 8.5 s, total: 28.8 s
Wall time: 29 s


In [36]:
%%time

to_insert = []

for index, movie_id in enumerate(movies.id):
    scores = 1. - distances[index]
    recommendations = list(map(lambda r: movies.id[r], neighbors[index]))
    
    for score, recommended_movie_id in zip(scores, recommendations):
        if movie_id == recommended_movie_id:
            continue
            
        r = common.Recommendation(
            engine_name="OneHotMultiInput",
            source_item_id=movie_id,
            recommended_item_id=recommended_movie_id,
            score=score
        )
        
        to_insert.append(r)

CPU times: user 30.9 s, sys: 403 ms, total: 31.3 s
Wall time: 31.3 s


In [38]:
utils.insert(to_insert)

OperationalError: (psycopg2.OperationalError) could not translate host name "None" to address: nodename nor servname provided, or not known

(Background on this error at: http://sqlalche.me/e/e3q8)