## Recommendation System
Collaborative filtering with implicit feedback based on latent factors. Prepare data on user-item relationships for each user-company in format that ALS can use.
We require each unique assignee ID in the rows of the matrix, and each unique item ID in columns of matrix.
Values of matrix should be (?) binary user-item preference * confidence

In [8]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import Row
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.mllib.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

import pandas as pd
import numpy as np

from test_model import (get_patent_fields_list, get_ml_patents, 
                        create_title_abstract_col,trim_data, 
                        structure_dataframe, partition_dataframe, 
                        build_pipeline, process_docs, pat_inv_map, get_topics)
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary, mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel
from gensim.test.utils import common_dictionary, datapath, temporary_file
from smart_open import smart_open

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re
import os
import calendar
import requests
from bs4 import BeautifulSoup
import pickle

import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark

In [10]:
sc = spark.sparkContext
sc

### Data understanding - Acquire data

In [12]:
# load pickled dataset
with open('/Users/lee/Documents/techniche/techniche/data/raw_data_1000', 'rb') as f:
    raw_data_1000 = pickle.load(f)

In [16]:
# define desired keys/columns as criteria to subset dataset
retained_keys = ['patent_number', 'patent_date', 'patent_year',
                 'patent_firstnamed_assignee_id',
                 'patent_firstnamed_assignee_location_id',
                 'patent_firstnamed_assignee_city',
                 'patent_firstnamed_assignee_state',
                 'patent_firstnamed_assignee_country',
                 'patent_firstnamed_assignee_latitude',
                 'patent_firstnamed_assignee_longitude',
                 'assignees', 'IPCs', 'cpcs', 
                 'application_citations', 'cited_patents']

In [17]:
# subset raw dataset by desired keys/columns
data`_1000 = trim_data(data=raw_data_1000, keys=retained_keys)

In [19]:
df_1000 = pd.DataFrame(data_1000)

In [20]:
df_1000.head(5)

Unnamed: 0,IPCs,application_citations,assignees,cited_patents,cpcs,patent_date,patent_firstnamed_assignee_city,patent_firstnamed_assignee_country,patent_firstnamed_assignee_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_state,patent_number,patent_year
0,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2001/20010027335', 'ap...","[{'assignee_city': 'Louisville', 'assignee_cou...",[{'cited_patent_category': 'cited by applicant...,"[{'cpc_category': None, 'cpc_first_seen_date':...",2019-03-12,Louisville,US,org_VU2IXnxgxGIK8A8oQrwm,39.9778,39.9778|-105.1314,-105.131,CO,10226194,2019
1,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2004/20040174542', 'ap...","[{'assignee_city': 'New South Wales', 'assigne...",[{'cited_patent_category': 'cited by applicant...,"[{'cpc_category': None, 'cpc_first_seen_date':...",2019-03-12,New South Wales,AU,org_9cmRc2rH8nbl8O9VuxYL,-37.803,-37.803|145.002,145.002,,10228278,2019
2,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2015/20150344028', 'ap...","[{'assignee_city': 'Dearborn', 'assignee_count...",[{'cited_patent_category': 'cited by applicant...,"[{'cpc_category': None, 'cpc_first_seen_date':...",2019-03-12,Dearborn,US,org_8O8xQifxyiW5pZB2KuDx,42.3222,42.3222|-83.1764,-83.1764,MI,10228693,2019
3,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2006/20060080285', 'ap...","[{'assignee_city': 'Armonk', 'assignee_country...","[{'cited_patent_category': None, 'cited_patent...","[{'cpc_category': None, 'cpc_first_seen_date':...",2019-03-12,Armonk,US,org_q9Bn28RHhpYrQjKvraAH,41.1264,41.1264|-73.7144,-73.7144,NY,10228922,2019
4,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2004/20040143828', 'ap...","[{'assignee_city': 'Redmond', 'assignee_countr...",[{'cited_patent_category': 'cited by examiner'...,"[{'cpc_category': None, 'cpc_first_seen_date':...",2019-03-12,Redmond,US,org_EilEWQcC6UiqHcSGx9mb,40.4789,40.4789|-105.0403,-105.04,CA,10228931,2019


In [21]:
# TODO (Lee) - from topic_model - convert dataframe from subsetted dict, organize columns and sort by patent_date
# df_1000 = structure_dataframe(data=data_1000)

In [None]:
# partition df_1000 into train and test dataframes
# data_train_1000, data_test_1000 = partition_dataframe(df_1000, .8)

In [23]:
df = spark.read.csv(data_1000)
df = spark.read.load("data/data_1000",
                     format="csv", sep=":", inferSchema="true", header="true")

Py4JJavaError: An error occurred while calling o32.csv.
: java.lang.ClassCastException: java.util.HashMap cannot be cast to java.lang.String
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
	at scala.collection.immutable.List.flatMap(List.scala:355)
	at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:615)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
df.persist()

In [None]:
df.limit(5).toPandas()

### Data preparation
Prepare data on user-item relationships for each user-company in format that ALS can use.
We require each unique assignee ID in the rows of the matrix, and each unique item ID in columns of matrix.
Values of matrix should be (?) binary user-item preference * confidence

In [None]:
latent_ratings = 

In [None]:
# Columns are: assigneeId, itemId, (transaction x confidence)

In [None]:
#### Partition data into training and test sets
(training, test) = latent_ratings.randomSplit([0.8, 0.2])

### Model # 1

In [12]:
# set implicitPrefs to True to get better results b/c latent_ratings matrix 
# derived from another source of information (i.e. it is inferred from other signals), 

In [10]:
# build recommendation model using ALS
rank = 10
numIterations = 10
alpha=

# als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
#           userCol="userId", itemCol="movieId", ratingCol="rating")

# build recommendation model using ALS based on implicit ratings
als = ALS(maxIter=5,rank=4, regParam=0.01, userCol="asigneeId", itemCol="patentId", ratingCol="rating",
          coldStartStrategy="True")

# second example
model = ALS.trainImplicit(latent_ratings, rank, numIterations, alpha=0.01)


NameError: name 'ratings' is not defined

#### Model #1 - Evaluation - Compare to naive baseline
Compare model evaluation result with naive baseline model that only outputs (for explicit - the average rating (or you may try one that outputs the average rating per movie).

#### Model #1 - Optimize model

In [None]:
als_model =  ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

                 
params = ParamGridBuilder().addGrid(als_model.regParam, [0.01,0.001,0.1]).addGrid(als_model.rank, [4,10,50]).build()


## instantiate crossvalidator estimator
cv = CrossValidator(estimator=als_model, estimatorParamMaps=params,evaluator=evaluator,parallelism=4)
best_model = cv.fit(movie_ratings)    

In [None]:
# Getting Predictions for a New User