## Recommendation System
Collaborative filtering with implicit feedback based on latent factors. Prepare data on user-item relationships for each user-company in format that ALS can use.
We require each unique assignee ID in the rows of the matrix, and each unique item ID in columns of matrix.
Values of matrix should be (?) binary user-item preference * confidence

In [2]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import Row
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import pandas as pd
import numpy as np

from test_model import (get_patent_fields_list, get_ml_patents, 
                        create_title_abstract_col,trim_data, 
                        structure_dataframe, partition_dataframe, 
                        build_pipeline, process_docs, pat_inv_map, get_topics)

from rec_system import alphanum_to_int, int_to_alphanum

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import KMeans

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary, mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel
from gensim.test.utils import common_dictionary, datapath, temporary_file
from smart_open import smart_open

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re
import os
import calendar
import requests
from bs4 import BeautifulSoup
import pickle
import math

import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

%load_ext autoreload
%autoreload 2

In [3]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark

In [4]:
sc = spark.sparkContext
sc

### Data understanding - Acquire data

#### Data understanding - Acquire data for text workflows

In [5]:
# load pickled dataset
with open('/Users/lee/Documents/techniche/techniche/data/raw_data_1000', 'rb') as f:
    raw_data_1000 = pickle.load(f)

In [6]:
# define keys as criteria to subset dataset #1 for non-text workflows
retained_keys = ['patent_number', 'patent_firstnamed_assignee_id']

# subset raw dataset by desired keys/columns
data_1000 = trim_data(data=raw_data_1000, keys=retained_keys)

In [7]:
# define keys as criteria to subset dataset #2, for text workflows
retained_keys_2 = ['patent_number', 'patent_firstnamed_assignee_id',
                   'patent_title', 'patent_abstract']

# subset raw dataset by desired keys/columns for text analysis workflows
data_1000_2 = trim_data(data=raw_data_1000, keys=retained_keys_2)

#### Data preparation

In [8]:
# create new item in dataset #2 by concat of patent_title and patent_abstract
data_1000_2 = create_title_abstract_col(data=data_1000_2)

In [9]:
# create Pandas dataframe from dataset #1
df_1000 = pd.DataFrame(data_1000)

In [10]:
# create Pandas dataframe from dataset #2
df_1000_2 = pd.DataFrame(data_1000_2)

In [11]:
# for dataset #1: drop row that contains invalid data
df_1000[df_1000.patent_number.str.contains('[RE]')]
df_1000 = df_1000.drop(df_1000.index[[717]])

# drop NaNs in patent_firstnamed_assignee_id column
df_1000 = df_1000.dropna()

In [12]:
# for dataset#2: drop row that contains invalid data
df_1000_2[df_1000_2.patent_number.str.contains('[RE]')]
df_1000_2 = df_1000_2.drop(df_1000_2.index[[717]])

# drop NaNs in patent_firstnamed_assignee_id column
df_1000_2 = df_1000_2.dropna()

#### Data preparation - model #1
Prepare data on user-item relationships for each user-company in format that ALS can use.
We require each unique assignee ID in the rows of the matrix, and each unique item ID in columns of matrix.
Values of matrix should be (?) binary user-item preference * confidence

In [13]:
# create new rating column and assign value of 1
df_1000['rating'] = 1

In [14]:
# convert patent_number column from string to int
df_1000 = df_1000.astype({'patent_number': 'int64'})
# uncomment to confirm
# df_1000.info()

In [15]:
# convert alphanumeric patent_firstnamed_assignee_id col to int
df_1000 = df_1000.astype({'patent_number': 'int64'})

In [16]:
# df_1000['patent_firstnamed_assignee_id'] = df_1000['patent_firstnamed_assignee_id'].apply(hash).apply(abs)
df_1000['patent_firstnamed_assignee_id'] = df_1000['patent_firstnamed_assignee_id'].apply(hash).apply(abs) % 65536 # 2^16

In [17]:
# df_1000['patent_firstnamed_assignee_id'] = df_1000['patent_firstnamed_assignee_id'].apply(hash).apply(abs)
df_1000['patent_number'] = df_1000['patent_number'] % 65536 # 2^16

In [18]:
df_1000 = df_1000.astype({'patent_firstnamed_assignee_id': 'int'})

#### Data preparation - model #1 - create Spark dataframe from pandas dataframe

In [19]:
sp_df_1000 = spark.createDataFrame(df_1000)

In [20]:
# cast columns from bigint to int
sp_df_1000_2 = sp_df_1000.withColumn("patent_firstnamed_assignee_id", sp_df_1000["patent_firstnamed_assignee_id"].cast(IntegerType())).withColumn("patent_number", sp_df_1000["patent_number"].cast(IntegerType())).withColumn("rating", sp_df_1000["rating"].cast(IntegerType()))

In [21]:
# partition dataframe 
(training, test) = sp_df_1000.randomSplit([0.8, 0.2])

### Model # 1
Build the recommendation model using ALS on the training data

In [22]:
# build ALS recommendation model
als = ALS(maxIter=5,
          regParam=0.01, 
          rank=10, # number of latent topics- ME-10?
          alpha=30,
          implicitPrefs=True, # # implicitPrefs=True b/c ratings are implicit
          userCol="patent_firstnamed_assignee_id", 
          itemCol="patent_number", 
          ratingCol="rating",
          coldStartStrategy="nan") # coldStartStrategy="nan" to retain NaNs

In [23]:
# fit ALS model to the training set
model = als.fit(training)

#### Model #1 - Evaluation - Compare to naive baseline
Compare model evaluation result with naive baseline model that only outputs (for explicit - the average rating (or you may try one that outputs the average rating per movie).

#### Model #1 - Optimize model

In [None]:
# optimize model

#### Getting Predictions

In [30]:
# get predictions for test set
predictions_test = model.transform(test)
predictions_test_df = predictions_test.toPandas()

In [29]:
# get predictions for training set
predictions_train = model.transform(training)
predictions_train_df = predictions_train.toPandas()
predictions_train_df

Unnamed: 0,patent_firstnamed_assignee_id,patent_number,rating,prediction
0,59878,3997,1,0.977067
1,25713,9900,1,0.363925
2,59878,26087,1,0.977067
3,59878,32304,1,0.977067
4,26407,43256,1,0.364367
5,6623,54258,1,0.878919
6,30640,34602,1,0.364231
7,11061,2525,1,0.875135
8,30542,3986,1,0.364142
9,59749,4042,1,0.364071


In [32]:
predictions_train_df.dropna()

Unnamed: 0,patent_firstnamed_assignee_id,patent_number,rating,prediction
0,59878,3997,1,0.977067
1,25713,9900,1,0.363925
2,59878,26087,1,0.977067
3,59878,32304,1,0.977067
4,26407,43256,1,0.364367
5,6623,54258,1,0.878919
6,30640,34602,1,0.364231
7,11061,2525,1,0.875135
8,30542,3986,1,0.364142
9,59749,4042,1,0.364071


In [33]:
predictions_test_df.dropna()

Unnamed: 0,patent_firstnamed_assignee_id,patent_number,rating,prediction


### Model #2 - Data preparation

#### Model 2 - Data preparation - text data

- TF-IDF vectorization of patents - metrics - avg distance between individual patents, with ranking
- take tf-idf vector and argsort by absolute value, to see which features are most important to patent
- get top 20 features. normally would do cosine distance betweel all vectors. BUT, only do cosine distance between these top 20 features, for cold start patents

In [36]:
# instantiate TF-IDF Vectorizer using standard English stopwords
tfidf = TfidfVectorizer(stop_words='english')

In [37]:
# fit TF-IDF matrix on text column
tfidf_matrix = tfidf.fit_transform(df_1000_2['patent_title_abstract'])

In [38]:
# output matrix, 972 docs, 5364 terms
tfidf_matrix.shape

(972, 5364)

### Model 3 - compute distance metric

In [42]:
# compute cosine similarity matrix between docs using linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

(972, 972)

In [None]:
# construct reverse map of indices and pat_title_abstract
indices = pd.Series(df_1000_2.index, index=df_1000_2['patent_title_abstract']).drop_duplicates()

In [None]:
indices

In [None]:
# tfidf vec requires list, not just string
unseen_data = 'computer vision natural language processor'
unseen_data=[unseen_data]

In [25]:
unseen_tfidf = tfidf.transform(unseen_data)

NameError: name 'tfidf' is not defined

In [1]:
# take user input of straing and output most similar documents
def get_recommendations(string, cosine_sim=cosine_sim):
    # get idx of patent_number that matches text
    idx = indices[text_string]

    # calculate pairwise similarity scores of all patents with given patent
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort patents based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get scores of 10 most similar patents
    sim_scores = sim_scores[1:11]

    # get patent indices
    patent_indices = [i[0] for i in sim_scores]

    # Return top 10 most similar documents
    return df_1000_1['patent_title_abstract'].iloc[movie_indices]

NameError: name 'cosine_sim' is not defined

#### Model #2 - Apply K means clustering to distance matrix

In [None]:
km = KMeans(20)

In [None]:
kmresult = km.fit(tfidf_matrix).predict(unseen_tfidf)

In [None]:
kmresult_p = km.predict(unseen_tfidf)

In [None]:
kmresult_p