# Model training and optimzation
The recommendation model training and optimiation involves two parts as shown below: 
1. Data cleaning and processing <br>
    a. Access the category information from new articles in validation set <br>
    b. Get two feature TF-IDF matrixes for previous articles and new articels respectively <br>
    c. Compute the cosine similarity bewtween previous articles and new articles <br>
    d. Modelling <br>
2. Model optimization <br>
a. Optimization on category features using min_df and max_df in "TfidfVectorizer" process <br>
b. Optimization on the total number of articles recommended <br>
c. Optimization on the ratio of the number of new articles to previous articles <br>

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer 

### Load the data

In [203]:
train_df = pd.read_csv('../data/cleaned/train_df.csv')
valid_df = pd.read_csv('../data/cleaned/valid_df.csv')
test_df = pd.read_csv('../data/cleaned/test_df.csv')

### 1. Data cleaning and processing

In [256]:
train_df.head()

Unnamed: 0,contentID,headline,categories,visitID,visitorID,visitDateTime
0,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
1,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
2,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
3,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
4,www.arkansasonline.com/news/2018/jun/30/suspec...,Suspect arrested in fatal shooting in Pulaski ...,/News/Arkansas/Crime,1530422162,9.015515e+18,2018-07-01


In [204]:
train_articles_df = train_df.drop_duplicates(subset='contentID', keep='first', inplace=False)[['contentID', 'headline', 'categories']].reset_index(drop=True)
val_articles_df = valid_df.drop_duplicates(subset='contentID', keep='first', inplace=False)[['contentID', 'headline', 'categories']].reset_index(drop=True)

In [205]:
train_articles_df.head()

Unnamed: 0,contentID,headline,categories
0,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ..."
1,www.arkansasonline.com/news/2018/jun/30/suspec...,Suspect arrested in fatal shooting in Pulaski ...,/News/Arkansas/Crime
2,www.arkansasonline.com/news/2018/jun/30/police...,Police: Officer fatally shoots self at central...,/News/Arkansas/Law & Government/Public Safety/...
3,www.arkansasonline.com/news/2018/jun/30/motorc...,Motorcyclist killed in head-on crash in Little...,"/News/Arkansas, /News/Arkansas/Crime"
4,www.arkansasonline.com/news/2018/jun/30/early-...,The Recruiting Guy: Early offer gets highly re...,"None/Recruiting, /Sports/College/Razorbacks/Ra..."


In [282]:
#  get the dataframe showing the number of page visits and visitors for each article
train_article_visit_counts = train_df.groupby(['contentID']).count().sort_values('headline', ascending = False)
train_article_visit_counts['contentID'] = train_article_visit_counts.index
train_article_visit_counts = train_article_visit_counts.reset_index(drop = True)
train_article_visitor_counts = train_df.drop_duplicates(subset = ['contentID', 'visitorID']).groupby(['contentID']).count()
train_article_visitor_counts['contentID'] = train_article_visitor_counts.index
train_article_visitor_counts = train_article_visitor_counts.reset_index(drop = True)

train_article_counts_df = pd.merge(train_article_visit_counts, train_article_visitor_counts, on = 'contentID', how = 'inner')[['contentID', 'headline_x','headline_y']]
train_article_counts_df.columns = ['read_article', 'visits', 'visitor']
train_article_counts_df

Unnamed: 0,read_article,visits,visitor
0,www.arkansasonline.com/news/2018/aug/29/food-n...,910,393
1,www.arkansasonline.com/news/2018/aug/31/former...,734,296
2,www.arkansasonline.com/news/2018/aug/22/arkans...,656,316
3,www.arkansasonline.com/news/2018/jul/19/sherif...,594,243
4,www.arkansasonline.com/news/2018/aug/16/little...,583,268
...,...,...,...
8524,www.arkansasonline.com/news/2018/aug/12/look-f...,1,1
8525,www.arkansasonline.com/news/2018/aug/12/major-...,1,1
8526,www.arkansasonline.com/news/2018/aug/12/mercy-...,1,1
8527,www.arkansasonline.com/news/2018/aug/12/new-le...,1,1


In [206]:
val_articles_df.head()

Unnamed: 0,contentID,headline,categories
0,www.arkansasonline.com/news/2018/sep/03/trump-...,Trump attacks union leader on Labor Day,"/News/National, /News/Politics/National"
1,www.arkansasonline.com/news/2018/sep/03/injuri...,Injuries force Merrick into football retirement,/Sports/College/Razorbacks/Razobacks-College-F...
2,www.arkansasonline.com/news/2018/sep/03/galler...,GALLERY: 33rd annual National Championship Chu...,/News/Arkansas
3,www.arkansasonline.com/news/2018/sep/03/33-yea...,33-year-old Arkansan dies in wreck involving P...,"/News/Arkansas, /News/Fatalwrecks"
4,www.arkansasonline.com/news/2018/sep/03/arkans...,Arkansas Cinema Society supports local filmmak...,"None/Lr, /News/Arkansas, /News/Arkansas/Entert..."


In [207]:
train_visitors = train_df['visitorID'].unique().tolist() # 2884
val_visitors = valid_df['visitorID'].unique().tolist() # 1868

In [209]:
# the page visits info in training set
# which we could refer to in order to find similar new articles for former visitors
train_visits = train_df.drop_duplicates(subset = ['contentID', 'visitorID'])[['contentID', 'visitorID']]
train_visits = train_visits.sort_values('visitorID').reset_index(drop = True)
train_visits

Unnamed: 0,contentID,visitorID
0,www.arkansasonline.com/news/2018/aug/08/sherif...,2.011047e+14
1,www.arkansasonline.com/news/2011/nov/15/walton...,2.011047e+14
2,www.arkansasonline.com/news/2018/jul/17/author...,2.011047e+14
3,www.arkansasonline.com/news/2018/aug/09/arkans...,2.011047e+14
4,www.arkansasonline.com/news/2018/aug/09/top-6-...,2.011047e+14
...,...,...
103036,www.arkansasonline.com/news/2018/aug/24/channe...,1.805825e+19
103037,www.arkansasonline.com/news/2018/sep/03/kanis-...,1.805825e+19
103038,www.arkansasonline.com/news/2018/sep/03/school...,1.805825e+19
103039,www.arkansasonline.com/news/2018/sep/03/arkans...,1.805825e+19



#### a. Access the category information from new articles in validation set

In [210]:
# the page visits info in validation set
# which could be used to evaluate the precision and recall of our recommendation system
valid_visits = valid_df.drop_duplicates(subset = ['contentID', 'visitorID']).sort_values('visitorID').reset_index(drop = True)[['contentID', 'visitorID']]
valid_visits

Unnamed: 0,contentID,visitorID
0,www.arkansasonline.com/news/2018/sep/12/bigger...,2.011047e+14
1,www.arkansasonline.com/news/2018/sep/12/morris...,2.011047e+14
2,www.arkansasonline.com/news/2018/sep/12/video-...,2.011047e+14
3,www.arkansasonline.com/news/2018/sep/06/jury-t...,4.259291e+15
4,www.arkansasonline.com/news/2018/sep/07/confro...,4.259291e+15
...,...,...
25188,www.arkansasonline.com/news/2018/sep/12/fronti...,1.805825e+19
25189,www.arkansasonline.com/news/2018/sep/13/board-...,1.805825e+19
25190,www.arkansasonline.com/news/2018/sep/15/key-pi...,1.805825e+19
25191,www.arkansasonline.com/news/2018/sep/05/judge-...,1.805825e+19


In [300]:
valid_visits.groupby('visitorID').count().describe()

Unnamed: 0,contentID
count,1868.0
mean,13.486617
std,23.609784
min,1.0
25%,2.0
50%,7.0
75%,15.0
max,356.0


In validation set, I will use the contentID to decide if an article is a new article or not

In [211]:
# info about new articles in validation set
# select the new articles using the date info in contentID
val_article_split = val_articles_df['contentID'].str.split("/", n = 5, expand = True)
val_articles_df['year'] = [int(x) for x in val_article_split[2]]
val_articles_df['month'] = val_article_split[3]
val_articles_df['day'] = [int(x) for x in val_article_split[4]]
# select articles released during the validation period
val_new_articles_df = val_articles_df[(val_articles_df['year'] == 2018) & (val_articles_df['month'] == 'sep') & (val_articles_df['day'] < 18) & (val_articles_df['day'] >3)][['contentID','headline', 'categories']].reset_index(drop = True)
val_new_articles_df.head()

Unnamed: 0,contentID,headline,categories
0,www.arkansasonline.com/news/2018/sep/04/homele...,Homelessness battle fought by Samaritan's open...,/News/Arkansas
1,www.arkansasonline.com/news/2018/sep/04/south-...,"South LR site for shopping, soccer on table",/News/Arkansas
2,www.arkansasonline.com/news/2018/sep/04/herita...,Heritage teacher recognized for physics instru...,/News/Arkansas
3,www.arkansasonline.com/news/2018/sep/04/traffi...,Traffic stop nets stolen police gun,"/News/Arkansas, /News/Arkansas/Crime"
4,www.arkansasonline.com/news/2018/sep/04/letter...,Letters,"/Editorial, /Editorial/Letters"


In [225]:
len(val_new_articles_df)

1399

In [212]:
# compare two lists
def notintersection(lst1, lst2): 
    lst3 = [value for value in lst2 if value not in lst1] 
    return lst3 

def intersection(lst1, lst2): 
    lst3 = [value for value in lst2 if value in lst1] 
    return lst3 

In [217]:
# validation visits about new visitors
val_new_visitors = notintersection(train_visitors, val_visitors)
val_new_visitors_df = valid_visits[valid_visits['visitorID'].isin(val_new_visitors)].reset_index(drop= True)
val_new_visitors_df
# val_new_visitors_df = valid_df[valid_df['visitorID'].isin(val_new_visitors)].drop_duplicates(subset = ['contentID', ])
# val_new_visitors_df = val_new_visitors_df.reset_index(drop = True)
# val_new_visitors_df.head()# 2702 rows × 6 columns

Unnamed: 0,contentID,visitorID
0,www.arkansasonline.com/news/2018/sep/11/pedest...,1.284325e+16
1,www.arkansasonline.com/news/2018/sep/10/health...,1.284325e+16
2,www.arkansasonline.com/news/2018/sep/10/wwi-ca...,1.284325e+16
3,www.arkansasonline.com/news/2018/sep/13/2-cent...,1.284325e+16
4,www.arkansasonline.com/news/2018/sep/12/jonesb...,1.284325e+16
...,...,...
1483,www.arkansasonline.com/news/2018/sep/16/picked...,1.437823e+19
1484,www.arkansasonline.com/news/2018/sep/16/red-wo...,1.437823e+19
1485,www.arkansasonline.com/news/2018/sep/13/asu-mi...,1.437823e+19
1486,www.arkansasonline.com/news/2018/sep/17/tyson-...,1.437823e+19


In [218]:
# validation information about former visitors
val_former_visitors = intersection(train_visitors, val_visitors)
val_former_visitors_df = valid_visits[valid_visits['visitorID'].isin(val_former_visitors)].reset_index(drop= True)
val_former_visitors_df 
# val_former_visitors_df = valid_df[valid_df['visitorID'].isin(val_former_visitors)]
# val_former_visitors_df = val_former_visitors_df.reset_index(drop = True)
# val_former_visitors_df.head() # 50122 rows × 6 columns

Unnamed: 0,contentID,visitorID
0,www.arkansasonline.com/news/2018/sep/12/bigger...,2.011047e+14
1,www.arkansasonline.com/news/2018/sep/12/morris...,2.011047e+14
2,www.arkansasonline.com/news/2018/sep/12/video-...,2.011047e+14
3,www.arkansasonline.com/news/2018/sep/06/jury-t...,4.259291e+15
4,www.arkansasonline.com/news/2018/sep/07/confro...,4.259291e+15
...,...,...
23700,www.arkansasonline.com/news/2018/sep/12/fronti...,1.805825e+19
23701,www.arkansasonline.com/news/2018/sep/13/board-...,1.805825e+19
23702,www.arkansasonline.com/news/2018/sep/15/key-pi...,1.805825e+19
23703,www.arkansasonline.com/news/2018/sep/05/judge-...,1.805825e+19


#### b. Get two feature TF-IDF matrixes for previous articles and new articels respectively

In [130]:
# data cleaning on category
def clean_category(text):
    text = re.sub(r'&', "", text) # remove "&"
    text = re.sub(r'-', "", text) # remove "-"
    text = re.sub(r"'", " ", text) # remove '
    text = re.sub(r' ', "", text) # remove space
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text) # remove all the punctuations with space
    return text

In [182]:
train_articles_df.head()

Unnamed: 0,contentID,headline,categories
0,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ..."
1,www.arkansasonline.com/news/2018/jun/30/suspec...,Suspect arrested in fatal shooting in Pulaski ...,/News/Arkansas/Crime
2,www.arkansasonline.com/news/2018/jun/30/police...,Police: Officer fatally shoots self at central...,/News/Arkansas/Law & Government/Public Safety/...
3,www.arkansasonline.com/news/2018/jun/30/motorc...,Motorcyclist killed in head-on crash in Little...,"/News/Arkansas, /News/Arkansas/Crime"
4,www.arkansasonline.com/news/2018/jun/30/early-...,The Recruiting Guy: Early offer gets highly re...,"None/Recruiting, /Sports/College/Razorbacks/Ra..."


In [219]:
# clean the category columns for previous and new articles
train_articles_df['categories'] = train_articles_df['categories'].apply(lambda x: clean_category(x))
val_new_articles_df['categories'] = val_new_articles_df['categories'].apply(lambda x: clean_category(x))
train_articles_df['categories']

0        News Arkansas  News Politics Arkansas LawGove...
1                                     News Arkansas Crime
2        News Arkansas LawGovernment PublicSafety Crim...
3                      News Arkansas  News Arkansas Crime
4       None Recruiting  Sports College Razorbacks Raz...
                              ...                        
8524     News World Health HealthConditions Infectious...
8525                                           News World
8526                         Entertainment Features Style
8527                         Entertainment Features Style
8528     Sports College Razorbacks RazobacksCollegeFoo...
Name: categories, Length: 8529, dtype: object

In [220]:
# convert into TF-IDF featuer matrixes
tfidf = TfidfVectorizer() # min_df = 2, max_df = 0.4 norm = 'l2'
train_tfidf = tfidf.fit_transform(train_articles_df['categories'])
val_tfidf = tfidf.transform(val_new_articles_df['categories'])

In [500]:
# define a function to create the article-feature dataframe for previous articles and new articles
def feature_df(tfidf, articleIDlist, train_features):
    '''Convert the input TF-IDF matrix into TF-IDF data frame whose index is article ID and column names are feature names '''
    df = pd.DataFrame(tfidf.toarray())
    df['contentID'] = articleIDlist
    df = df.set_index('contentID')
    df.columns = train_features
    return df

In [226]:
previous_articleID = train_articles_df['contentID'].tolist()
val_new_articleID = val_new_articles_df['contentID'].tolist()

train_features = tfidf.get_feature_names()

In [471]:
len(previous_articleID)

8529

In [226]:
train_tfidf_df = feature_df(train_tfidf, previous_articleID, train_features)
val_tfidf_df = feature_df(val_tfidf, val_new_articleID, train_features)
val_tfidf_df.head() # 1399*423

Unnamed: 0_level_0,2010election,accountingauditing,actingtheater,activestyle,activities,acxiom,adgbreaking,adghighschool,adgpolitics,adgsports,...,world,worldbusiness,wrestling,zballot18,zcongress18,zeditorial2018,zgovernor18,zoosaquariumspreserves,zstatehouse18,zsupremecourt
contentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
www.arkansasonline.com/news/2018/sep/04/homelessness-battle-fought-by-samaritan/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/04/south-lr-site-for-shopping-soccer-on-ta/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/04/heritage-teacher-recognized-for-physics/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/04/traffic-stop-nets-stolen-police-gun-201/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/04/letters-20180904/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### c. Compute the cosine similarity bewtween previous articles and new articles

In [231]:
# create similarity matrix between all the articles and test articles
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
# the similarity dataframe between previous articles and new articles
# the index names stand for new articles and column names stand for previous articles
similarity_df = pd.DataFrame(1- pairwise_distances(np.array(val_tfidf_df), np.array(train_tfidf_df), metric = 'cosine'))
similarity_df.index = val_tfidf_df.index
similarity_df.columns = train_tfidf_df.index

similarity_df # 1399*8529

contentID,www.arkansasonline.com/news/2018/jun/30/arrested-lawmaker-urged-to-quit-post-20/,www.arkansasonline.com/news/2018/jun/30/suspect-arrested-fatal-shooting-pulaski-county/,www.arkansasonline.com/news/2018/jun/30/police-officer-fatally-shoots-self-central-arkansa/,www.arkansasonline.com/news/2018/jun/30/motorcyclist-killed-head-crash-little-rock-suv-dri/,www.arkansasonline.com/news/2018/jun/30/early-offer-gets-highly-recruited-receiver-campus-/,www.arkansasonline.com/news/2018/may/03/1-killed-1-injured-killed-when-car-runs-curve-sout/,www.arkansasonline.com/news/2018/apr/24/facebook-profile-lands-arkansas-sex-offender-jail-/,www.arkansasonline.com/news/2018/jun/30/3-motorcyclists-among-highway-deaths-20/,www.arkansasonline.com/news/2018/jun/30/cupcakes-idea-takes-boot-camp-top-prize/,www.arkansasonline.com/news/2018/jun/30/steel-mill-to-expand-add-500-new-worker/,...,www.arkansasonline.com/news/2018/sep/03/asu-coach-confirms-junior-wr-will-miss-rest-season/,www.arkansasonline.com/news/2018/sep/03/tropical-storm-gordon-brings-hurricane-watch-gulf-/,www.arkansasonline.com/news/2018/sep/03/man-gets-25-year-sentence-fatal-shooting-arkansas-/,www.arkansasonline.com/news/2018/sep/03/arkansas-man-accused-fatally-shooting-his-uncle-gr/,www.arkansasonline.com/news/2018/sep/02/couple-named-arkansas-foster-parents-year/,www.arkansasonline.com/news/2018/sep/03/ebola-survivors-face-stigma-in-congo-20/,www.arkansasonline.com/news/2018/sep/03/s-korea-security-officials-to-visit-nor/,www.arkansasonline.com/news/2016/aug/29/rock-skipping-champ-coming-to-local-con/,www.arkansasonline.com/news/2017/aug/28/run-or-skip-but-hop-to-it-to-get-regist/,www.arkansasonline.com/news/2018/sep/03/injuries-force-merrick-football-retirement/
contentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
www.arkansasonline.com/news/2018/sep/04/homelessness-battle-fought-by-samaritan/,0.373725,0.621556,0.169838,0.846027,0.0,0.459226,0.319208,0.561965,1.000000,0.274287,...,0.0,0.143809,0.735318,0.846027,0.000000,0.025816,0.221102,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/04/south-lr-site-for-shopping-soccer-on-ta/,0.373725,0.621556,0.169838,0.846027,0.0,0.459226,0.319208,0.561965,1.000000,0.274287,...,0.0,0.143809,0.735318,0.846027,0.000000,0.025816,0.221102,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/04/heritage-teacher-recognized-for-physics/,0.373725,0.621556,0.169838,0.846027,0.0,0.459226,0.319208,0.561965,1.000000,0.274287,...,0.0,0.143809,0.735318,0.846027,0.000000,0.025816,0.221102,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/04/traffic-stop-nets-stolen-police-gun-201/,0.316182,0.943499,0.143688,1.000000,0.0,0.388517,0.377302,0.475438,0.846027,0.232054,...,0.0,0.121667,0.786795,1.000000,0.000000,0.021841,0.187059,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/04/letters-20180904/,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
www.arkansasonline.com/news/2018/sep/17/comedian-john-mulaney-perform-central-arkansas/,0.200182,0.332930,0.090972,0.453166,0.0,0.245980,0.170981,0.301011,0.535640,0.146919,...,0.0,0.077030,0.393866,0.453166,0.000000,0.013828,0.118431,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/17/for-ex-captive-today-in-charleston-case/,0.156372,0.187008,0.051099,0.254546,0.0,0.192146,0.096041,0.235134,0.300872,0.052838,...,0.0,0.477976,0.221236,0.254546,0.000000,0.017727,0.151827,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/16/goodbye-my-dear-friend-fred/,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.746774,0.000000,0.000000,0.0,0.0,0.0
www.arkansasonline.com/news/2018/sep/16/uca-professorartist-displays-work-downtown-gallery/,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.720358,0.000000,0.000000,0.0,0.0,0.0


#### d. Modelling

In [502]:
# find top recommendation for former visitors
def fv(n, similarity_df):
    '''Takes an integer n and gives the top-n article recommendation from new articles for former visitors'''
    blank_df = pd.DataFrame({'visitorID':[], 'read_article':[],'contentID': [], 'similarity':[]})
    fv_df = blank_df
    i = 0
    # if the visitor is not a new visitor
    for visitor in val_former_visitors:
        # filtered out the artilces visitors have already read
        read_articles = list(train_visits.loc[train_visits['visitorID'] == visitor]['contentID'])
        i += 1
        print(i)
        # calculate the similarity between unread articles and read articles
        new_articles = blank_df
        for article in read_articles:
            # select similar articles for one article that former visitor had read: 
            one_article = pd.DataFrame({'visitorID': visitor, 'read_article': article,'contentID': similarity_df.index, 'similarity': similarity_df[article]})
            one_article.index = similarity_df.index
            one_article = one_article[one_article['similarity'] != 0]
            # combine all the similar articles for former visitor: 
            new_articles = new_articles.append(one_article) 

        # similar articles results for one former visitor
        new_articles = new_articles.drop_duplicates(subset = ['read_article', 'contentID']).sort_values('similarity', ascending=False)
    #     print(f"find {len(new_articles)} new articles combinations")
        new_articles = pd.merge(new_articles, train_article_counts_df, on = 'read_article', how = 'left')
        new_articles = new_articles.sort_values(['similarity', 'visitor'], ascending = False).head(n)
        new_articles = new_articles[['visitorID', 'contentID']]
    #     print(new_articles_5)

        # combine the similarity results for all visitors
        fv_df = fv_df.append(new_articles)
        fv_df = fv_df.reset_index(drop = True)
        
    return fv_df

In [429]:
# find top recommendation for new visitors
def nv(n, fv_df):
    new_article_top = fv_df.groupby('contentID').count().sort_values('visitorID', ascending = False)
    new_article_top = new_article_top.head(n).index.tolist()
    nv_df = pd.DataFrame({'visitorID':[], 'contentID':[]})
    i = 0
    for visitor in val_new_visitors:
        print(visitor)
        i += 1
        print(i)
        one_new_visitor_df = pd.DataFrame({'visitorID': visitor, 'contentID': new_article_top})
        nv_df = nv_df.append(one_new_visitor_df)
    
    return nv_df

In [430]:
# compare recommendation with actural condition
def precision_recall(n, fv_nv_df):
    prediction_count = len(fv_nv_df)
    actual_count = len(valid_visits)
    TP = pd.merge(valid_visits, fv_nv_df, on=['visitorID','contentID'], how = 'inner')
    TP_count = len(TP)
    precision = TP_count/prediction_count
    recall = TP_count/actual_count
    print(f"Number of true positive cases: {TP_count}")
    print(f"Top-{n} recommendation precision: {round(precision*100)}%")
    print(f"Top-{n} recommendation recall: {round(recall*100)}%")
    return [precision, recall]
    

In [457]:
# compare recommendation with actural condition
def avg_precision_recall(n, fv_nv_df):
    recommendation = fv_nv_df[['visitorID', 'contentID']]
    recommendation['predicted'] = recommendation['contentID']
    actual = valid_visits
    actual['actual'] = actual['contentID']

    results = pd.merge(actual, recommendation, on = ['visitorID','contentID'], how = 'left')
    results['match'] = np.where(results['predicted'] == results['actual'], 1, 0)

    val_visitor_counts = results.groupby('visitorID').count()['contentID']
    val_visitor_counts.columns = ['counts']

    val_TP = results.groupby('visitorID').sum()
    val_TP.columns = ['TP']

    results_calculation = pd.merge(val_visitor_counts, val_TP, left_index=True, right_index=True)
    results_calculation['precision'] = results_calculation['TP']/n
    results_calculation['recall'] = results_calculation['TP']/results_calculation['contentID']

    avg_precision = results_calculation['precision'].mean()
    avg_recall = results_calculation['recall'].mean()

    print(f"Top-{n} recommendation average precision: {round(avg_precision*100,2)}%")
    print(f"Top-{n} recommendation average recall: {round(avg_recall*100,2)}%")
    return [avg_precision, avg_recall]
    

### 2. Model optimization
#### a. Optimization on category features using min_df and max_df in "TfidfVectorizer" process

In [499]:
# convert into TF-IDF featuer matrixes
def tfidf(tfidf):
    train_tfidf = tfidf.fit_transform(train_articles_df['categories'])
    val_tfidf = tfidf.transform(val_new_articles_df['categories'])
    
    previous_articleID = train_articles_df['contentID'].tolist()
    val_new_articleID = val_new_articles_df['contentID'].tolist()

    train_features = tfidf.get_feature_names()

    train_tfidf_df = feature_df(train_tfidf, previous_articleID, train_features) 
    val_tfidf_df = feature_df(val_tfidf, val_new_articleID, train_features)
    val_tfidf_df.head() # 1399*423

    similarity_df = pd.DataFrame(1- pairwise_distances(np.array(val_tfidf_df), np.array(train_tfidf_df), metric = 'cosine'))
    similarity_df.index = val_tfidf_df.index
    similarity_df.columns = train_tfidf_df.index

    return similarity_df

In [501]:
tfidf1 = TfidfVectorizer(min_df = 2) # min_df = 2, max_df = 0.4 norm = 'l2'
similarity_df1 = tfidf(tfidf1)

In [503]:
fv_df_52 = fv(5, similarity_df1)
nv_df_52 = nv(5, fv_df_52)
fv_nv_df_52 = fv_df_52.append(nv_df_52).reset_index(drop = True)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [504]:
precision_recall(5, fv_nv_df_52)
avg_precision_recall(5, fv_nv_df_52)

Number of true positive cases: 238
Top-5 recommendation precision: 3%
Top-5 recommendation recall: 1%
Top-5 recommendation average precision: 2.55%
Top-5 recommendation average recall: 0.97%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendation['predicted'] = recommendation['contentID']


[0.025481798715203462, 0.009661479962663847]

In [505]:
tfidf2 = TfidfVectorizer(min_df = 2, max_df = 0.4) # min_df = 2, max_df = 0.4 norm = 'l2'
similarity_df2 = tfidf(tfidf2)
fv_df_53 = fv(5, similarity_df2)
nv_df_53 = nv(5, fv_df_52)
fv_nv_df_53 = fv_df_53.append(nv_df_53).reset_index(drop = True)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [506]:
precision_recall(5, fv_nv_df_52)
avg_precision_recall(5, fv_nv_df_52)

Number of true positive cases: 238
Top-5 recommendation precision: 3%
Top-5 recommendation recall: 1%
Top-5 recommendation average precision: 2.55%
Top-5 recommendation average recall: 0.97%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendation['predicted'] = recommendation['contentID']


[0.025481798715203462, 0.009661479962663847]

In [184]:
# # define a function to create the article-feature dataframe for previous articles and new articles
# def feature_df(tfidf, articleIDlist):
#     '''Convert the input TF-IDF matrix into TF-IDF data frame whose index is article ID and column names are feature names '''
#     df = pd.DataFrame(tfidf.toarray())
#     df['contentID'] = articleIDlist
#     df = df.set_index('contentID')
#     df.columns = train_features
#     return df

# previous_articleID = train_articles_df['contentID'].tolist()
# val_new_articleID = val_new_articles_df['contentID'].tolist()

# train_features = tfidf.get_feature_names()

# train_tfidf_df = feature_df(train_tfidf, previous_articleID)
# val_tfidf_df = feature_df(val_tfidf, val_new_articleID)
# val_tfidf_df.head() # 1399*423

In [184]:
# # create similarity matrix between all the articles and test articles
# from sklearn.metrics import pairwise_distances
# from scipy.spatial.distance import cosine
# # the similarity dataframe between previous articles and new articles
# # the index names stand for new articles and column names stand for previous articles
# similarity_df = pd.DataFrame(1- pairwise_distances(np.array(val_tfidf_df), np.array(train_tfidf_df), metric = 'cosine'))
# similarity_df.index = val_tfidf_df.index
# similarity_df.columns = train_tfidf_df.index

# similarity_df # 1399*8529

#### b. Optimization on the total number of articles recommended

In [435]:
fv_df_5 = fv(5, similarity_df)
nv_df_5 = nv(5, fv_df_5)
fv_nv_df_5 = fv_df_5.append(nv_df_5).reset_index(drop = True)

2.834563625215906e+18
1
6.230708417889792e+18
2
2.4829623562526406e+17
3
1.6839777067334917e+18
4
2.6453676831009326e+18
5
1.361027540810554e+18
6
8.438123919282267e+18
7
4.5304035147780736e+18
8
1.4734425588224748e+18
9
6.161510877835587e+17
10
6.325111013049222e+18
11
2.613663492947233e+18
12
2.020500085248086e+18
13
3.6888380750876646e+18
14
2.230066808222616e+18
15
2.564616941313766e+17
16
6.81649994798405e+18
17
8.533229282788286e+18
18
2.598541488623584e+18
19
2.468646846980868e+18
20
5.365529411112139e+18
21
7.351395145577677e+17
22
5.94618341740155e+18
23
1.891725523791952e+18
24
1.4374585840777247e+18
25
1.767471548828082e+18
26
7.315303856167512e+18
27
1.9597127817756849e+18
28
6.686158200342512e+18
29
5.74143543297127e+18
30
7.566940098126559e+17
31
6.890397579617532e+17
32
4.3580138353897513e+18
33
8.497321260237338e+18
34
4.661488480732943e+18
35
6.217569683425906e+18
36
4.123890310524557e+18
37
4.225843526882248e+18
38
5.03696745048334e+17
39
4.670188602561e+18
40
5.30035

In [458]:
precision_recall(5, fv_nv_df_5)
avg_precision_recall(5, fv_nv_df_5)

Number of true positive cases: 240
Top-5 recommendation precision: 3%
Top-5 recommendation recall: 1%
Top-5 recommendation average precision: 2.57%
Top-5 recommendation average recall: 0.94%


[0.025695931477516098, 0.009415484970536505]

In [423]:
fv_df_10 = fv(10, similarity_df)
nv_df_10 = nv(fv_df_10, 10)
fv_nv_df_10 = fv_df_10.append(nv_df_10).reset_index(drop = True)

In [459]:
precision_recall(10, fv_nv_df_10)
avg_precision_recall(10, fv_nv_df_10)

Number of true positive cases: 414
Top-10 recommendation precision: 2%
Top-10 recommendation recall: 2%
Top-10 recommendation average precision: 2.22%
Top-10 recommendation average recall: 1.56%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendation['predicted'] = recommendation['contentID']


[0.022162740899357693, 0.015603246344275597]

In [450]:
fv_df_20 = fv(20, similarity_df)
nv_df_20 = nv(20, fv_df_20)
fv_nv_df_20 = fv_df_20.append(nv_df_20).reset_index(drop = True)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [460]:
precision_recall(20, fv_nv_df_20)
avg_precision_recall(20, fv_nv_df_20)

Number of true positive cases: 740
Top-20 recommendation precision: 2%
Top-20 recommendation recall: 3%
Top-20 recommendation average precision: 1.98%
Top-20 recommendation average recall: 2.77%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendation['predicted'] = recommendation['contentID']


[0.01980728051391865, 0.027715408715392666]

In [454]:
fv_df_40 = fv(40, similarity_df)
nv_df_40 = nv(40, fv_df_40)
fv_nv_df_40 = fv_df_40.append(nv_df_40).reset_index(drop = True)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [461]:
precision_recall(40, fv_nv_df_40)
avg_precision_recall(40, fv_nv_df_40)

Number of true positive cases: 1280
Top-40 recommendation precision: 2%
Top-40 recommendation recall: 5%
Top-40 recommendation average precision: 1.71%
Top-40 recommendation average recall: 4.9%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendation['predicted'] = recommendation['contentID']


[0.017130620985010645, 0.04897415553577482]

In [456]:
fv_df_80 = fv(80, similarity_df)
nv_df_80 = nv(80, fv_df_80)
fv_nv_df_80 = fv_df_80.append(nv_df_80).reset_index(drop = True)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [462]:
precision_recall(80, fv_nv_df_80)
avg_precision_recall(80, fv_nv_df_80)

Number of true positive cases: 2340
Top-80 recommendation precision: 2%
Top-80 recommendation recall: 9%
Top-80 recommendation average precision: 1.57%
Top-80 recommendation average recall: 9.05%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendation['predicted'] = recommendation['contentID']


[0.015658458244111228, 0.09046944161418334]

We can see that, the top-10 recommendation of new articles to former visitors and new visitors does not perform well. As a result, we need to optimze the recommendation further

In [440]:
import random
randomlist = random.sample(range(0, 1399), 10)
len(val_new_articleID)

1399

In [445]:
nv_df_10['contentID'].unique()

array(['www.arkansasonline.com/news/2018/sep/11/can-t-verify-complaints-police-tell-sla/',
       'www.arkansasonline.com/news/2018/sep/16/15-800-names-on-sex-offender-rolls-2018/',
       'www.arkansasonline.com/news/2018/sep/07/drive-shooter-targets-little-rock-home-5-minors-in/',
       'www.arkansasonline.com/news/2018/sep/17/2-hurt-separate-little-rock-shootings-police-say/',
       'www.arkansasonline.com/news/2018/sep/13/little-rock-man-robbed-50-during-ride-arkansas-chi/',
       'www.arkansasonline.com/news/2018/sep/12/man-wearing-razorback-shirt-steals-cash-little-roc/',
       'www.arkansasonline.com/news/2018/sep/13/stray-bullet-travels-just-above-2-year-old-asleep-/',
       'www.arkansasonline.com/news/2018/sep/17/2-separate-armed-robberies-reported-little-rock-le/',
       'www.arkansasonline.com/news/2018/sep/04/56-year-old-little-rock-woman-identified-citys-25t/',
       'www.arkansasonline.com/news/2018/sep/04/homeless-man-attacked-kitchen-knives-near-little-r/'],
   

In [449]:
valid_df.groupby(['headline']).count().sort_values('visitorID', ascending = False).head(20)

Unnamed: 0_level_0,contentID,categories,visitID,visitorID,visitDateTime
headline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
28 Arkansas high school football players suspended after in-game brawl; sheriff says 2 fans arrested,695,695,695,695,695
"Dallas officer mistakes apartment for own, kills man, police say; 26-year-old graduated from Arkansas university in 2016",624,624,624,624,624
Police identify 2 killed in Hot Springs when stolen vehicle that had been chased by officers struck their car,599,599,599,599,599
LR attorney admits stealing funds from trust,556,556,556,556,556
Judge says he will consider $1M grant to firm run by lobbyist in sentencing of ex-Arkansas lawmaker,493,493,493,493,493
Little Rock airport to offer new nonstop flight,482,482,482,482,482
Diocese of Little Rock says 12 priests who served in state accused of sexual abuse,437,437,437,437,437
Mother's call for help ends with son's death minutes after Little Rock officers handcuffed him,430,430,430,430,430
LR Diocese releases list of clergy implicated in sex abuse,384,384,384,384,384
Floridian told to take down online jabs at pathologist,364,364,364,364,364


#### c. Optimization on the ratio of the number of new articles to previous articles

In [184]:
t_attributes_df = attributes_df.iloc[:,1:].transpose()
attributes_count = list(t_attributes_df.sum())
attributes_count

[3,
 6,
 3,
 6,
 3,
 3,
 2,
 3,
 6,
 3,
 2,
 4,
 5,
 17,
 2,
 2,
 13,
 4,
 3,
 6,
 4,
 7,
 11,
 6,
 3,
 7,
 2,
 5,
 4,
 4,
 3,
 5,
 3,
 3,
 5,
 6,
 2,
 2,
 6,
 5,
 9,
 4,
 7,
 6,
 3,
 3,
 6,
 7,
 4,
 4,
 5,
 10,
 1,
 5,
 7,
 9,
 2,
 7,
 2,
 4,
 4,
 7,
 3,
 6,
 3,
 3,
 8,
 4,
 3,
 2,
 11,
 3,
 9,
 6,
 3,
 2,
 2,
 2,
 5,
 3,
 3,
 4,
 9,
 3,
 8,
 2,
 7,
 5,
 3,
 6,
 3,
 3,
 6,
 3,
 8,
 3,
 3,
 2,
 3,
 10,
 9,
 11,
 14,
 7,
 4,
 2,
 5,
 5,
 2,
 2,
 3,
 3,
 3,
 4,
 5,
 4,
 2,
 3,
 5,
 5,
 2,
 9,
 6,
 9,
 5,
 3,
 7,
 3,
 6,
 3,
 9,
 3,
 3,
 3,
 9,
 4,
 6,
 4,
 2,
 2,
 2,
 11,
 11,
 4,
 1,
 4,
 6,
 5,
 4,
 11,
 1,
 2,
 4,
 9,
 8,
 7,
 4,
 2,
 7,
 5,
 5,
 4,
 14,
 4,
 5,
 4,
 6,
 9,
 6,
 5,
 10,
 4,
 4,
 6,
 5,
 9,
 3,
 5,
 4,
 3,
 1,
 6,
 3,
 3,
 12,
 2,
 2,
 6,
 5,
 9,
 2,
 6,
 3,
 5,
 3,
 2,
 2,
 2,
 9,
 4,
 7,
 3,
 4,
 3,
 3,
 2,
 8,
 3,
 3,
 7,
 5,
 5,
 10,
 2,
 3,
 6,
 8,
 3,
 3,
 3,
 3,
 6,
 2,
 3,
 8,
 3,
 3,
 2,
 2,
 2,
 3,
 3,
 10,
 3,
 3,
 4,
 5,
 7,
 3,
 3,
 2,
 3,
 4,
 3,
 3,
 11,

Unnamed: 0,visitorID,contentID
0,745397300000000.0,www.arkansasonline.com/news/2018/sep/05/gateho...
1,745397300000000.0,www.arkansasonline.com/news/2018/sep/16/bankru...
2,745397300000000.0,www.arkansasonline.com/news/2018/jun/17/bankru...
3,745397300000000.0,www.arkansasonline.com/news/2018/aug/12/buildi...
4,745397300000000.0,www.arkansasonline.com/news/2018/sep/16/quick-...
5,1410271000000000.0,www.arkansasonline.com/news/2018/sep/11/pedest...
6,1410271000000000.0,www.arkansasonline.com/news/2018/sep/16/cotton...
7,1410271000000000.0,www.arkansasonline.com/news/2018/sep/24/arkans...
8,1410271000000000.0,www.arkansasonline.com/news/2018/aug/31/20-yea...
9,1410271000000000.0,www.arkansasonline.com/news/2018/sep/30/deadli...


In [257]:
actual_articles = test_utility[['visitorID','contentID']]
actual_articles['actual'] = actual_articles['contentID']
top5rec['predicted'] = top5rec['contentID']
results5 = pd.merge(top5rec,actual_articles, on=['visitorID','contentID'],how ="left")
results5['match'] = np.where(results5['predicted'] == results5['actual'], 0.2, 0)
results5.head(20)

Unnamed: 0,visitorID,contentID,predicted,actual,match
0,745397300000000.0,www.arkansasonline.com/news/2018/sep/05/gateho...,www.arkansasonline.com/news/2018/sep/05/gateho...,,0.0
1,745397300000000.0,www.arkansasonline.com/news/2018/sep/16/bankru...,www.arkansasonline.com/news/2018/sep/16/bankru...,,0.0
2,745397300000000.0,www.arkansasonline.com/news/2018/jun/17/bankru...,www.arkansasonline.com/news/2018/jun/17/bankru...,,0.0
3,745397300000000.0,www.arkansasonline.com/news/2018/aug/12/buildi...,www.arkansasonline.com/news/2018/aug/12/buildi...,,0.0
4,745397300000000.0,www.arkansasonline.com/news/2018/sep/16/quick-...,www.arkansasonline.com/news/2018/sep/16/quick-...,,0.0
5,1410271000000000.0,www.arkansasonline.com/news/2018/sep/11/pedest...,www.arkansasonline.com/news/2018/sep/11/pedest...,,0.0
6,1410271000000000.0,www.arkansasonline.com/news/2018/sep/16/cotton...,www.arkansasonline.com/news/2018/sep/16/cotton...,,0.0
7,1410271000000000.0,www.arkansasonline.com/news/2018/sep/24/arkans...,www.arkansasonline.com/news/2018/sep/24/arkans...,,0.0
8,1410271000000000.0,www.arkansasonline.com/news/2018/aug/31/20-yea...,www.arkansasonline.com/news/2018/aug/31/20-yea...,,0.0
9,1410271000000000.0,www.arkansasonline.com/news/2018/sep/30/deadli...,www.arkansasonline.com/news/2018/sep/30/deadli...,,0.0


In [258]:
scores = results5[['visitorID','match']].groupby(['visitorID']).sum()
scores

Unnamed: 0_level_0,match
visitorID,Unnamed: 1_level_1
7.453973e+14,0.0
1.410271e+15,0.0
4.259291e+15,0.2
9.595882e+15,0.0
1.284325e+16,0.0
...,...
1.572794e+19,0.0
1.714312e+19,0.0
1.718321e+19,0.0
1.722582e+19,0.0


In [260]:
precision = scores['match'].mean()
precision

0.038479532163742794