In [9]:
import os
from google.colab import drive
# Mount Google Drive at /gdrive (uses the google.colab helper imported above).
drive.mount('/gdrive')
# IPython magic: switch the working directory to the mount point.
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [0]:
!pip install turicreate

Collecting turicreate
[?25l  Downloading https://files.pythonhosted.org/packages/05/bf/d1c74d5002be47db24b13adde42e5898b1ff63cae31e582ddd908006ae37/turicreate-5.8-cp36-cp36m-manylinux1_x86_64.whl (90.7MB)
[K     |████████████████████████████████| 90.7MB 256kB/s 
Collecting pillow>=5.2.0 (from turicreate)
[?25l  Downloading https://files.pythonhosted.org/packages/19/66/6113477dc3206ccb1e192cffd626f2840ead02375a6cebe2436ad4c19f61/Pillow-6.2.0-cp36-cp36m-manylinux1_x86_64.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 37.6MB/s 
[?25hCollecting numpy==1.16.4 (from turicreate)
[?25l  Downloading https://files.pythonhosted.org/packages/87/2d/e4656149cbadd3a8a0369fcd1a9c7d61cc7b87b3903b85389c70c989a696/numpy-1.16.4-cp36-cp36m-manylinux1_x86_64.whl (17.3MB)
[K     |████████████████████████████████| 17.3MB 38.8MB/s 
Collecting mxnet<1.2.0,>=1.1.0 (from turicreate)
[?25l  Downloading https://files.pythonhosted.org/packages/96/98/c9877e100c3d1ac92263bfaba7bb8a49294e0990465920

In [0]:
# importing packages
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from html.parser import HTMLParser
import turicreate as tc

In [0]:
# Absolute location of the posts JSON file under the /gdrive mount.
path = '/gdrive/My Drive/hngi6/team_python/posts.json'

In [0]:
# Read the JSON file at `path` into a pandas DataFrame.
df_posts = pd.read_json(path)

In [0]:
# df_posts.head()

In [0]:
# (rows, columns) of the frame as loaded.
df_posts.shape

(876, 12)

In [0]:
# Columns that play no part in the similarity computation
col = ['slug', 'created_at', 'updated_at', 'image', 'action', 'status_id', 'post_id', 'user_id']

# Drop them first, then expose the remaining 'id' column under the name 'post_id'
df_posts2 = df_posts.drop(columns=col).rename(columns={'id': 'post_id'})
# df_posts2.head()

In [0]:
# Functions for cleaning data
class MLStripper(HTMLParser):
    '''HTML parser that throws away markup and keeps only the text between tags.

    Feed it HTML with ``feed()``, call ``close()`` once all input has been
    supplied, then read the accumulated text with ``get_data()``.
    '''
    def __init__(self):
        # Let the base class set up its own parser state (it calls reset()
        # itself). convert_charrefs=True makes character references such as
        # "&amp;" reach handle_data already decoded.
        super().__init__(convert_charrefs=True)
        # Retained from the original class in case other code reads it.
        self.strict = False
        # Text fragments collected by handle_data, in document order.
        self.fed = []

    def handle_data(self, d):
        '''Parser callback: remember one run of text found between tags.'''
        self.fed.append(d)

    def get_data(self):
        '''Return all text collected so far as a single string.'''
        return ''.join(self.fed)


def strip_tags(html):
    '''Return the text content of ``html`` with every tag removed.

    Parameters
    ----------
    html : str
        Markup (or plain text) to clean.

    Returns
    -------
    str
        The text with tags dropped and character references decoded.
    '''
    s = MLStripper()
    s.feed(html)
    # Without close() the parser keeps trailing text buffered whenever it ends
    # in what might be an unfinished character reference (e.g. "AT&T"), and
    # that text would be silently lost.
    s.close()
    return s.get_data()

def strip_line(x):
    '''Replace punctuation with spaces and remove extra spaces.

    Every run of non-word characters (anything other than letters, digits and
    underscore, which includes whitespace) is collapsed into a single space,
    and leading/trailing spaces are removed.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text; an empty string if ``x`` holds no word characters.
    '''
    # r'\W+' matches a whole run at once, so "a,  b" yields "a b" rather than
    # one space per removed character.
    return re.sub(r'\W+', ' ', x).strip()

In [0]:
# Applying functions to clean data

# features that will be used to find similarity between posts
features = ['title', 'content', 'tags']

# Replace missing values with empty strings BEFORE cleaning: the helpers below
# call string methods / HTMLParser.feed and would raise on None or NaN.
df_posts2[features] = df_posts2[features].fillna('')

# content: remove HTML tags, replace non-word characters with spaces, lowercase
df_posts2['content'] = (
    df_posts2['content']
    .map(strip_tags)
    .map(strip_line)
    .map(lambda x: x.strip().lower())
)
# title: trim surrounding whitespace and lowercase
df_posts2['title'] = df_posts2['title'].apply(lambda x: x.strip().lower())
# df_posts2.head()

In [122]:
# Create a combined_features column by applying combine_features to each row
def combine_features(row):
    '''Join a post's title, content and tags into one space-separated string.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or dict)
        Looked up by the keys 'title', 'content' and 'tags'.

    Returns
    -------
    str
        "<title> <content> <tags>". A field that is absent, None or NaN
        contributes an empty string; any other non-string value is converted
        with str(). The function never returns None.
    '''
    parts = []
    for name in ('title', 'content', 'tags'):
        try:
            value = row[name]
        except KeyError:
            # Best effort, as before: a missing field must not abort the row.
            value = ''
        # None and float NaN both mean "no value" (NaN is the only float that
        # is not equal to itself).
        if value is None or (isinstance(value, float) and value != value):
            value = ''
        parts.append(value if isinstance(value, str) else str(value))
    return " ".join(parts)

# One text field per post: title, content and tags joined row by row.
df_posts2['combined_features'] = df_posts2.apply(combine_features, axis=1)
# Keep only what is left once the individual text columns are removed.
data = df_posts2.drop(columns=features)
# Preview a few rows instead of rendering the whole frame.
data.head(10)

Unnamed: 0,post_id,combined_features
0,1,what i have learnt so far on html i learnt how...
1,2,html begins here i am on this journey with sta...
2,4,my laziness in the open i have not been attend...
3,6,my task 2 my journey on startng pre intern...
4,7,task 2 a summary on the idongesit html cv i...
5,8,my journey on html using the hyper text markup...
6,9,startng html exposition storage 2040 images im...
7,11,my task 2 my journey on startng pre intern...
8,12,startng html task i have learned a lot about h...
9,13,on startng pre-internship what i have learned ...


In [123]:
# Count of missing values in each column
df_posts2.isna().sum()

content              0
post_id              0
tags                 0
title                0
combined_features    0
dtype: int64

In [0]:
# Convert the pandas DataFrame `data` into a Turi Create SFrame.
data2 = tc.SFrame(data)

In [0]:
# Build a Turi Create item-content recommender from the SFrame, using the
# 'post_id' column as the item identifier.
contentbased_model = tc.recommender.item_content_recommender.create(item_data=data2, item_id='post_id', verbose=False)

In [0]:
# Two lookup tables between post titles and post ids:
#   indices  : title   -> post_id
#   indices2 : post_id -> title
indices = pd.Series(data=df_posts2['post_id'].values, index=df_posts2['title'].values)
indices2 = pd.Series(data=df_posts2['title'].values, index=df_posts2['post_id'].values)


def get_recommendations(title, indices=indices, k=5):
    '''Recommend posts similar to the post(s) carrying the given title.

    Parameters
    ----------
    title : str
        Title of a post. Matched after trimming surrounding whitespace and
        lower-casing, the same normalisation applied to the stored titles.
    indices : pandas.Series or mapping, optional
        Lookup from title to post_id. Defaults to the module-level ``indices``
        as it existed when this function was defined.
    k : int, optional
        Number of recommendations to request (default 5).

    Returns
    -------
    pandas.Series or None
        Titles of the recommended posts indexed by post_id. When a lookup
        raises KeyError (e.g. unknown title) a message is printed and None is
        returned.
    '''
    try:
        key = title.strip().lower()

        # A title shared by several posts makes the lookup return a Series
        # instead of a scalar; flatten both cases to a plain list of post ids.
        seed_ids = np.atleast_1d(indices[key]).tolist()

        # Ask the model for the k posts most similar to the seed post(s)
        rec = contentbased_model.recommend_from_interactions(seed_ids, k=k)

        # Get post_id of recommended posts
        post_id = [row['post_id'] for row in rec]

        # Return the titles of the most similar posts
        print("RECOMMENDATIONS BASED ON YOUR READING PATTERN\n")
        return indices2.loc[post_id]

    except KeyError:
        print("We Have No Recommendations For You!")



In [161]:
# Example query: ask for recommendations for one post title.
title = 'html begins here'
get_recommendations(title)

RECOMMENDATIONS BASED ON YOUR READING PATTERN



7                                 task 2
900                   my first html page
140                 introduction to html
663       start.ng: work done using html
664    start.ng: work done so far - html
dtype: object