In [1]:
import pickle
import pandas as pd
import streamlit as st
from streamlit import session_state as session
import re

In [2]:
data = pd.read_csv("medium_data.csv", index_col="id")

def fix_titles(title):
    # Define a pattern to match HTML tags
    html_tags_pattern = re.compile(r'<.*?>')
    
    # Replace HTML tags with an empty string
    cleaned_title = re.sub(html_tags_pattern, '', title)
    
    return cleaned_title

def preprocess_data(df):
    df['date'] = pd.to_datetime(df['date'])
    df['title'] = df['title'].apply(fix_titles)
    df["claps"] = df["claps"].fillna(0)
    df["subtitle"] = df["subtitle"].fillna(df["title"])
    return df
data = data.drop_duplicates()
df = preprocess_data(data)

  df['date'] = pd.to_datetime(df['date'])


In [4]:
def top_content(df):
    pub_popularity = df.groupby('publication')[['claps', 'responses']].mean().round().astype(int).sort_values(by='claps', ascending=False)
    top_three_publications = pub_popularity['claps'].nlargest(3).index
    channels = top_three_publications.tolist()
    top_articles = pd.DataFrame()  # Initialize an empty DataFrame to store top articles
    
    for channel in channels:
        cont = df[df['publication'] == channel]
        top_n_articles = cont.nlargest(3, 'claps')  # Select top 3 articles for the channel
        top_articles = pd.concat([top_articles, top_n_articles])  # Concatenate with previous top articles
    
    return top_articles
        
# print(top_content(df))

def trending_article(df):
    latest_date = df['date'].max()
    latest_week = latest_date - pd.Timedelta(days=6)
    latest_articles = df[df['date'] >= latest_week]
    top_three = df.loc[latest_articles['claps'].nlargest(3).index]
    top_three_trending = top_three['title'].tolist()
    return top_three

# print(trending_article(df))

def popular_quick_reads(df):
    quick_reads = df[df['reading_time'] <= 5.0]
    quick_reads_df = df.loc[quick_reads['claps'].nlargest(3).index]
#     popular_quick_reads = quick_reads_df['title'].tolist()
    return quick_reads_df

In [5]:
top_publication_content = top_content(df)
trending_articles = trending_article(df)
top_quick_reads = popular_quick_reads(df)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import NMF

def content_based(df):
    # Create a new feature article which is combination of both title and subtitle
    df['article'] = df['title'] + df['subtitle']
    # Sort Data by number of claps
    df = df.sort_values(by="claps", ascending=False)
    # Now, we have to vectorize the articles using Tf-IDF vecotizer.
    # Pre processing and NMF
    vectorizer = TfidfVectorizer()
    articles = vectorizer.fit_transform(df["article"])
    # Now we can apply NMF on our data and create the recommender. I choose 10 as number of components.
    model = NMF(n_components=10, random_state=0)
    nmf_features = model.fit_transform(articles)
#     model.components_
    normalized = normalize(nmf_features)
    recom_df = pd.DataFrame(data=normalized)
    recom_df.set_index(df['title'], inplace=True)
    recom_df.to_csv("data/recom_df.csv")
    
content_based(df)



In [20]:
def recommend_articles(recom_df, article):    
    similarities = recom_df.dot(article)
    sims = pd.DataFrame(similarities.nlargest(10))
    sims = sims.merge(df[["title", "claps"]], how='inner', on="title")
    sims.set_index("title", drop=True, inplace=True)
    sims.sort_values(by="claps", ascending=False)
    return sims
recom_df = pd.read_csv("data/recom_df.csv", index_col=0)
recom_df
article = recom_df.loc[df['title'][419]]
articles = recommend_articles(recom_df, article)
# articles = recommend_articles(df.loc[df['title'][419]])

In [21]:
articles

Unnamed: 0_level_0,0,claps
title,Unnamed: 1_level_1,Unnamed: 2_level_1
How ChatGPT Works: The Model Behind The Bot,1.0,7100.0
TikTok’s unprecedented ability to engineer the “Consent of the Masses”,0.999169,282.0
Summarizing the latest Spotify releases with ChatGPT,0.997739,67.0
Balancing complexity and simplicity in chart design,0.997261,100.0
Generative Q&A With GPT 3.5 and Long-Term Memory,0.996355,119.0
Visualizing direction and the use of arrows,0.996254,372.0
Are Expert Systems Dead?,0.995508,29.0
Identifying Drivers of Spotify Song Popularity With Causal ML,0.994542,50.0
How Duolingo drives subscription conversion,0.994361,516.0
How to avoid getting designs shot down in the name of consistency,0.994243,140.0


In [35]:
import types
# @st.cache(persist=True, show_spinner=False, suppress_st_warning=True)
# def load_data():
#     """
#     load and cache data
#     :return: tfidf data
#     """
#     recom_df = pd.read_csv("data/recom_df.csv", index_col=0)
#     article_list = recom_df.index.tolist()
#     return recom_df, article_list

# recom_df, article_list = load_data()


def my_hash_func(func):
    # Custom hash function for functions
    return hash(func.__code__)

@st.cache_data(persist=True, show_spinner=False, hash_funcs={types.FunctionType: my_hash_func})
def load_data():
    """
    load and cache data
    :return: tfidf data
    """
    recom_df = pd.read_csv("data/recom_df.csv", index_col=0)
    article_list = recom_df.index.tolist()
    return recom_df, article_list

recom_df, article_list = load_data()

2024-03-13 00:24:29.120 No runtime found, using MemoryCacheStorageManager


In [36]:
dataframe = None

st.title("""
Medium Article Recommendation System
This is an Content Based Recommender System based on claps and responses :smile:.
 """)

st.text("")
st.text("")
st.text("")
st.text("")

session.options = st.multiselect(label="Select Article", options=article_list)

st.text("")
st.text("")

session.slider_count = st.slider(label="Article Count", min_value=5, max_value=10)

st.text("")
st.text("")

buffer1, col1, buffer2 = st.columns([1.45, 1, 1])

is_clicked = col1.button(label="Recommend")

if is_clicked:
    dataframe = recommend_articles(recom_df=recom_df, article = session.options)

st.text("")
st.text("")
st.text("")
st.text("")

if dataframe is not None:
    st.table(dataframe)