# Model Deployment
This notebook packages all the models behind one common Docker image, deploys it to Google Cloud, and visualises the results using Streamlit.

There are 4 models in total for deployment:
- Classification between Science and Philosophy subreddit `project_3.31_main.ipynb`
- Multiclassification of flairs (reddit's name for sub-categories) `project_3.32_multiclass.ipynb`
- Auto summariser of text `project_3.33_summary.ipynb`
- Sentiment Analysis of text `project_3.34_sentiment.ipynb`

## Flask

In [49]:
%%writefile inference.py 
from flask import Flask, request 
import pandas as pd 
import os 
import joblib
import time
import mlflow.pyfunc
# We have to import StringIO, otherwise pandas raises an error because we are only passing in a JSON string and not a dictionary.
# https://stackoverflow.com/questions/63553845/pandas-read-json-valueerror-protocol-not-known/63655099#63655099
from io import StringIO
# To scrape the website and run the summariser
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# To perform sentiment analysis
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob


# Flask application object; the routes further down are registered on it
api = Flask('ModelEndpoint')

#########################
##    Classification   ##
#########################
# Loaded once at import time so every request reuses the same model.
# NOTE: joblib.load unpickles the file, so only ship artefacts from a trusted source.
model_classify = joblib.load("./models/model_classify.pkl")

#########################
## MultiClassification ##
#########################
# The multiclass model is stored in MLflow format; the fitted transformer saved
# next to it (tfidf.pkl) is applied to the text before predicting (see multiclass_predict)
model_multiclass = mlflow.pyfunc.load_model(model_uri="./models/model_multiclass")
transform_multiclass = joblib.load("./models/model_multiclass/tfidf.pkl")
#function to run multiclass predictions
def multiclass_predict(text):
    """Return the flair name predicted for ``text``.

    ``text`` is vectorised with ``transform_multiclass`` and passed to
    ``model_multiclass``; only the first prediction is used (the caller in
    this file passes a one-element pandas Series).

    Raises:
        KeyError: if the predicted label is not one of the known labels 1-27.
    """
    # Flair names in label order; the model's labels start at 1
    flair_names = (
        'Medicine',
        'Social Science',
        'Animal Science',
        'Anthropology',
        'Environment',
        'Psychology',
        'Health',
        'Nanoscience',
        'Engineering',
        'Biology',
        'Earth Science',
        'Astronomy',
        'Genetics',
        'Economics',
        'Paleontology',
        'Chemistry',
        'Neuroscience',
        'Cancer',
        'Mathematics',
        'Epidemiology',
        'Physics',
        'Geology',
        'Materials Science',
        'Computer Science',
        'Breaking News',
        'Retraction',
        'Best of r/science',
    )
    label_to_flair = dict(enumerate(flair_names, start=1))

    features = transform_multiclass.transform(text)
    predicted_label = model_multiclass.predict(features)[0]
    return label_to_flair[predicted_label]

#########################
##     Summariser      ##
#########################
# Hugging Face summarisation pipeline, created once at import time and reused by summariser() for every request
hf_summarizer = pipeline('summarization', 'sshleifer/distilbart-cnn-12-6')
def summariser(url):
    """Download ``url``, extract its article text and summarise it.

    Args:
        url: address of the page to summarise.

    Returns:
        A ``(summary, page_text)`` tuple. On success ``summary`` is the
        generated summary and ``page_text`` is all the extracted paragraph
        text. On any failure ``summary`` is an ``"ERROR! ..."`` message and
        ``page_text`` is the placeholder ``"a"``.
    """
    if len(url) <= 1:
        #return error message if no the url length is only 1
        return "ERROR! Please pass in a valid url", "a"

    crawl_error = ("ERROR! Unable to crawl website. Please check if the link is valid or if the website allows automated web crawling", "a")

    # pass in header in attempt to hide that this is an automated web crawler
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    # NOTE(review): url comes straight from the request, so the server fetches
    # whatever address the caller supplies - consider restricting schemes/hosts.
    try:
        # A timeout stops an unresponsive site from hanging the request forever.
        # Malformed urls (e.g. missing scheme) and network failures raise
        # RequestException subclasses; report them like any other crawl failure.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return crawl_error
    if response.status_code != 200:
        #return error message if unable to crawl website
        return crawl_error

    soup = BeautifulSoup(response.text, 'lxml')
    # We assume that if the sentence has less than 10 words, they are likely to be ads or link's descriptions to other pages. We are trying to keep this as general as possible so that we can scrap more sites without issues
    relevant_p = ''.join(p.text + ' '
                         for p in soup.find_all('p')
                         if len(p.text.split(' ')) > 10)
    if not relevant_p.strip():
        # Nothing usable was extracted; summarising an empty string would only produce a meaningless summary
        return "ERROR! Unable to find any article text on the website", "a"

    #we cap it to 700 because there's a limit of words that the hugging face model can take
    relevant_p_trimmed = ' '.join(relevant_p.split(' ')[:700])
    return hf_summarizer(relevant_p_trimmed)[0]['summary_text'], relevant_p

#########################
##      Sentiment      ##
#########################
# Load the spaCy English model once at import time and add the 'spacytextblob'
# component; sentiment_predict() below reads `._.polarity` and `._.subjectivity`
# from the processed document (presumably supplied by that component - see its docs)
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')
#function to retrieve sentiment and subjectivity
def sentiment_predict(title, selftext, page):
    """Score the combined text of a post for sentiment and subjectivity.

    Args:
        title: post title.
        selftext: post body.
        page: text crawled from the post's url.

    Returns:
        ``(sentiment, polarity, subjectivity, subjectivity_score)`` where
        ``sentiment`` is 'Positive' (polarity > 0.33), 'Negative'
        (polarity < -0.33) or 'Neutral', ``subjectivity`` is 'Objective'
        (score < 0.5) or 'Subjective', and both scores are rounded to 2 decimals.
    """
    # Join with spaces: plain concatenation fuses the last word of one field
    # with the first word of the next (e.g. "...aged 76We regret..."), which
    # corrupts those tokens before they are scored.
    text = ' '.join((title, selftext, page))
    spacy_output = nlp(text)
    polarity = spacy_output._.polarity
    subjectivity_score = spacy_output._.subjectivity

    if polarity > 0.33:
        sentiment = 'Positive'
    elif polarity < -0.33:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'

    subjectivity = 'Objective' if subjectivity_score < 0.5 else 'Subjective'
    return sentiment, round(polarity, 2), subjectivity, round(subjectivity_score, 2)

#########################
##      FLASK API      ##
#########################
@api.route('/')
def home():
    """Root endpoint: reply with a fixed greeting payload and HTTP 200."""
    greeting = {"message": "Hello!", "success": True}
    status_code = 200
    return greeting, status_code

@api.route('/predict', methods = ['POST'])
def make_predictions():
    """Run every model on a posted submission and return the combined results.

    The request body must carry the fields ``title``, ``selftext`` and ``url``.
    It may be a JSON string that itself contains the object (what the notebook
    and Streamlit clients send via ``json=json.dumps(...)``) or a plain JSON
    object.

    Returns:
        A dict with 'subreddit', 'summary', 'sentiment', 'senti_score',
        'subjectivity' and 'subj_score', plus 'flair' when the predicted
        subreddit is Science. Responds with HTTP 400 and an 'error' message
        when the body cannot be parsed or a required field is missing.
    """
    import json  # function-scope import: only needed to normalise the payload below

    user_input = request.get_json(force=True)
    # Existing clients double-encode the payload, so get_json() yields a str.
    # Re-serialise anything else so a plain JSON object is accepted as well.
    if not isinstance(user_input, str):
        user_input = json.dumps(user_input)

    df_schema = {'title':str, 'selftext': str, 'url':str}
    try:
        user_input_df = pd.read_json(StringIO(user_input), lines=True, dtype=df_schema)
    except ValueError:
        return {'error': 'Request body must be a JSON object with title, selftext and url'}, 400

    # Report missing fields to the client instead of failing with a KeyError (HTTP 500)
    missing = [field for field in df_schema if field not in user_input_df.columns]
    if missing:
        return {'error': 'Missing required field(s): ' + ', '.join(missing)}, 400

    combined_user_input = pd.Series(user_input_df['title'] + user_input_df['selftext'] + user_input_df['url']) #this will output as {'0':'all the texts'}

    #failsafe in case someone input nothing in either of the 3 fields
    #input 'a' as a common letter that should give no meaning in nlp
    fields = {}
    for field in df_schema:
        value = user_input_df[field].tolist()[0]
        fields[field] = value if len(value) > 0 else 'a'

    # PREDICTIONS
    predict_class = model_classify.predict(combined_user_input).tolist() #we need to pass number to list so that it can be convered to json later
    summary, page = summariser(fields['url'])
    sentiment, senti_score, subjectivity, subj_score = sentiment_predict(fields['title'], fields['selftext'], page)

    # RETURN OUTPUT
    result = {'subreddit': 'Philosophy', 'summary':summary, 'sentiment': sentiment, 'senti_score':senti_score, 'subjectivity':subjectivity, 'subj_score':subj_score}
    if predict_class[0]==1:
        result['subreddit'] = 'Science'
        # Flairs only exist for the Science subreddit, so the flair model is only run here
        result['flair'] = multiclass_predict(combined_user_input) #For text, we don't need to pass in tolist
    return result
    
if __name__ == '__main__':
    # The server listens on all interfaces, and the Dockerfile starts the
    # container with `python inference.py`. Debug mode enables the interactive
    # Werkzeug debugger, which lets anyone who can reach the port execute
    # arbitrary code, so it is off unless FLASK_DEBUG=1 is set explicitly
    # (e.g. for local development).
    api.run(host='0.0.0.0',
            debug=os.environ.get("FLASK_DEBUG", "0") == "1",
            port=int(os.environ.get("PORT", 8080))
           )

Overwriting inference.py


### Testing on localhost

In [50]:
# Sample payload shaped like the /predict request body (title, selftext, url): the Stephen Hawking post
user_input_science = {"title": "Physicist Stephen Hawking dies aged 76", "selftext": "We regret to hear that Stephen Hawking died tonight at the age of 76. We are creating a megathread for discussion of this topic here. The typical r/science comment rules will not apply and we will allow mature, open discussion. This post may be updated as we are able. A few relevant links: Stephen Hawking's AMA on /r/science. BBC's Obituary for Stephen Hawking. If you would like to make a donation in his memory, the Stephen Hawking Foundation has the Dignity Campaign to help buy adapted wheelchair equipment for people suffering from motor neuron diseases. You could also consider donating to the ALS Association to support research into finding a cure for ALS and to provide support to ALS patients", "url": "http://www.bbc.com/news/uk-43396008"}

In [51]:
# Sample payload with a title and url but an empty selftext
user_input_philo = {"title": "Only fragments of ancient Greek philosopher Epicurus’s writings remain. Among them are his Principal Doctrines: 40 brilliant, authoritative aphorisms that summarize the Epicurean approach to living a good life — an approach focused on removing pain & anxiety, & on emphasizing friendship & community.", "selftext":"", "url": "https://philosophybreak.com/articles/epicurus-principal-doctrines-40-aphorisms-for-living-well/?utm_source=reddit&utm_medium=social"}

In [55]:
# Edge-case payload: empty selftext and a one-character url, which summariser() rejects as an invalid url.
# The name refers to those empty/invalid fields; the title itself is populated.
empty = {"title":"A qualitative study of an incel discussion board says that incels justify their misogyny by seeing themselves as victims of women.","selftext":"","url":"1"}

In [56]:
import requests, json

# Smoke test against the API running locally (start inference.py first)
api_url = 'http://localhost:8080' # specify the URL to access
api_route = '/predict' # specify the `route` to access in the URL

# the `/predict` route was declared with methods=['POST'], so we have to use `requests.post()`.
# json.dumps() on top of `json=` double-encodes the payload on purpose: the endpoint
# reads the body as a JSON string and hands it to pd.read_json(StringIO(...)).
response = requests.post(f'{api_url}{api_route}', json=json.dumps(empty))
predictions = response.json()

print(predictions)

{'flair': 'Social Science', 'senti_score': 0.0, 'sentiment': 'Neutral', 'subj_score': 0.0, 'subjectivity': 'Objective', 'subreddit': 'Science', 'summary': 'ERROR! Please pass in a valid url'}


## Docker

In [31]:
%%writefile Dockerfile
# Use the official lightweight Python image from
# https://hub.docker.com/_/python
FROM python:3.8-slim

# Keep the application in its own directory instead of the image root
WORKDIR /app

# Install all the necessary libraries first: this layer stays cached as long as
# requirements.txt is unchanged, so editing inference.py or the models does not
# trigger a full reinstall
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN python -m textblob.download_corpora
RUN python -m spacy download en_core_web_sm

# Copy all the files needed for the app to work
COPY inference.py .
COPY models/ ./models/

# Run the API! (exec form, so the Python process receives stop signals directly)
CMD ["python", "inference.py"]

Overwriting Dockerfile


In [32]:
%%writefile requirements.txt
pandas
flask
mlflow-skinny
scikit-learn==0.23.2
# joblib and requests are imported directly by inference.py, so declare them
# explicitly instead of relying on other packages to pull them in
joblib
requests
torch==1.10.2
spacy
spacytextblob
# beautifulsoup4 is the distribution that provides the `bs4` module
# (the PyPI package named `bs4` is only a shim that installs it)
beautifulsoup4
transformers
lxml==4.9.1

Overwriting requirements.txt


## Streamlit

In [4]:
%%writefile streamlit_app.py
import streamlit as st
import requests
import json

# Page heading, followed by a short hint on how to use the app
page_title = "Science 🧪 vs 🧠 Philosophy Subreddit"
intro_text = """Don't know which subreddit to share your posts? Use our app!\n
For testing purposes, the cells have been populated with the *most upvoted* post on Science subreddit. Change it to your own!"""

st.title(page_title)
st.caption(intro_text)

Overwriting streamlit_app.py


In [5]:
%%writefile -a streamlit_app.py

# All the inputs are placed inside a form so that the whole app is not re-run every time an input changes.
# https://blog.streamlit.io/introducing-submit-button-and-forms/
# Get user inputs (the defaults pre-fill the form with a sample Science post)
with st.form(key='my_form'):
    title = st.text_input('Post Title', 'Physicist Stephen Hawking dies aged 76')
    selftext = st.text_area('Post Content', "We regret to hear that Stephen Hawking died tonight at the age of 76. We are creating a megathread for discussion of this topic here. The typical r/science comment rules will not apply and we will allow mature, open discussion. This post may be updated as we are able. A few relevant links: Stephen Hawking's AMA on /r/science. BBC's Obituary for Stephen Hawking. If you would like to make a donation in his memory, the Stephen Hawking Foundation has the Dignity Campaign to help buy adapted wheelchair equipment for people suffering from motor neuron diseases. You could also consider donating to the ALS Association to support research into finding a cure for ALS and to provide support to ALS patients.",
                            height = 150)
    url = st.text_input('URL', 'http://www.bbc.com/news/uk-43396008')
    # True only on the script run triggered by pressing the button
    submit = st.form_submit_button(label='Inspect')

    # NOTE(review): this dict is built again inside the `if submit:` block before it is posted, so this copy looks redundant
    user_input = {'title': title,
                  'selftext': selftext,
                  'url': url}

Appending to streamlit_app.py


In [6]:
%%writefile -a streamlit_app.py

# code to run after submit button is pressed
if submit:
    with st.spinner('🪄 ✨Gathering magic dusts...✨'):
        user_input = {'title': title,
                      'selftext': selftext,
                      'url': url}
        # Code to post the user inputs to the API and get the predictions
        # Paste the URL to your GCP Cloud Run API here!
        api_url = 'https://science-philo-reddit-class-vrfckmfjmq-as.a.run.app'
        api_route = '/predict'

        try:
            # json.dumps() converts dict to JSON; the API reads the body as a JSON string.
            # The timeout is generous because the API crawls the url and runs a
            # summarisation model before it replies.
            response = requests.post(f'{api_url}{api_route}', json=json.dumps(user_input), timeout=300)
            # Turn HTTP error statuses into exceptions instead of trying to parse an error page
            response.raise_for_status()
            predictions = response.json()
        except (requests.RequestException, ValueError) as err:
            # Show a readable message instead of a raw traceback, then end this script run
            st.error(f"Unable to get predictions from the API: {err}")
            st.stop()

        #SNOW!!
        st.snow()

        st.header("Inspection Result")

        col11, col12 = st.columns(2)
        col11.metric("Subreddit", predictions['subreddit'], help = "Most likely subreddit based on binary classification")
        # The API only returns a flair for Science predictions
        if predictions['subreddit'] == 'Science':
            col12.metric("Flair", predictions['flair'], help = "Most likely subcategory based on multiclass classification")
        st.caption("""
        <p style="color: grey;font-size: 80%;">
        The above section predicts the most accurate subreddit to post the article based on machine learning of 50,000 previous posts. For Science subreddit, further subcategory (flair) is suggested for you based on 25,000 previous classification\n
        \n
        </p>
        """, unsafe_allow_html=True)

        #hide the arrow which is default by streamlit
        st.write(
            """
            <style>
            [data-testid="stMetricDelta"] svg {
                display: none;
            }
            </style>
            """,
            unsafe_allow_html=True,
        )
        # The raw scores are shown in the metric's delta slot, with colouring switched off
        col21, col22 = st.columns(2)
        col21.metric("Sentiment", predictions['sentiment'], delta = predictions['senti_score'], delta_color = 'off', help = """
                     0.33 to 1 : Positive\n
                     -0.33 to 0.33 : Neutral\n
                     -1 to -0.33 : Negative
                     """)
        col22.metric("Subjectivity", predictions['subjectivity'], delta = predictions['subj_score'], delta_color = 'off', help = """
                     0.5 to 1 : Subjective\n
                     0 to 0.5 : Objective
                     """)
        st.caption("""
        <p style="color: grey;font-size: 80%;">
        The above section is the cumulative sentiment of post title, post content and article after crawling through the url. A more neutral sentiment with objective tone is a better showcase that your posts are not informed by bias\n
        \n
        </p>
        """, unsafe_allow_html=True)


        st.caption('<p class = "myclass">Article Summary 📃</p>', unsafe_allow_html=True)
        st.markdown(predictions['summary'])
        st.caption("""
        <p style="color: grey;font-size: 80%;">
        The above section is an auto generated summary of the article content after crawling through the url. 99.9% of subreddits does not have any post content other than title. Use our auto summary to fill up the gap!\n
        </p>
        """, unsafe_allow_html=True)

        # Colour the "Article Summary" heading (class "myclass") via Streamlit's generated CSS class names
        st.markdown(
             f"""
             <style>
             .css-rvekum .myclass {{
                 color:black
             }}
             .css-dg4u6x .myclass {{
                 color:white
             }}
             </style>
             """,
             unsafe_allow_html=True
         )
        

Appending to streamlit_app.py
