# Imports and Settings

In [7]:
## library imports

# data processing imports
import pandas as pd
# import numpy as np

# # plotting imports
# import matplotlib.pyplot as plt
# import seaborn as sns

# modelling imports
from sklearn.model_selection import train_test_split
import imblearn

# # metric imports
# from sklearn import metrics 

# # misc imports
# import datetime
# from dateutil.relativedelta import *
# import mlflow

# nlp imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# from emoji import demojize

import json 
import pickle

# Data Imports

In [2]:
# Read the cleaned dataset from the saved CSV and turn the 'created' column
# into real datetimes in the same expression.
df = (
    pd.read_csv('data/df_cleaned.csv')
    .assign(created=lambda frame: pd.to_datetime(frame['created']))
)
df.head()  # preview the first five rows of the loaded dataframe

Unnamed: 0,author,created_utc,id,is_self,num_comments,score,selftext,title,upvote_ratio,created,subreddit,subreddit_stocks,title_char_len,title_word_len,score_pmaw
0,cloudboyy,1627751015,ovatu6,False,1,1,,Any thoughts on OSTK? It sold out of the wedge...,1.0,2021-08-01 01:03:35,wsb,0,208,40,1
1,itbc1info,1627751020,ovatwa,False,2,2,,"U.S. prosecutors charge Trevor Milton, founder...",1.0,2021-08-01 01:03:40,wsb,0,61,8,1
2,dadryp,1627751064,ovaudw,True,0,1,,Stocks have been horrible for me 2021,1.0,2021-08-01 01:04:24,stocks,1,37,7,1
3,insta_man,1627751065,ovauee,True,40,7,Long story I started investing in the market w...,Want to get back into investing. Any tips?,1.0,2021-08-01 01:04:25,stocks,1,42,8,1
4,PenIslandGaylien,1627751202,ovavxl,True,1,2,So I already contributed 6k to my Roth in 2021...,Roth Income Limits,1.0,2021-08-01 01:06:42,wsb,0,18,3,1


# Model Setup

## Setting Features (X) and Target Variables (y)

In [3]:
# Features: only the post title, kept as a one-column DataFrame
X = df.loc[:, ['title']]
# Target: the 'subreddit_stocks' label column
y = df.loc[:, 'subreddit_stocks']

# Modelling (CVEC + Multinomial NB | SMOTE Oversampling)

In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# imblearn pipeline with three named steps, applied in this order:
#   1. CountVectorizer  - turns raw titles into token-count features
#   2. SMOTE            - oversampling step (seeded for reproducibility)
#   3. MultinomialNB    - the classifier
model = Pipeline(
    steps=[
        ('count_vectorizer', CountVectorizer()),
        ('SMOTE', SMOTE(n_jobs=-1, random_state=42)),
        ('multinomialNB', MultinomialNB()),
    ]
)

In [5]:
# Fit every pipeline step on the raw title strings and their labels
model.fit(X.loc[:, 'title'], y)

In [8]:
# save the model to disk
filename = './deployment/model.sav'
# A context manager guarantees the handle is flushed and closed once the dump
# finishes (or fails), instead of leaving an open file object behind.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Checking Pipelines for Prediction with JSON

In [9]:
# Take the first ten titles as test data and serialise them as JSON Lines
# (one {"title": ...} record per line).
user_input_df = X.iloc[:10]
user_input = user_input_df.to_json(lines=True, orient="records")
print(user_input)  # show the JSONL payload

{"title":"Any thoughts on OSTK? It sold out of the wedge but financials look great to me. Annual sales almost = market cap, order numbers down but average order price is up. No company debt. Let me know what you think!"}
{"title":"U.S. prosecutors charge Trevor Milton, founder of electric..."}
{"title":"Stocks have been horrible for me 2021"}
{"title":"Want to get back into investing. Any tips?"}
{"title":"Roth Income Limits"}
{"title":"Does dividend investing suck?"}
{"title":"What happened to the Glory Days?"}
{"title":"3 GOP Congressmen face ethics complaints for failing to disclose $22 million in stock trades"}
{"title":"Let's not forget we Like the Stock and believe in the DD but let's also support our company with more than Buying and Holding. Take a Friend, your Wife, Your Child or A loved one to watch a Movie and Enjoy a Popcorn. \ud83c\udf7f #Apestrong \ud83d\udc8e\ud83d\ude4c\ud83c\udffe\ud83e\udd8d\ud83d\ude80\ud83c\udf15"}
{"title":"BABA SHORT SQEEZE @ 160"}



In [10]:
# Convert user inputs to pandas dataframe
# The schema key must match the JSON field name exactly ("title"); with a
# misspelled key the dtype override is never applied to the column and pandas
# is left to infer its type instead of keeping the titles as plain objects.
df_schema = {"title": object}
user_input_df = pd.read_json(user_input, lines=True, dtype=df_schema)  # Convert JSONL to dataframe
user_input_df  # display the dataframe

Unnamed: 0,title
0,Any thoughts on OSTK? It sold out of the wedge...
1,"U.S. prosecutors charge Trevor Milton, founder..."
2,Stocks have been horrible for me 2021
3,Want to get back into investing. Any tips?
4,Roth Income Limits
5,Does dividend investing suck?
6,What happened to the Glory Days?
7,3 GOP Congressmen face ethics complaints for f...
8,Let's not forget we Like the Stock and believe...
9,BABA SHORT SQEEZE @ 160


In [11]:
# Predict on the titles that went through the JSONL round trip
model.predict(user_input_df.loc[:, 'title'])

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0], dtype=int64)

In [12]:
# Reference prediction taken straight from the dataframe (first ten titles).
# It matches the result above, so converting to JSONL and back loses no information.
model.predict(X['title'].head(10))

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0], dtype=int64)

# Create an inference.py file

In [14]:
%%writefile inference.py
from io import StringIO
import json
import os
import pickle

from flask import Flask, request
import pandas as pd

# Step 2: Instantiate the Flask API
api = Flask('ModelEndpoint')

# Step 3: Load the model
# NOTE: unpickling can execute arbitrary code, so only load a model file that
# was produced by this project. The context manager closes the handle afterwards.
with open('./deployment/model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

# Step 4: Create the routes
## route 1: Health check. Just return success if the API is running
@api.route('/')
def home():
    """Health check: return a small success payload with HTTP 200."""
    return {"message": "Hi there!", "success": True}, 200

# route 2: accept input data
# Post method is used when we want to receive some data from the user
@api.route('/predict', methods = ['POST'])
def make_predictions():
    """Predict a label for every record in the posted JSON Lines payload.

    The request body is expected to be a JSON-encoded string whose content is
    JSON Lines, one ``{"title": ...}`` record per line.

    Returns:
        ``{'predictions': [...]}`` with one prediction per input record, or an
        ``{'error': ...}`` payload with HTTP 400 when the body is not a string,
        cannot be parsed as JSON Lines, or has no ``title`` field.
    """
    # Get the data sent over the API
    user_input = request.get_json(force=True)
    if not isinstance(user_input, str):
        return {"error": "Request body must be a JSON string containing JSON Lines records."}, 400

    # Convert user inputs to pandas dataframe
    # The schema key has to match the column that is read below ("title"),
    # otherwise the dtype override is silently ignored for that column.
    df_schema = {"title": object}
    try:
        # Wrap the text in a file-like object so it is parsed as JSON content
        user_input_df = pd.read_json(StringIO(user_input), lines=True, dtype=df_schema)
    except ValueError:
        return {"error": "Could not parse the payload as JSON Lines."}, 400

    if 'title' not in user_input_df.columns:
        return {"error": "Every record must contain a 'title' field."}, 400

    # Run predictions and convert to list
    predictions = model.predict(user_input_df['title']).tolist()

    return {'predictions': predictions}


# Step 5: Main function that actually runs the API!
if __name__ == '__main__':
    api.run(host='0.0.0.0',
            # Debug mode enables auto-reload and the interactive debugger. It is
            # off by default because this server listens on every interface;
            # set FLASK_DEBUG=1 to turn it on for local development only.
            debug=os.environ.get("FLASK_DEBUG", "0") == "1",
            port=int(os.environ.get("PORT", 8080))
           )

Overwriting inference.py


# Test the API
- To test out if our API is working, we first need to run the API code `inference.py`
- Open a new terminal window and navigate to this `solution-code` directory. You should find the `inference.py` file that we just created here.
- Run the file as a normal python file: `python inference.py`
- Now your API is running on your local computer and is ready to accept input data at `http://localhost:8080` URL
- We can interact with any route in the API simply by posting a request to that route. For example, type `http://localhost:8080/` in your browser and see what you get!
- To get predictions, we need to post our input data to the `/predict` route which gets appended at the end of the URL. So the URL will become `http://localhost:8080/predict`
- Let's take the last 10 rows of the same data we used to train the model and send them to the API for predictions

In [15]:
# Use the last ten titles as test data and serialise them as JSON Lines
user_input_df = X.iloc[-10:]
user_input = user_input_df.to_json(lines=True, orient="records")
# print(user_input) # displaying the JSONL

In [17]:
# Send the JSONL data as request to the API and print the response
import requests
import json

# The API is started on this machine with `python inference.py`, so address it
# via localhost rather than a machine-specific LAN IP.
api_url = 'http://localhost:8080'
api_route = '/predict'

response = requests.post(f'{api_url}{api_route}',
                         json=user_input,
                         timeout=30,  # fail instead of hanging forever if the API is not running
                        )
response.raise_for_status()  # surface HTTP errors before trying to decode the body

predictions = response.json()
print(predictions)

{'predictions': [1, 1, 1, 0, 1, 0, 1, 0, 0, 1]}


In [18]:
# Reference prediction taken straight from the dataframe (last ten titles).
# It matches the API response above, so nothing is lost by going through the Flask API.
model.predict(X['title'].tail(10))

array([1, 1, 1, 0, 1, 0, 1, 0, 0, 1], dtype=int64)

# Creating Dockerfile

In [24]:
%%writefile Dockerfile
# Use the official lightweight Python image from
# https://hub.docker.com/_/python
FROM python:3.8-slim

# Work from a dedicated directory instead of the image's filesystem root
WORKDIR /app

# Install all the necessary libraries first, so this layer is reused from the
# build cache when only the code or the model file changes
COPY deployment/requirements.txt ./deployment/requirements.txt
RUN pip install --no-cache-dir -r ./deployment/requirements.txt

# Copy all the files needed for the app to work
COPY inference.py .
COPY deployment/ ./deployment

# Run the API! Exec form makes python the container's main process, so it
# receives stop signals directly instead of through an intermediate shell.
CMD ["python", "inference.py"]

Overwriting Dockerfile


In [25]:
%%writefile deployment/requirements.txt
pandas
flask
mlflow-skinny
scikit-learn==1.1.1
# "imbalanced-learn" is the official distribution that provides the `imblearn` import package
imbalanced-learn

Overwriting deployment/requirements.txt


## Testing Dockerfile from Google Cloud Run API

In [41]:
# Take the last 10 rows of X to send to the API for predictions.
# A pandas dataframe cannot be posted to the API as-is, so it is serialised to
# a JSON Lines (JSONL) string, which can travel in the request body.

user_input_df = X.iloc[-10:]
user_input = user_input_df.to_json(lines=True, orient="records")  # dataframe -> JSONL
print(user_input)

{"title":"What are your thoughts on $SFT ? First post here"}
{"title":"Anyone notice how AH ceased Friday on the Gstop?"}
{"title":"Financial Advisors.."}
{"title":"Bullish on DoorDash"}
{"title":"What you think about Baba $"}
{"title":"I LOST EVERYTHING Lost over 10k on the AMD crash easier this year. Pulled my money out and went on with my life. I kept $500 in there waiting for the market to go regard again and here we are baby let\u2019s ride!"}
{"title":"Current market conditions"}
{"title":"Why y'all worried?"}
{"title":"JPOW and the fed trying to fix inflation and prevent the economy from going into a recession at the same time"}
{"title":"Hey guys a question"}



In [42]:
# Send the JSONL data as request to the API and print the response
import requests

api_url = 'https://subreddit-classification-runrqp42la-as.a.run.app'
api_route = '/predict'

# A timeout keeps the cell from hanging indefinitely if the service does not answer
response = requests.post(f'{api_url}{api_route}', json=user_input, timeout=30)
response.raise_for_status()  # surface HTTP errors before trying to decode the body
predictions = response.json()

print(predictions)

{'predictions': [1, 1, 1, 0, 1, 0, 1, 0, 0, 1]}


In [43]:
# Reference prediction taken straight from the dataframe (last ten titles).
# It matches the response above, so nothing is lost by going through the GCloud Run API.
model.predict(X.loc[:, 'title'].tail(10))

array([1, 1, 1, 0, 1, 0, 1, 0, 0, 1], dtype=int64)

# Creating Streamlit App

In [63]:
%%writefile streamlit_app.py
import json

import requests
import streamlit as st

# Page title, one-line description and a link to the project repository
st.title("📈 r/Stocks vs 💎🤲 r/WallStreetBets Classification")
st.header("This app will predict if a given post comes from r/Stocks or r/WSB, given its post title")
st.caption("For more details, please visit https://github.com/gilbertsg/stocks_subreddit_analysis")
st.header("")  # empty header acts as a vertical spacer

# Sample sentences the user can paste in to try the classifier
st.subheader("You may want to try the following stereotypical sentences from each subreddit:")
for example_caption in (
    '- typical r/Stocks sentence: "I am learning to invest in index funds to get good dividends in the long term"',
    '- typical r/WallStreetBets sentence: "Dumping all my life savings to GME tonight. GME TO THE MOON 🚀🚀🚀"',
):
    st.caption(example_caption)
st.header("")  # empty header acts as a vertical spacer

Overwriting streamlit_app.py


## Testing  the streamlit app
1. Open a new terminal window and navigate to this directory. You should find the `streamlit_app.py` file that we just created here.
1. Run: `streamlit run streamlit_app.py`

Done! Your streamlit app is now running and you can access it on your browser at the URL: http://localhost:8501

- Let's now add a text input for the post `title` and get its value from the user input in the UI
- We can use the `-a` flag in `%%writefile` to append lines to the same file instead of overwriting it

Adding inputs

In [64]:
%%writefile -a streamlit_app.py

# Free-text box where the user types the reddit post title
title = st.text_area("📚 Please input the reddit post title:")

# Package the title as a one-field record and echo it back to the page
user_input = dict(title=title)
st.write("User input:")
st.write(user_input)

Appending to streamlit_app.py


Adding output

In [65]:
%%writefile -a streamlit_app.py

# Code to post the user inputs to the API and get the predictions
# Paste the URL to your GCP Cloud Run API here!
api_url = 'https://subreddit-classification-runrqp42la-as.a.run.app'
api_route = '/predict'

# Add a submit button
# Streamlit reruns this whole script on every interaction, so the API is only
# called inside the button handler, not on each rerun.
if st.button("Submit"):
    if not user_input["title"].strip():
        # Nothing to classify yet; ask for input instead of calling the API
        st.warning("Please enter a post title first.")
    else:
        try:
            # json.dumps() converts the dict to a JSON string, which is the
            # payload format the /predict route parses
            response = requests.post(f'{api_url}{api_route}',
                                     json=json.dumps(user_input),
                                     timeout=30)
            response.raise_for_status()
            predictions = response.json()
        except (requests.RequestException, ValueError) as err:
            # Network failure, HTTP error status, or a body that is not JSON
            st.error(f"Could not get a prediction from the API: {err}")
        else:
            st.write(f"Prediction: {predictions['predictions'][0]}")

st.caption("1 indicates that the post is predicted to be from r/Stocks")
st.caption("0 indicates that the post is predicted to be from r/WallStreetBets")

Appending to streamlit_app.py
