# Imports and Settings

In [1]:
## library imports

# data processing imports
import pandas as pd
import numpy as np

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

# modelling imports
from sklearn.model_selection import train_test_split
import imblearn

# metric imports
from sklearn import metrics 

# misc imports
import datetime
from dateutil.relativedelta import *
import mlflow

# nlp imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from emoji import demojize

In [2]:
## Settings
sns.set_style("darkgrid") # setting seaborn style

# enable inline plotting
%matplotlib inline 

pd.set_option('display.max_columns', None) # disable column overflow in pandas display, this is toenable us to display all the columns

In [3]:
# presentation_plotting = False # don't comment this part out

# # Settings for presentation plotting

# sns.set(style="darkgrid", context="talk", palette="Set2")

# sns.color_palette("pastel")

# plt.rcParams.update({"axes.facecolor":'#10131a', 
#                      "figure.facecolor":'#1b212c'})

# textcolor = 'white'
# plt.rcParams.update({"text.color":textcolor, 
#                      "axes.labelcolor":textcolor,
#                      "xtick.color":textcolor,
#                      "ytick.color":textcolor})


# plt.rcParams.update({"grid.linewidth":0.5, 
#                      "grid.alpha":0.5})
# plt.rcParams["figure.figsize"] = (12, 9)

# presentation_plotting = True


# # commented out for notebook setting

# Data Imports

In [4]:
df = pd.read_csv('data/df_cleaned.csv') # importing data from saved csv file
df['created'] = pd.to_datetime(df['created']) # convert the created column to a datetime column
df.head() # displaying the top 5 row of the loaded dataframe

Unnamed: 0,author,created_utc,id,is_self,num_comments,score,selftext,title,upvote_ratio,created,subreddit,subreddit_stocks,title_char_len,title_word_len,score_pmaw
0,cloudboyy,1627751015,ovatu6,False,1,1,,Any thoughts on OSTK? It sold out of the wedge...,1.0,2021-08-01 01:03:35,wsb,0,208,40,1
1,itbc1info,1627751020,ovatwa,False,2,2,,"U.S. prosecutors charge Trevor Milton, founder...",1.0,2021-08-01 01:03:40,wsb,0,61,8,1
2,dadryp,1627751064,ovaudw,True,0,1,,Stocks have been horrible for me 2021,1.0,2021-08-01 01:04:24,stocks,1,37,7,1
3,insta_man,1627751065,ovauee,True,40,7,Long story I started investing in the market w...,Want to get back into investing. Any tips?,1.0,2021-08-01 01:04:25,stocks,1,42,8,1
4,PenIslandGaylien,1627751202,ovavxl,True,1,2,So I already contributed 6k to my Roth in 2021...,Roth Income Limits,1.0,2021-08-01 01:06:42,wsb,0,18,3,1


# Model Setup

## Setting Features (X) and Target Variables (y)

In [84]:
X = df['title']
y = df['subreddit_stocks']

# Modelling (CVEC + Multinomial NB | SMOTE Oversampling)

In [85]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

model = Pipeline([('count_vectorizer',CountVectorizer()),
                  ('SMOTE',SMOTE(n_jobs=-1,random_state=42)),
                  ('multinomialNB',MultinomialNB())
                 ])

In [86]:
model.fit(X,y)

In [87]:
pipeline_predict = model.predict(X)

In [88]:
import pickle

pickle.dump(model,open('model.sav','wb'))

# Create an inference.py file

In [118]:
%%writefile inference.py
from flask import Flask, request
import pandas as pd
import os
import pickle

# Step 2: Instantiate the Flask API
api = Flask('ModelEndpoint')

# Step 3: Load the model
model = pickle.load(open('model.sav','rb'))

# Step 4: Create the routes
## route 1: Health check. Just return success if the API is running
@api.route('/')
def home():
    # return a simple string
    return {"message": "Hi there!", "success": True}, 200

# route 2: accept input data
# Post method is used when we want to receive some data from the user
@api.route('/predict', methods = ['POST'])
def make_predictions():
    # Get the data sent over the API
    user_input = request.get_json(force=True)
    
    # Convert user inputs to pandas dataframe
    df_schema = {"gre":float, "gpa": float, "post_title":string} # To ensure the columns get the correct datatype
    user_input_df = pd.read_json(user_input, lines=True, dtype=df_schema) # Convert JSONL to dataframe
    
    # Run predictions and convert to list
    predictions = model.predict(user_input_df).tolist()
    
    return {'predictions': predictions}
    

# Step 5: Main function that actually runs the API!
if __name__ == '__main__':
    api.run(host='0.0.0.0', 
            debug=True, # Debug=True ensures any changes to inference.py automatically updates the running API
            port=int(os.environ.get("PORT", 8080))
           ) 

Overwriting inference.py


# Test the API
- To test out if our API is working, we first need to run the API code `inference.py`
- Open a new terminal window and navigate to this `solution-code` directory. You should find the `inference.py` file that we just created here.
- Run the file as a normal python file: `python inference.py`
- Now your API is running on your local computer and is ready to accept input data at `http://localhost:8080` URL
- We can interact with any route in the API simply by posting a request to that route. For example, type `http://localhost:8080/` in your browser and see what you get!
- To get predictions, we need to post our input data to the `/predict` route which gets appended at the end of the URL. So the URL will become `http://localhost:8080/predict`
- Let's load the same data we used to train the model and send the first 5 rows to the API for predictions

In [146]:
user_input_df = X.head(10)

user_input = json.dumps(user_input_df.to_list())

In [147]:
print(user_input)

["Any thoughts on OSTK? It sold out of the wedge but financials look great to me. Annual sales almost = market cap, order numbers down but average order price is up. No company debt. Let me know what you think!", "U.S. prosecutors charge Trevor Milton, founder of electric...", "Stocks have been horrible for me 2021", "Want to get back into investing. Any tips?", "Roth Income Limits", "Does dividend investing suck?", "What happened to the Glory Days?", "3 GOP Congressmen face ethics complaints for failing to disclose $22 million in stock trades", "Let's not forget we Like the Stock and believe in the DD but let's also support our company with more than Buying and Holding. Take a Friend, your Wife, Your Child or A loved one to watch a Movie and Enjoy a Popcorn. \ud83c\udf7f #Apestrong \ud83d\udc8e\ud83d\ude4c\ud83c\udffe\ud83e\udd8d\ud83d\ude80\ud83c\udf15", "BABA SHORT SQEEZE @ 160"]


In [148]:
# Send the JSONL data as request to the API and print the response
import requests
import json 

api_url = 'http://localhost:8080'
api_route = '/predict'

response = requests.post(f'{api_url}{api_route}', json=user_input)
predictions = response.json()

print(predictions)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
user_input_df

In [149]:
# model_load = pickle.load(open('model.sav','rb'))
model_load.predict(X)

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [150]:
X

0         Any thoughts on OSTK? It sold out of the wedge...
1         U.S. prosecutors charge Trevor Milton, founder...
2                     Stocks have been horrible for me 2021
3                Want to get back into investing. Any tips?
4                                        Roth Income Limits
                                ...                        
272411    I LOST EVERYTHING Lost over 10k on the AMD cra...
272412                            Current market conditions
272413                                   Why y'all worried?
272414    JPOW and the fed trying to fix inflation and p...
272415                                  Hey guys a question
Name: title, Length: 272416, dtype: object