# Cloud MongoDB + TFIDF + JobLib

- Build a model using MongoDB and TFIDF
- Deployd and predict a value

In [9]:
# Install
!pip install pymongo
!pip install dnspython
!pip install pymongo[srv]



In [10]:
# Imports
import pymongo
import pprint
import urllib.parse
import uuid
import flask
import json

import requests as r
import pandas as pd
import numpy as np
import joblib as jb

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

### MongoDb Connection

In [11]:
# Load the client
client = pymongo.MongoClient("mongodb+srv://dbuser:dbpassword@cluster0-tdmr9.mongodb.net/test?retryWrites=true&w=majority")

# Load the database
database = client['sample_training']

# Load a collection
companies = database['companies']

# Load the instances
instances = companies.find()

# Build a dataframe using the instances
dataframe = pd.DataFrame(instances)

### Cleaning, transforming and encoding of data

In [12]:
print('Shape of dataframe, before clean:', dataframe.shape)

#dataframe['category_code'].value_counts()
#dataframe['tag_list'].isnull().sum()
dataframe = dataframe[['tag_list', 'category_code']]
dataframe.dropna(inplace=True)

print('Shape of dataframe, after clean:', dataframe.shape)

Shape of dataframe, before clean: (9500, 42)
Shape of dataframe, after clean: (7192, 2)


In [13]:
def clean_text(value):
    if value:
        return value.lower().replace(r'[^\w\s]+', ' ').replace('...', '')
    else:
        return ''
    
def sort_values(value):
    if (value):
        value = value.split(', ')
        value.sort()
        return ' '.join(value)
    else:
        return ''
    
dataframe = dataframe.applymap(str)
dataframe['category_code'] = dataframe['category_code'].map(clean_text)

dataframe['tag_list'] = dataframe['tag_list'].map(clean_text)
#dataframe['tag_list'] = dataframe['tag_list'].map(sort_values)

dataframe.head()

Unnamed: 0,tag_list,category_code
0,"wiki, seattle, elowitz, media-industry, media-...",web
1,"facebook, college, students, profiles, network...",social
2,"storage, sharing, edit, online",network_hosting
3,"text, messaging, social, community, twitter, t...",social
4,"content-discovery, find, content, stumble, too...",web


In [14]:
# Label encoding
#features = ['category_code']
encoder = LabelEncoder()
dataframe['category_code'] = dataframe[['category_code']].apply(encoder.fit_transform)

dataframe.head()

Unnamed: 0,tag_list,category_code
0,"wiki, seattle, elowitz, media-industry, media-...",38
1,"facebook, college, students, profiles, network...",33
2,"storage, sharing, edit, online",23
3,"text, messaging, social, community, twitter, t...",33
4,"content-discovery, find, content, stumble, too...",38


### Train, Test and model

In [17]:
X_train = dataframe['tag_list'].iloc[:dataframe.shape[0] // 2]
y_train = dataframe['category_code'].iloc[:dataframe.shape[0] // 2]

X_test = dataframe['tag_list'].iloc[dataframe.shape[0] // 2:]
y_test = dataframe['category_code'].iloc[dataframe.shape[0] // 2:]

In [None]:
#mdl = make_pipeline(TfidfVectorizer(ngram_range=(1,2)), 
# RandomForestClassifier(n_estimators=1000, max_depth =100, n_jobs=6, random_state=0))

#mdl.fit(X_train, y_train)
#predict_rfc = mdl.predict_proba(X_test)

In [None]:
#predict_rfc_list = []
#for row in predict_rfc:
#    idmax = pd.Series(row).idxmax()
#    predict_rfc_list.append(idmax)

#df_rfc = pd.DataFrame()
#df_rfc['y'] = y_test
#df_rfc['rfc'] = predict_rfc_list

#df_rfc.loc[df_rfc['y'] == df_rfc['rfc']].shape

In [18]:
mdl = make_pipeline(TfidfVectorizer(ngram_range=(1,2)), 
                    LogisticRegression(penalty='l1', multi_class='auto', C = 100,random_state = 0, solver='liblinear'))

mdl.fit(X_train, y_train)
predict_lr = mdl.predict_proba(X_test)



In [None]:
#predict_lr_list = []
#for row in predict_lr:
#    idmax = pd.Series(row).idxmax()
#    predict_lr_list.append(idmax)

#df_lr = pd.DataFrame()
#df_lr['y'] = y_test
#df_lr['lr'] = predict_lr_list

#df_lr.loc[df_lr['y'] == df_lr['lr']].shape

In [58]:
encoder_map = {l: i for i, l in enumerate(encoder.classes_)}
encoder_map

def get_category(target):
    for key, value in encoder_map.items():
        if value == target:
            return key

def predict_category(tags):
    predict = mdl.predict_proba([tags])
    #print(predict)
    key = pd.Series(predict[0]).idxmax()
    #print(key)
    return get_category(key)

In [60]:
#get_category(0)
predict_category('machine-learning digital web')

'games_video'

### Build dump of model

In [39]:
jb.dump(mdl, "mdl.pkl.z")

['mdl.pkl.z']

### Test dump

In [57]:
encoded =  urllib.parse.quote("machine-learning digital web")

response = r.get("http://localhost:5000/category_code?tags={}".format(encoded))
json_data = json.loads(response.text)
get_category(json_data['category_code'])

'games_video'