# Cloud MongoDB + NLTK + TfidfVectorizer + LogisticRegression

- Build a dataframe using a MongoDB data
- Apply NLTK functions 
- Build a model using TFIDF and LogisticRegression

In [1]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports
import pymongo

import pandas as pd
import numpy as np
import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
nltk.download('rslp')
nltk.download('punkt')
nltk.download('wordnet')

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dfds\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\dfds\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dfds\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dfds\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## MongoDb Connection

In [3]:
# Load the client
client = pymongo.MongoClient("mongodb+srv://dbuser:dbpassword@cluster0-tdmr9.mongodb.net/test?retryWrites=true&w=majority")

# Load the database
database = client['sample_training']

# Load a collection
companies = database['stories']

# Load the instances
instances = companies.find()

# Build a dataframe using the instances
df_stories = pd.DataFrame(instances)

## Build the dataframe

In [4]:
# Load the target features
df_stories = df_stories[['title', 'description', 'topic']]

# Define funciton to load the topic value
def load_topic(value):
    return value['name']

# Load the topic value
df_stories['topic_name'] = df_stories['topic'].map(load_topic)

# Remove the unnecessary column
del df_stories['topic']

df_stories.head()

Unnamed: 0,title,description,topic_name
0,The Swastikas Hockey Team of 1916...WTF? (PIC),"Before it became associated with the Nazis, sw...",Odd Stuff
1,Are Americans Giving Up On The Environment?,A survey released yesterday shows just 34 perc...,Environment
2,Obama to be interviewed on Fox News,Barack Obama must really want health care to p...,Television
3,Workers Crushed by Toyota,Though Toyota has reaped endless benefits from...,Business & Finance
4,NASA - The Wizard Nebula,"This image of the open star cluster NGC 7380, ...",Space


## Clear the dataframe

In [5]:
df_stories.replace('', np.nan, inplace=True)
df_stories.dropna(how='all', inplace=True)

df_stories.head()

Unnamed: 0,title,description,topic_name
0,The Swastikas Hockey Team of 1916...WTF? (PIC),"Before it became associated with the Nazis, sw...",Odd Stuff
1,Are Americans Giving Up On The Environment?,A survey released yesterday shows just 34 perc...,Environment
2,Obama to be interviewed on Fox News,Barack Obama must really want health care to p...,Television
3,Workers Crushed by Toyota,Though Toyota has reaped endless benefits from...,Business & Finance
4,NASA - The Wizard Nebula,"This image of the open star cluster NGC 7380, ...",Space


## NLTK: Instances and functions

In [6]:
# Load the instances of NLTK objects
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.RSLPStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Remove unnecessary words in the text
def apply_stopwords(text):
    words = [i for i in text.split() if not i in stopwords]
    return (' '.join(words))

# Replace words by the 'radical'
def apply_stermin(text):
    words = []
    for word in text.split():
        words.append(stemmer.stem(word))
    return (' '.join(words))
    
# Avoid conflict with radicals
def apply_limmatizer(text):
    words = []
    for word in text.split():
        words.append(lemmatizer.lemmatize(word))
    return (' '.join(words))

### Testing NLTK

In [7]:
description = df_stories['description'][0]
description_sw = apply_stopwords(description)
description_sm = apply_stermin(description)
description_lm = apply_limmatizer(description)

print('Normal: \n -', description, '\n')
print('Apply stopwords: \n -', description_sw, '\n')
print('Apply stermin: \n -', description_sm, '\n')
print('Apply limmatizer: \n -',description_lm, '\n')

Normal: 
 - Before it became associated with the Nazis, swastikas had been used for hundreds of years as a symbol of good luck and prosperity. 

Apply stopwords: 
 - Before became associated Nazis, swastikas used hundreds years symbol good luck prosperity. 

Apply stermin: 
 - befor it becam associated with the nazis, swastik had been used for hundred of ye as a symbol of good luck and prosperity. 

Apply limmatizer: 
 - Before it became associated with the Nazis, swastika had been used for hundred of year a a symbol of good luck and prosperity. 



## Applying NLTK in the dataframe..

In [8]:
# Apply stopwords and stermin in a text
def apply_nltk(text):
    draft = apply_stopwords(text)
    # The stermin was not useful
    #draft = apply_stermin(text)
    return draft

In [9]:
# Set all features as string
df_stories = df_stories.applymap(str)

# Applying NLTK in the title and description
df_stories['title'] = df_stories['title'].map(apply_nltk)
df_stories['description'] = df_stories['description'].map(apply_nltk)

df_stories.head()

Unnamed: 0,title,description,topic_name
0,The Swastikas Hockey Team 1916...WTF? (PIC),"Before became associated Nazis, swastikas used...",Odd Stuff
1,Are Americans Giving Up On The Environment?,A survey released yesterday shows 34 percent p...,Environment
2,Obama interviewed Fox News,Barack Obama must really want health care pass.,Television
3,Workers Crushed Toyota,Though Toyota reaped endless benefits Californ...,Business & Finance
4,NASA - The Wizard Nebula,"This image open star cluster NGC 7380, also kn...",Space


## Define the train and test data

In [10]:
X_train = df_stories['description'].iloc[:df_stories.shape[0] // 2]
y_train = df_stories['topic_name'].iloc[:df_stories.shape[0] // 2]

X_test = df_stories['description'].iloc[df_stories.shape[0] // 2:]
y_test = df_stories['topic_name'].iloc[df_stories.shape[0] // 2:]

## Pipeline: TfidfVectorizer + LogisticRegression

In [11]:
mdl = make_pipeline(TfidfVectorizer(ngram_range=(1,2)), 
                    LogisticRegression(penalty='l1', multi_class='auto', C = 100,random_state = 0, solver='liblinear'))

mdl.fit(X_train, y_train)
predict_lr = mdl.predict_proba(X_test)



## Predict: Map and Function

In [39]:
encoder = LabelEncoder()

topics = df_stories[['topic_name']].apply(encoder.fit_transform)
encoder_map = {l: i for i, l in enumerate(encoder.classes_)}

In [32]:
def get_topic(target):
    for key, value in encoder_map.items():
        if value == target:
            return key

def predict_topic(tags):
    print(tags)
    predict = mdl.predict_proba([tags])
    key = pd.Series(predict[0]).idxmax()
    return get_category(key)

In [34]:
df_stories['predict'] = df_stories['description'].map(predict_category)

df_stories.head()

Unnamed: 0,title,description,topic_name,predict
0,The Swastikas Hockey Team 1916...WTF? (PIC),"Before became associated Nazis, swastikas used...",Odd Stuff,Odd Stuff
1,Are Americans Giving Up On The Environment?,A survey released yesterday shows 34 percent p...,Environment,Environment
2,Obama interviewed Fox News,Barack Obama must really want health care pass.,Television,Television
3,Workers Crushed Toyota,Though Toyota reaped endless benefits Californ...,Business & Finance,Business & Finance
4,NASA - The Wizard Nebula,"This image open star cluster NGC 7380, also kn...",Space,Space


In [37]:
df_stories.shape

(9842, 4)

In [38]:
df_stories[df_stories['topic_name'] != df_stories['predict']].shape

(3575, 4)