In [1]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

try:
    from sklearn.externals import joblib
except ImportError:  # sklearn.externals.joblib was removed in scikit-learn 0.23
    import joblib

In [4]:
# Fetch the NLTK data packages this notebook needs: 'stopwords' backs the
# stopwords.words('english') call further down; 'punkt' is presumably the
# tokenizer data behind sent_tokenize / word_tokenize -- confirm for the
# installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vishnu.raju\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vishnu.raju\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the persisted artefacts used by the prediction helpers below.
# `joblib` comes from the import cell at the top of the notebook; it is not
# re-imported here so that all imports live in one place.
#
# SECURITY NOTE: joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load .pkl files that come from a trusted source.
loaded_model = joblib.load('LRmodel.pkl')        # classifier; .predict() is called on it below
vectorizer = joblib.load('vectorizer.pkl')       # text vectorizer; .transform() is called on it below
labelEncoder = joblib.load('labelEncoder.pkl')   # maps predicted codes back to category labels

In [4]:
# Data cleaning
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english')  # or any other list of your choice

# Every ASCII punctuation character except the hyphen; compiled once instead
# of on every clean_text call.
_PUNCTUATION_PATTERN = re.compile(
    '[{}]'.format(re.escape(string.punctuation.replace('-', ''))))


def _tokenize_text(text):
    """Split text into sentences, then each sentence into word tokens."""
    return [w for s in sent_tokenize(text) for w in word_tokenize(s)]


def clean_text(text):
    """Normalise a raw text description into a space-separated token string.

    Steps, applied in this order:
      1. remove every run of digits,
      2. lowercase,
      3. stem each token with ``default_stemmer``,
      4. strip punctuation characters (hyphens are kept) and drop tokens
         that become empty,
      5. drop tokens found in ``default_stopwords``.

    The text is re-tokenised between steps 3, 4 and 5, exactly as the
    original implementation did, so string inputs produce the same output
    as before.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (what ``pandas.read_csv`` yields for
        an empty cell) are treated as empty text; any other non-string value
        is converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' for missing input).
    """
    # Missing values used to reach re.sub and raise TypeError, aborting a
    # whole DataFrame.apply run because of one empty cell.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[0-9]+', '', text)
    text = text.lower()  # lowercase

    # Stemming.
    # NOTE(review): stemming runs before stopword removal, so a stopword
    # whose stem differs from the word itself will not match the stopword
    # list below. The order is kept because changing it would change the
    # features fed to the already-trained model -- confirm before altering.
    stemmer = default_stemmer
    text = ' '.join(stemmer.stem(t) for t in _tokenize_text(text))

    # Remove punctuation and symbols; filter(None, ...) discards tokens that
    # consisted only of punctuation.
    stripped = [_PUNCTUATION_PATTERN.sub('', t) for t in _tokenize_text(text)]
    text = ' '.join(filter(None, stripped))

    # Remove stopwords. The set is built per call so that a reassigned
    # default_stopwords is still picked up, while membership tests are O(1)
    # instead of a linear scan of the list for every token.
    stop_lookup = set(default_stopwords)
    text = ' '.join(t for t in _tokenize_text(text) if t not in stop_lookup)

    return text

In [5]:
def predictFromFile(filename, output_path='Output_of_model'):
    """Predict a category for every row of a CSV file and save the result.

    The file is read with ``pandas.read_csv`` and its columns are renamed to
    ``Product`` and ``Description``, so it is expected to contain exactly two
    columns. Each description is cleaned with ``clean_text``, vectorized with
    the loaded ``vectorizer``, classified with ``loaded_model`` and mapped
    back to a label with ``labelEncoder``.

    Parameters
    ----------
    filename : str
        Path of the input CSV file.
    output_path : str, optional
        Where the annotated table is written as CSV. Defaults to
        'Output_of_model', the location this function has always used.

    Returns
    -------
    pandas.DataFrame
        The input table with an added 'Predicted Category' column.
    """
    data = pd.read_csv(filename)
    data.columns = ['Product', 'Description']

    # Clean the descriptions, then vectorize them with the saved vectorizer.
    cleaned_descriptions = data['Description'].apply(clean_text)
    features = vectorizer.transform(cleaned_descriptions)

    # Make predictions using the loaded model.
    encoded_predictions = loaded_model.predict(features)

    # Inverse transform to get the category labels.
    data['Predicted Category'] = labelEncoder.inverse_transform(encoded_predictions)

    # Save the annotated table as CSV.
    data.to_csv(output_path)
    print('File Generated!')
    return data

In [12]:
# Run the batch prediction on the sample file; the returned frame is kept in
# `data` for the inspection cells below.
data = predictFromFile('test_table.csv')

File Generated!


  if diff:


In [10]:
def predictFromText(text):
    """Return the predicted category label for a piece of text.

    The input is wrapped in a pandas Series, passed through ``clean_text``,
    vectorized with the loaded ``vectorizer`` and classified with
    ``loaded_model``; ``labelEncoder`` turns the predicted code back into a
    label. Only the first prediction is returned.
    """
    cleaned = pd.Series(text).apply(clean_text)
    features = vectorizer.transform(cleaned)
    encoded = loaded_model.predict(features)
    labels = labelEncoder.inverse_transform(encoded)
    return labels[0]

In [12]:
# Spot-check the single-text entry point with one product description.
predictFromText('BT Diverse 7450 DECT Phone BT')

  if diff:


'Corded Telephones'

In [16]:
# Inspect the last rows of the annotated frame returned by predictFromFile.
data.tail()

Unnamed: 0,Product,Description,Predicted Category
44,7249729,A2010 Semi Circular Table White 1600mm,Tables
45,7249730,A2010 Circular Table White 850mm,Tables
46,7249731,Radial Desk White A2010 1600 x 1600mm,Desks and Workstations
47,7249732,A2010 Wave Desk White Left Hand 1400mm,Desks and Workstations
48,7249733,A2010 Circular Table White 1200mm 725H x 1200D mm,Tables


In [19]:
# Serialise the annotated frame with orient='records': a JSON array holding
# one object per row.
data.to_json(orient='records')

'[{"Product":9683165,"Description":"FLUTE OUTFIT ODYSSEY","Predicted Category":"Standard"},{"Product":202859,"Description":"Oki C9200\\/C9400 Toner Cartridge 41515210 Magenta","Predicted Category":"Toner Cartridges"},{"Product":9683166,"Description":"CLARINET OUTFIT ODYSSEY","Predicted Category":"Standard"},{"Product":202858,"Description":"Oki C9200\\/C9400 Toner Cartridge 41515211 Cyan","Predicted Category":"Toner Cartridges"},{"Product":9683167,"Description":"ALTO SAX OUTFIT ODYSSEY","Predicted Category":"Standard"},{"Product":202853,"Description":"Oki C9200\\/C9400 Drum Unit 41514712 Black","Predicted Category":"Printing Machine Drums"},{"Product":7249420,"Description":"Alba Wired Wall Display Pockets  7 x A4 - 1120H x 250W x 130D mm","Predicted Category":"Literature Organisers"},{"Product":7249421,"Description":"Alba Floor Standing Literature Display - Mobile Triple 1660H x 820W x 510D mm","Predicted Category":"Literature Organisers"},{"Product":7249426,"Description":"BT Diverse 71