## To test result:
1. Extract unknown category data from Hadoop and create a list
2. Feed the data into machine learning preprocessing
3. Extract the data and remove unwanted content

In [1]:
import pandas as pd

# Read the spreadsheet that holds the Hadoop query output (path is relative to the notebook).
df_url = pd.read_excel('./input2.xlsx') # fetch data from the Hadoop output
# Replace the sheet's header with the 13 column names used in the rest of the notebook.
df_url.columns = ['hour_id','msisdn','imsi','imei','host','url','domain_id','subdomain_id','conn_dur_ms','up_byte','down_byte','cdr_cnt','date_id']
# Preview the first rows.
df_url.head()

Unnamed: 0,hour_id,msisdn,imsi,imei,host,url,domain_id,subdomain_id,conn_dur_ms,up_byte,down_byte,cdr_cnt,date_id
0,10,0,0,0,www.lazada.com.my,edifier-luna-e235-thx-home-theatre-speaker-sys...,6,29,0,2211,12055,28,2017xxxx
1,12,0,0,0,www.lazada.com.my,huawei-honor-8-lite-pra-al00x-5234-smartphone-...,-1,-1,0,452,228,6,2017xxxx
2,12,0,0,0,www.lazada.com.my,bathroom-hand-washing-room-double-layer-toilet...,-1,-1,61284,850,687,2,2017xxxx
3,12,0,0,0,www.lazada.com.my,kindle-e-reader-black-6-glare-free-touchscreen...,-1,-1,0,452,228,6,2017xxxx
4,12,0,0,0,www.lazada.com.my,new-m88-smart-watch-wristband-smartband-heart-...,-1,-1,0,452,228,6,2017xxxx


In [42]:
# Rebuild the full page address for every row (scheme + host + path), then
# keep each distinct address once so every page is only scraped a single time.
url_list = ('https://' + df_url['host'] + '/' + df_url['url']).unique()
for full_url in url_list:
    print(full_url)

https://www.lazada.com.my/edifier-luna-e235-thx-home-theatre-speaker-system-with-bluetooth-function-26673440.html?ff=1
https://www.lazada.com.my/huawei-honor-8-lite-pra-al00x-5234-smartphone-32gb-us-plug-54375703.html?ff=1&sc=EwI=
https://www.lazada.com.my/bathroom-hand-washing-room-double-layer-toilet-storage-rack-shelf-98460754.html?ff=1&sc=IYsE
https://www.lazada.com.my/kindle-e-reader-black-6-glare-free-touchscreen-displaywi-fiblack-69131774.html?spm=a2o4k.search.0.0.568db469LP6ncW&ff=1
https://www.lazada.com.my/new-m88-smart-watch-wristband-smartband-heart-rate-blood-pressure-monitor-fitness-tracker-bracelet-pedometer-for-ios-android-pk-mi-band-2-fitbit-72945202.html?ff=1&sc=Eeog
https://www.lazada.com.my/canon-powershot-g7x-mkii-black-16gb-1-year-warranty-by-canon-malaysia-24992697.html?spm=a2o4k.category-040500000000.0.0.5304c9d3oF85AA&ff=1&sc=EfU=


## Load the trained classifiers

In [43]:
# Turn off the warnings shown while unpickling the saved classifiers.
# NOTE: no category is given, so this silences ALL warnings for the rest of
# the session, not only the pickle-related ones.
import warnings
warnings.filterwarnings('ignore')

In [44]:
#import all required package
from nltk.tokenize import word_tokenize
from nltk import pos_tag, FreqDist
import random
import pickle

from nltk.classify import ClassifierI
from statistics import mode, StatisticsError 

from nltk import classify

In [45]:
class IABClassification:
    """Facade that bundles the saved voting ensemble with its feature vocabulary."""

    def __init__(self):
        """Load the pickled classifiers and the word-feature list from disk."""
        self.votedClassifier = VoteClassifier(LoadClassifiers())
        self.new_features = LoadFeatures()

    def Analyse(self, text):
        """Classify ``text`` and return a ``(label, confidence)`` pair.

        The text is first turned into a word-presence feature dict, which is
        then handed to the voting classifier once for the label and once for
        the confidence.
        """
        feature_map = find_features(text, self.new_features)
        voter = self.votedClassifier
        label = voter.classify(feature_map)
        return label, voter.confidence(feature_map)

In [46]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a list of already-trained classifiers.

    Each wrapped classifier must expose ``classify(features)`` and return a
    hashable label.
    """

    def __init__(self, classifiers):
        """Store the classifiers whose votes will be combined."""
        self._classifiers = classifiers

    def _votes(self, features):
        """Return every classifier's predicted label, in classifier order."""
        return [c.classify(features) for c in self._classifiers]

    @staticmethod
    def _winner(votes):
        """Return the most frequent label; ties go to the label seen first.

        The tally dict keeps insertion order and ``max`` returns the first
        maximal key, so a tie resolves deterministically instead of raising.
        ``votes`` must be non-empty.
        """
        tally = {}
        for label in votes:
            tally[label] = tally.get(label, 0) + 1
        return max(tally, key=tally.get)

    def classify(self, features):
        """Return the label chosen by most classifiers.

        Returns ``None`` (after printing "Caught1") when there are no
        classifiers to vote. The previous implementation reached
        ``return ret`` with ``ret`` unbound in that case, and also on a tie
        under Python < 3.8, and crashed with ``UnboundLocalError``.
        """
        votes = self._votes(features)
        if not votes:
            print("Caught1")
            return None
        return self._winner(votes)

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label.

        Falls back to 0.5 (after printing "Caught2") when there are no votes.
        Ties are resolved the same way as in ``classify`` so both methods
        always describe the same label.
        """
        votes = self._votes(features)
        if not votes:
            print("Caught2")
            return 0.5
        return votes.count(self._winner(votes)) / len(votes)

In [47]:
def find_features(document, word_features):
    """Map every feature word to whether it occurs as a token in ``document``.

    Args:
        document: Raw text; tokenised with ``word_tokenize``.
        word_features: Iterable of feature words; becomes the keys of the result.

    Returns:
        dict of ``{feature_word: bool}`` in ``word_features`` order.
    """
    # Build the token set once so each lookup is O(1) instead of rescanning
    # the token list for every feature word.
    tokens_present = set(word_tokenize(document))
    return {w: (w in tokens_present) for w in word_features}


def LoadData(shuffle=False, train_size=10000):
    """Rebuild the labelled feature sets from the pickled documents and vocabulary.

    Args:
        shuffle: When true, shuffle the feature sets in place before splitting.
            No seed is set here, so the split then differs between runs.
        train_size: Number of leading feature sets that go into the training
            split (default 10000, the previously hard-coded value).

    Returns:
        ``(training_set, testing_set)``, each a list of
        ``(feature_dict, category)`` tuples.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this project wrote itself.
    # ``with`` guarantees the handles are closed even if unpickling fails.
    with open("saved/documents.p", "rb") as documents_f:
        documents = pickle.load(documents_f)

    with open("saved/word_features5k.p", "rb") as features_f:
        word_features = pickle.load(features_f)

    features = [(find_features(rev, word_features), category) for (rev, category) in documents]

    if shuffle:
        random.shuffle(features)

    training_set = features[:train_size]
    testing_set = features[train_size:]

    return training_set, testing_set


def LoadClassifiers():
    """Unpickle every saved classifier listed in ``saved/classifier_name.p``.

    Returns:
        list of the unpickled classifier objects, in the order their names
        appear in the name file.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this project wrote itself.
    # ``with`` guarantees each handle is closed even if unpickling fails.
    with open("saved/classifier_name.p", "rb") as names_f:
        classifier_name = pickle.load(names_f)

    classifiers = []
    for name in classifier_name:
        with open("saved/" + name + ".p", "rb") as classifier_f:
            classifiers.append(pickle.load(classifier_f))

    return classifiers


def LoadFeatures():
    """Return the word-feature list unpickled from ``saved/word_features5k.p``."""
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this project wrote itself.
    # ``with`` guarantees the handle is closed even if unpickling fails.
    with open("saved/word_features5k.p", "rb") as documents_f:
        return pickle.load(documents_f)

In [48]:
# prepare the data
def PrepareData():
    """Build labelled feature sets from the raw training text and persist the corpus.

    Each of the five texts returned by ``DataSources()`` is split on newlines;
    every line becomes one document labelled with that source's category.
    Words tagged as adjective, adverb or verb are collected (lower-cased) as
    the feature vocabulary.

    Side effects:
        Writes ``saved/documents.p`` (list of ``(text, category)`` tuples) and
        ``saved/word_features5k.p`` (the vocabulary list). Despite the file
        name, the vocabulary is NOT truncated to 5000 entries.

    Returns:
        list of ``(feature_dict, category)`` tuples, one per document.
    """
    train_620, train_621, train_622, train_623, train_624 = DataSources()

    # One (raw text, category label) pair per source, in the order the
    # documents must appear in the saved corpus.
    labelled_sources = [
        (train_620, "Others"),
        (train_621, "Consumer Electronics > Cameras and Camcorders"),
        (train_622, "Consumer Electronics > Home Entertainment Systems"),
        (train_623, "Technology & Computing > Consumer Electronics > Smartphones"),
        (train_624, "Technology & Computing > Consumer Electronics > Tablets and E-readers"),
    ]

    # POS tags starting with J are adjectives, R adverbs and V verbs.
    allowed_word_types = ("J", "R", "V")

    documents = []
    all_words = []
    for raw_text, category in labelled_sources:
        for p in raw_text.split('\n'):
            documents.append((p, category))
            for word, tag in pos_tag(word_tokenize(p)):
                # startswith() also copes with an empty tag, where indexing
                # tag[0] would raise IndexError.
                if tag.startswith(allowed_word_types):
                    all_words.append(word.lower())

    with open("saved/documents.p", "wb") as save_documents:
        pickle.dump(documents, save_documents)

    # FreqDist keys give the distinct words; all of them are kept.
    word_features = list(FreqDist(all_words).keys())

    with open("saved/word_features5k.p", "wb") as save_word_features:
        pickle.dump(word_features, save_word_features)

    features = [(find_features(rev, word_features), category) for (rev, category) in documents]
    return features


def find_features(document, word_features):
    """Map every feature word to whether it occurs as a token in ``document``.

    NOTE: this repeats the ``find_features`` defined in an earlier cell; the
    definition executed last is the one in effect.

    Args:
        document: Raw text; tokenised with ``word_tokenize``.
        word_features: Iterable of feature words; becomes the keys of the result.

    Returns:
        dict of ``{feature_word: bool}`` in ``word_features`` order.
    """
    # Build the token set once so each lookup is O(1) instead of rescanning
    # the token list for every feature word.
    tokens_present = set(word_tokenize(document))
    return {w: (w in tokens_present) for w in word_features}


def TestTrainData(train_size=100):
    """Prepare the feature sets, shuffle them and split into train/test parts.

    Args:
        train_size: Number of shuffled feature sets placed in the training
            split (default 100, the previously hard-coded value). Everything
            after that goes to the testing split.

    Returns:
        ``(training_set, testing_set)`` lists of ``(feature_dict, category)``.

    Note:
        The shuffle uses the global ``random`` state and no seed is set here,
        so the split differs between runs unless the caller seeds ``random``.
    """
    featuresets = PrepareData()
    random.shuffle(featuresets)
    training_set = featuresets[:train_size]
    testing_set = featuresets[train_size:]
    return training_set, testing_set

## Test with data extracted from Hadoop

In [49]:
# import all the required package
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup

import re
import pandas as pd
import numpy as np
import csv

In [50]:
# Load the IAB classification wrapper; its constructor unpickles the saved
# classifiers and the word-feature list from the saved/ directory.
s = IABClassification()

In [None]:
def gettingResult(pageSource):
    """Extract the product text from a page's HTML, classify it and print the result.

    Args:
        pageSource: HTML of the product page. The argument is now actually
            used; previously it was overwritten with the global
            ``browser.page_source``, so the function only worked while a
            global ``browser`` existed.

    Returns:
        None. The ``(category, confidence)`` tuple from ``s.Analyse`` is printed.

    Raises:
        AttributeError: if the title, description block or attribute list
            cannot be found in the page.
    """
    bsObj = BeautifulSoup(pageSource,'lxml')

    # The title lives under one of two selectors; fall back to the second
    # only when the first is absent.
    title_node = bsObj.find(id = 'prod_title')
    if title_node is None:
        title_node = bsObj.find(class_ = 'product-description__title')

    title = title_node.text.strip()
    description = bsObj.find(class_='product-description__block').getText(' ').strip().replace("\n"," ").replace("\t"," ")
    attributes = bsObj.find(class_='prd-attributesList ui-listBulleted js-short-description').getText(' ').strip()

    # NOTE(review): the three parts are joined without a separator, exactly as
    # before, so the last title word fuses with the first description word.
    # Left unchanged in case the training text was built the same way -- confirm.
    result = title + description + attributes

    # Normalise: lower-case, brackets to spaces, double to single quotes,
    # "/" to "or", then collapse runs of spaces.
    result2 = re.sub(' +',' ',result.lower().strip().replace(")"," ").replace("("," ").replace("\"","\'").replace("\n"," ").replace("/","or"))
    print(s.Analyse(result2),'\n')
    return

### Result Testing
Using the unknown-category URLs that we extracted from Hadoop, we scrape each page with our scraping engine and then feed the scraped text into our trained model.

In [52]:
def _click_if_present(browser, class_name):
    """Scroll to and click the first element with ``class_name``.

    Returns True when the element was found and clicked, False when any step
    failed (element missing, not clickable, ...).
    """
    try:
        element = browser.find_element_by_class_name(class_name)
        ActionChains(browser).move_to_element(element).perform()
        element.click()
    except Exception:
        # Best effort: these buttons are optional on a product page.
        return False
    return True


ttl_url = len(url_list)
delay = 3 # seconds to wait for the description block to appear

for count, url in enumerate(url_list, start=1):
    print('====== Getting web browser ready ======')
    # A fresh browser is started for every URL.
    # Alternative driver: webdriver.Chrome('../input/chromedriver.exe')
    browser = webdriver.PhantomJS('../input/phantomjs.exe')
    try:
        browser.get(url)
        counter = str(count)+'/'+str(ttl_url)
        print(counter,'Brows to',url)

        # Dismiss the overseas-delivery popup when it is shown. When it is
        # not, wait for the description expand button instead and skip the
        # page if it never appears.
        if not _click_if_present(browser, "delivery-option-st__label"):
            try:
                WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'product-description__block-expand-button')))
            except TimeoutException:
                print ("Loading took too much time!")
                continue

        # Expand the description sections so their text is in the page source.
        _click_if_present(browser, "product-description__block-expand-button")
        _click_if_present(browser, "more-desc-button")

        try:
            gettingResult(browser.page_source)
        except AttributeError:
            # gettingResult raises AttributeError when an expected page
            # element is missing; report it and move on to the next URL.
            print('Could not extract product text from', url)
    finally:
        # quit() ends the driver session; runs on every path, including
        # ``continue`` and unexpected errors, so no browser is left running.
        browser.quit()

1/6 Brows to https://www.lazada.com.my/edifier-luna-e235-thx-home-theatre-speaker-system-with-bluetooth-function-26673440.html?ff=1
('Consumer Electronics > Home Entertainment Systems', 0.6666666666666666) 

2/6 Brows to https://www.lazada.com.my/huawei-honor-8-lite-pra-al00x-5234-smartphone-32gb-us-plug-54375703.html?ff=1&sc=EwI=
('Technology & Computing > Consumer Electronics > Smartphones', 1.0) 

3/6 Brows to https://www.lazada.com.my/bathroom-hand-washing-room-double-layer-toilet-storage-rack-shelf-98460754.html?ff=1&sc=IYsE
('Others', 1.0) 

4/6 Brows to https://www.lazada.com.my/kindle-e-reader-black-6-glare-free-touchscreen-displaywi-fiblack-69131774.html?spm=a2o4k.search.0.0.568db469LP6ncW&ff=1
('Technology & Computing > Consumer Electronics > Tablets and E-readers', 1.0) 

5/6 Brows to https://www.lazada.com.my/new-m88-smart-watch-wristband-smartband-heart-rate-blood-pressure-monitor-fitness-tracker-bracelet-pedometer-for-ios-android-pk-mi-band-2-fitbit-72945202.html?ff=1&sc=

## What can I improve?
1. Using OCR package to read the picture inside the description and extract the words.
2. Finding a way to accommodate the training code for 1100+ categories.
3. Making the scraping engine more robust so that it can be used with other e-commerce websites.
4. Labelling the test data carefully to avoid a garbage-in, garbage-out situation.