## Get the Data Sources and Prepare the Training and Test Sets

In [1]:
# import all the required packages
import pandas as pd

In [2]:
# Load all scraped training datasets. The files have no header row, so the
# single text column is addressed as column 0 later on.
df_620 = pd.read_csv('./scraping_data/result_taobao.csv', header=None)
df_621 = pd.read_csv('./scraping_data/result_621_2.csv', header=None)
df_622_1 = pd.read_csv('./scraping_data/result_622.csv', header=None)
df_622_2 = pd.read_csv('./scraping_data/result_622_add.csv', header=None)
# The first 622 scrape stopped half way, so the two partial results are merged.
df_622 = pd.concat([df_622_1, df_622_2])
df_623 = pd.read_csv('./scraping_data/result_smartphone_623.csv', header=None)
df_624 = pd.read_csv('./scraping_data/result_ereader_624.csv', header=None)
df_625 = pd.read_csv('./scraping_data/result_625.csv', header=None)

# Show full cell contents instead of truncating them. None means "no limit";
# the old -1 sentinel was deprecated in pandas 1.0 and is rejected by later
# releases.
pd.set_option('display.max_colwidth', None)

In [3]:
# Shape of the first dataset as a (rows, columns) tuple.
df_620.shape

(214, 1)

In [4]:
# Preview the first 100 characters of each dataset's text column.
for label, frame in (
    ('df_620 :', df_620),
    ('df_621 :', df_621),
    ('df_622 :', df_622),
    ('df_623 :', df_623),
    ('df_624 :', df_624),
    ('df_625 :', df_625),
):
    rendered = frame[0].to_string(index=False)
    print(label, rendered[:100])

df_620 : korean-style solid spring and elastic waist wide leg pantsproduct details of korean-style solid spri
df_621 : sports camera sport video 4k wifi action cam 16 mp underwater camcorder hd 1080p and 2 batteries 170
df_622 : edifier luna e235 thx home theatre speaker system with bluetooth functionproduct details of edifier 
df_623 : sunweb stonex one android smart phone eu standard blue eu blue product details of sunweb stonex one 
df_624 : e-reader and smart phone holder tablet pc stand for iphone ipadwhiteproduct details of e-reader and 
df_625 : x2 couples wristband heart rate monitor smart watches blood pressure monitor smart band bluetooth ip


In [5]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag, FreqDist
import random
import pickle

In [6]:
# Read all the labelled source data.
def DataSources():
    """Return the text column of each source DataFrame rendered as one string.

    Column 0 of every module-level DataFrame is rendered with
    ``to_string(index=False)``, which yields one row per line.

    Returns:
        A 6-tuple of strings in the order
        (df_620, df_621, df_622, df_623, df_624, df_625).
    """
    frames = (df_620, df_621, df_622, df_623, df_624, df_625)
    return tuple(frame[0].to_string(index=False) for frame in frames)

In [7]:
# prepare the data
def _tagged_vocabulary(text, allowed_word_types):
    """Return the lower-cased tokens of *text* whose POS tag starts with an allowed letter."""
    return [
        token.lower()
        for token, tag in pos_tag(word_tokenize(text))
        # tag[:1] rather than tag[0] so an empty tag is skipped instead of raising.
        if tag[:1] in allowed_word_types
    ]


def PrepareData():
    """Build the labelled feature sets used for training and testing.

    Every line of every data source becomes one document labelled with the
    category of its source. The vocabulary consists of the lower-cased
    adjectives, adverbs and verbs found in those documents.

    Side effects:
        Pickles the labelled documents to ``saved/documents.p`` and the
        vocabulary to ``saved/word_features.p``.

    Returns:
        A list of ``(features, category)`` tuples, where ``features`` is the
        dict produced by ``find_features`` for that document.
    """
    # Category labels, in the same order as the tuple returned by DataSources().
    # NOTE(review): the first three labels lack the "Technology & Computing >"
    # prefix used by the others - confirm this is intentional.
    categories = (
        "Others",
        "Consumer Electronics > Cameras and Camcorders",
        "Consumer Electronics > Home Entertainment Systems",
        "Technology & Computing > Consumer Electronics > Smartphones",
        "Technology & Computing > Consumer Electronics > Tablets and E-readers",
        "Technology & Computing > Consumer Electronics > Wearable Technology",
    )

    # First letter of the POS tag: J is adjective, R is adverb, and V is verb.
    allowed_word_types = ["J", "R", "V"]

    documents = []
    all_words = []
    for source_text, category in zip(DataSources(), categories):
        for p in source_text.split('\n'):
            documents.append((p, category))
            all_words.extend(_tagged_vocabulary(p, allowed_word_types))

    # Context managers guarantee the files are closed even if pickling fails.
    with open("saved/documents.p", "wb") as save_documents:
        pickle.dump(documents, save_documents)

    # FreqDist de-duplicates the words; every distinct word is kept as a feature.
    word_features = list(FreqDist(all_words))

    with open("saved/word_features.p", "wb") as save_word_features:
        pickle.dump(word_features, save_word_features)

    return [(find_features(rev, word_features), category) for (rev, category) in documents]


def find_features(document, word_features):
    """Map every vocabulary word to whether it occurs in *document*.

    Args:
        document: Raw text to tokenize.
        word_features: Iterable of vocabulary words (lower-cased by
            ``PrepareData``).

    Returns:
        A dict ``{word: bool}`` with one entry per word in *word_features*.
    """
    # A set gives O(1) membership tests instead of scanning the token list once
    # per vocabulary word. Tokens are lower-cased because the vocabulary is
    # built from lower-cased words.
    tokens = {token.lower() for token in word_tokenize(document)}
    return {w: (w in tokens) for w in word_features}


def TestTrainData(train_size=10000):
    """Shuffle the prepared feature sets and split them into train and test parts.

    Args:
        train_size: Number of shuffled feature sets assigned to the training
            set; the remainder becomes the testing set. Defaults to the
            previously hard-coded 10000.

    Returns:
        A ``(training_set, testing_set)`` tuple of lists. The testing set is
        empty when there are no more than *train_size* feature sets in total.
    """
    featuresets = PrepareData()
    random.shuffle(featuresets)
    training_set = featuresets[:train_size]
    testing_set = featuresets[train_size:]
    return training_set, testing_set

## Train Classifiers and store them

In [8]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from nltk import NaiveBayesClassifier, classify
import pickle


def TrainClassifiers():
    """Train every classifier on the training set and persist them.

    Trains NLTK's NaiveBayesClassifier plus five scikit-learn estimators
    wrapped in ``SklearnClassifier``, then hands them to ``SaveClassifiers``.

    Returns:
        The list of trained classifiers, in the same order as the names
        passed to ``SaveClassifiers``.
    """
    # Only the training half is needed here; the testing half is not used.
    training_set, _testing_set = TestTrainData()

    classifiers = [NaiveBayesClassifier.train(training_set)]
    classifier_name = ["NaiveBayesClassifier"]

    sklearn_estimators = [
        ("MultinomialNBClassifier", MultinomialNB()),
        ("BernoulliNBClassifier", BernoulliNB()),
        ("LogisticRegressionClassifier", LogisticRegression()),
        ("LinearSVCClassifier", LinearSVC()),
        ("SGDClassifier", SGDClassifier()),
    ]
    for name, estimator in sklearn_estimators:
        wrapped = SklearnClassifier(estimator)
        wrapped.train(training_set)
        # Each name is paired with the classifier trained in this iteration.
        # The original appended the LogisticRegression classifier a second time
        # under "LinearSVCClassifier", so the LinearSVC model was never saved.
        classifiers.append(wrapped)
        classifier_name.append(name)

    SaveClassifiers(classifiers, classifier_name)

    return classifiers


def SaveClassifiers(classifiers, classifier_name):
    """Pickle each classifier, and the list of names, under ``saved/``.

    Args:
        classifiers: Objects to pickle, one file each.
        classifier_name: Names parallel to *classifiers*; entry ``i`` names
            the file ``saved/<name>.p`` for ``classifiers[i]``.

    Side effects:
        Writes ``saved/<name>.p`` for every classifier and
        ``saved/classifier_name.p`` holding the list of names. The ``saved``
        directory must already exist.
    """
    for i, classifier in enumerate(classifiers):
        with open("saved/" + classifier_name[i] + ".p", "wb") as classifier_file:
            pickle.dump(classifier, classifier_file)

    # The original never closed this handle; the context manager does.
    with open("saved/classifier_name.p", "wb") as names_file:
        pickle.dump(classifier_name, names_file)

## Category Classifier : Which categories?

In [9]:
class IABClassification:
    """Category classifier whose construction trains and saves all models."""

    def __init__(self):
        # Keep the trained classifiers on the instance so they can be used
        # after construction; the result was previously bound to a local
        # variable and discarded.
        self.classifiers = TrainClassifiers()

#     def Analyse(self, text):
#         new_features = find_features(text, self.new_features)
#         return self.votedClassifier.classify(new_features), self.votedClassifier.confidence(new_features)

## Creating classifier for IABClassification

In [10]:
# Silence warnings for the rest of the session.
# NOTE: this 'ignore' filter has no category or module restriction, so it
# hides every warning, not only pickle-related ones.
import warnings
warnings.filterwarnings('ignore')

In [11]:
# Constructing the class calls TrainClassifiers(), which trains the
# classifiers and pickles them under saved/.
s = IABClassification() # train and save the classifiers