<center>
# Category prediction: data preprocessing, visualization and validation
<center>

### Categories tree processing

In [1]:
import json

categories_path = '/Users/dmitriy.zenin/dev/lab/categories.json'

def dive_into_category(objects, arr):
    for obj in objects:
        arr.append(obj)
        if obj["child_count"] > 0:
            dive_into_category(obj["children"], arr)
            
def build_category_arr():
    categories_tree = json.load(open(categories_path))
    arr = []
    dive_into_category(categories_tree, arr)
    return arr

def get_subtree(categories, name_en):
    subtree = []
    for category_obj in categories:
        if category_obj["name_en"] != name_en:
            continue
        subtree.append(category_obj)
        break
    if len(subtree) == 0:
        return []
    while subtree[-1]["parent"] != None:
        parent_id = subtree[-1]["parent"]
        for category_obj in categories:
            if category_obj["id_catalog_category"] != parent_id:
                continue
            subtree.append(category_obj)
            break
    return subtree
        
def get_category_lvl(categories, name_en, level):
    subtree = get_subtree(categories, name_en)
    if len(subtree) == 0:
        return []
    if level >= len(subtree):
        return subtree[0]
    return subtree[-level]

categories_arr = build_category_arr()

# map: category -> color
naive_categories = dict()

def get_category(categories, max_color):
    target_category = "none"
    ncs = categories.split('|')
    for nc in ncs:
        c = get_category_lvl(categories_arr, nc, 1)
        if len(c) == 0:
            continue
        target_category = c["name_en"]
        break
    if target_category != "none" and target_category not in naive_categories:
        naive_categories[target_category] = max_color
        max_color += 1
    return (target_category, max_color)

In [2]:
#modelFile = '/Users/dmitriy.zenin/dev/lab/wiki.en.bin'
modelFile = '/Users/dmitriy.zenin/dev/lab/model_cbow_minn3_maxn8/data_model.bin'

In [3]:
import fastText
model = fastText.load_model(modelFile)

In [4]:
help(model)

Help on _FastText in module fastText.FastText object:

class _FastText(builtins.object)
 |  This class defines the API to inspect models and should not be used to
 |  create objects. It will be returned by functions such as load_model or
 |  train.
 |  
 |  In general this API assumes to be given only unicode for Python2 and the
 |  Python3 equvalent called str for any string-like arguments. All unicode
 |  strings are then encoded as UTF-8 and fed to the fastText C++ API.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, model=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  get_dimension(self)
 |      Get the dimension (size) of a lookup vector (hidden layer).
 |  
 |  get_input_matrix(self)
 |      Get a copy of the full input matrix of a Model. This only
 |      works if the model is not quantized.
 |  
 |  get_input_vector(self, ind)
 |      Given an index, get the corresponding vector of the Input Matrix.
 |  
 |  get_labels(self, include

### Regular expressions for data preprocessing

In [5]:
import re
extra_symbols_pattern = re.compile(r"[^A-Za-z0-9\s]")
small_phrases_pattern = re.compile(r"\s\S{,2}\s")
extra_spaces_pattern  = re.compile(r'\s{2,}')

### Preprocessing data with category unification

In [6]:
import csv

inFile    = '/Users/dmitriy.zenin/dev/lab/10000.csv'
outFile   = '/Users/dmitriy.zenin/dev/lab/10000_out.csv'

with open(outFile, "w", newline='') as wfile:
    wtr = csv.writer(wfile)
    with open(inFile, newline='') as rfile:
        rdr = csv.reader(rfile, delimiter=',', quotechar='"')
        i = 1
        for row in rdr:
            if len(row) < 2:
                continue
            categories = row[1].split('|')
            sentence = ' '.join(row[0].splitlines())
            sv = model.get_sentence_vector(sentence) # category unification
            sentence = re.sub(extra_symbols_pattern, "", sentence)
            sentence = re.sub(small_phrases_pattern, "", sentence)
            sentence = re.sub(extra_spaces_pattern, " ", sentence)
            sentence = sentence.lower()
            wtr.writerow((sentence, sv, '|'.join(sorted(categories, key=str.lower))))
            if i % 100000 == 0:
                print("done: ", i)
            i += 1

### Generation datasets from preprocessed data

In [7]:
import numpy as np

X = list()
Y = list()
colors = list()

with open(outFile, newline='') as rfile:
    rdr = csv.reader(rfile, delimiter=',', quotechar='"')
    max_color = 1
    for row in rdr:
        v = np.fromstring(row[1][1:-1], dtype=np.float64, sep=' ')
        (category, max_color) = get_category(row[2], max_color)
        if category == "none":
            continue
        X.append(v)
        Y.append(category)
        colors.append(naive_categories[category])
    X = np.asarray(X)
    colors = np.asarray(colors)

In [8]:
print("X:", X.shape, "Y:", len(Y), "colors:", colors.shape, "naive_categories:", len(naive_categories))

X: (6765, 100) Y: 6765 colors: (6765,) naive_categories: 23


### Calculation centers of masses from raw data

In [9]:
import pandas as pd

mass_centers_df = pd.DataFrame(data=X, 
             columns=[i for i in range(X.shape[1])])
mass_centers_df['text_class'] = Y
mass_centers_gb = mass_centers_df.groupby('text_class')
mass_centers = mass_centers_gb.sum()
mass_center_counts = mass_centers_gb.size()
print('mass_centers:', mass_centers.shape, 'mass_center_counts:', mass_center_counts.shape)
for i in range(len(mass_center_counts)):
    mass_centers.iloc[i] /= mass_center_counts.iloc[i]
print("mass_centers result:", mass_centers.shape)

mass_centers: (23, 100) mass_center_counts: (23,)
mass_centers result: (23, 100)


In [10]:
gb_labels = list()
gb_colors = list()
for label, data in mass_centers_gb:
    gb_labels.append(label)
    gb_colors.append(naive_categories[label])
print('gb_colors:', len(gb_colors), 'gb_labels:', len(gb_labels))

gb_colors: 23 gb_labels: 23


### PCA [ principal component analysis ] - for data

In [11]:
from sklearn import decomposition
pca_X_obj = decomposition.PCA(n_components=3)
pca_X_obj.fit(X)
pca_X = pca_X_obj.transform(X)

In [12]:
import pandas as pd
pca_X_df = pd.DataFrame(data=pca_X, 
             columns=['X', 'Y', 'Z'])
pca_X_df['class'] = colors
pca_X_df['text_class'] = Y
pca_X_df.head()

Unnamed: 0,X,Y,Z,class,text_class
0,-0.109217,-0.015384,-0.10165,1,Health & Beauty
1,0.06616,-0.002414,0.003258,2,Fashion
2,0.045757,0.035837,0.102787,2,Fashion
3,0.19812,0.040379,-0.010218,3,Bags and Travel
4,-0.067842,0.055096,-0.019824,1,Health & Beauty


### Calculation centers of masses from PCA-processed data for visualization

In [13]:
import pandas as pd

pca_X_gb = pca_X_df.groupby('text_class')
pca_X_mass_centers = pca_X_gb.sum()
pca_X_mass_centers_size = pca_X_gb.size()
print('pca_X_mass_centers:', pca_X_mass_centers.shape, 'pca_X_size:', pca_X_mass_centers_size.shape)
for i in range(len(pca_X_mass_centers_size)):
    pca_X_mass_centers.iloc[i] /= pca_X_mass_centers_size.iloc[i]
print("pca_X_mass_centers result:", pca_X_mass_centers.shape)

pca_X_mass_centers: (23, 4) pca_X_size: (23,)
pca_X_mass_centers result: (23, 4)


### Plot titles in category classes

In [14]:
import plotly
from plotly.graph_objs import Scatter, Layout
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np

plotly.offline.init_notebook_mode(connected=True)

import random
random.seed()

colormap=dict()
for label, index in naive_categories.items():
    r = random.randint(0, 255)
    g = random.randint(0, 255)
    b = random.randint(0, 255)
    color_text = 'rgb({0}, {1}, {2})'.format(r, g, b)
    colormap[label] = color_text

traces = []
for label, data in pca_X_df.groupby('text_class'):
    trace = go.Scatter3d(
        name=label,
        x=data['X'],
        y=data['Y'],
        z=data['Z'],
        mode='markers',
        marker=dict(
            size=5,
            color=colormap[label],
            line=dict(
                color=colormap[label],
                width=0.1
            ),
            opacity=0.2
        )
    )
    traces.append(trace)
    
for index, data in pca_X_mass_centers.iterrows():
    label = data.name
    trace = go.Scatter3d(
        name="MC === "+label,
        x=[data['X']],
        y=[data['Y']],
        z=[data['Z']],
        mode='markers',
        marker=dict(
            size=8,
            color=colormap[label],
        line=dict(
            color=colormap[label],
            width=0.1
        ),
        opacity=0
        )
    )
    traces.append(trace)

layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    legend=dict(x=-.1, y=1.2)
)

plotly.offline.iplot({
    "data": traces,
    "layout": layout
})

### Preprocessing data for cbow and skipgram models

In [15]:
inFileTrain  = '/Users/dmitriy.zenin/dev/lab/10000.csv'
outFileTrain = '/Users/dmitriy.zenin/dev/lab/10000_train.csv'

with open(outFileTrain, "w", newline='') as wfile:
    with open(inFileTrain, newline='') as rfile:
        rdr = csv.reader(rfile, delimiter=',', quotechar='"')
        i = 1
        for row in rdr:
            if len(row) < 2:
                continue
            title = re.sub(extra_symbols_pattern, "", row[0])
            title = re.sub(small_phrases_pattern, "", title)
            title = re.sub(extra_spaces_pattern, " ", title)
            title = title.lower()
            description = re.sub(extra_symbols_pattern, "", row[2])
            description = re.sub(small_phrases_pattern, "", description)
            description = re.sub(extra_spaces_pattern, " ", description)
            description = description.lower()
            wfile.write(title + " " + description)
            if i % 100000 == 0:
                print("done: ", i)
            i += 1

### Calculation words frequency and words frequencies in categories for statistical analysis

In [16]:
%%time
import csv

#import time
#def current_milli_time():
#    return int(round(time.time() * 1000))
#start_time = current_milli_time()
#time_diff = current_milli_time() - start_time
#print("get_category time: ", time_diff)

inStatisticsFile = '/Users/dmitriy.zenin/dev/lab/10000.csv'

wsDict = dict()

with open(inStatisticsFile, newline='') as rfile:
    rdr = csv.reader(rfile, delimiter=',', quotechar='"')
    i = 1
    max_color = 1
    
    for row in rdr:
        if len(row) < 3:
            continue
            
        if len(row[0]) == 0:
            continue
        
        (category, max_color) = get_category(row[1], max_color)
        if category == "none":
            continue
            
        sentence = ' '.join(row[0].splitlines())
        sentence = re.sub(extra_symbols_pattern, "", sentence)
        sentence = re.sub(small_phrases_pattern, "", sentence)
        sentence = re.sub(extra_spaces_pattern, " ", sentence)
        sentence = sentence.lower()
        
        if len(sentence) == 0:
            continue
            
        words = sentence.split()
        try:
            for word in words:
                cDict = wsDict.get(word)
                if cDict == None:
                    wsDict[word] = dict()
                    cDict = wsDict[word]
                if cDict.get(category) == None:
                    cDict[category] = 1
                else:
                    cDict[category] += 1
        except Exception as e:
            print(e)
            print(sentence)
            print(words)
            break
        
        if i % 100000 == 0:
            print("done: ", i)
        i += 1

CPU times: user 786 ms, sys: 11.4 ms, total: 797 ms
Wall time: 809 ms


In [None]:
#import pickle
#def save_obj(obj, name):
#    with open('obj/'+ name + '.pkl', 'wb') as f:
#        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [None]:
#save_obj(wsDict, 'data_classes_distributions')

In [27]:
import pandas as pd
wordStatistics = pd.DataFrame(wsDict).transpose()
wordStatistics = wordStatistics.fillna(0)
ws = wordStatistics
wordStatistics.head()

Unnamed: 0,Bags and Travel,Bedding & Bath,Cameras,Computers & Laptops,Digital Goods,Fashion,Furniture & Décor,Groceries,Health & Beauty,Home Appliances,...,Mobiles & Tablets,Mother & Baby,Motors,Pet Supplies,Sports & Outdoors,Stationery & Craft,"TV, Audio / Video, Gaming & Wearables","Tools, DIY & Outdoor",Toys & Games,Watches Sunglasses Jewellery
0001802609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
003whiteintl,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
006r03205intl,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
007plus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [28]:
ws['count_all'] = ws.sum(axis=1, numeric_only=True)

In [29]:
ws.head()

Unnamed: 0,Bags and Travel,Bedding & Bath,Cameras,Computers & Laptops,Digital Goods,Fashion,Furniture & Décor,Groceries,Health & Beauty,Home Appliances,...,Mother & Baby,Motors,Pet Supplies,Sports & Outdoors,Stationery & Craft,"TV, Audio / Video, Gaming & Wearables","Tools, DIY & Outdoor",Toys & Games,Watches Sunglasses Jewellery,count_all
0001802609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
003whiteintl,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
006643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
006r03205intl,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
007plus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [30]:
ws_sorted = ws.sort_values(by=['count_all'], ascending=False)

In [31]:
ws_sorted.head()

Unnamed: 0,Bags and Travel,Bedding & Bath,Cameras,Computers & Laptops,Digital Goods,Fashion,Furniture & Décor,Groceries,Health & Beauty,Home Appliances,...,Mother & Baby,Motors,Pet Supplies,Sports & Outdoors,Stationery & Craft,"TV, Audio / Video, Gaming & Wearables","Tools, DIY & Outdoor",Toys & Games,Watches Sunglasses Jewellery,count_all
for,27.0,2.0,16.0,360.0,0.0,25.0,6.0,0.0,11.0,3.0,...,26.0,39.0,0.0,179.0,7.0,66.0,3.0,34.0,26.0,1094.0
and,76.0,22.0,13.0,23.0,1.0,55.0,18.0,0.0,12.0,8.0,...,168.0,20.0,0.0,215.0,26.0,24.0,1.0,82.0,66.0,873.0
bag,353.0,1.0,2.0,2.0,0.0,0.0,13.0,0.0,6.0,1.0,...,0.0,5.0,1.0,131.0,52.0,0.0,0.0,20.0,51.0,654.0
new,144.0,10.0,3.0,5.0,0.0,86.0,17.0,0.0,20.0,7.0,...,106.0,8.0,0.0,128.0,10.0,0.0,0.0,42.0,43.0,637.0
shoes,0.0,0.0,0.0,0.0,0.0,99.0,9.0,0.0,0.0,0.0,...,466.0,0.0,0.0,26.0,0.0,0.0,0.0,12.0,1.0,613.0


In [None]:
#ws_sorted.to_csv('obj/data_classes_distributions_df')

In [32]:
import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

data = [go.Bar(
    x=ws_sorted.index[:1000],
    y=ws_sorted['count_all'][:1000]
)]

plotly.offline.iplot(data)

In [33]:
ws_sorted.loc['storage'].describe()

count     24.000000
mean       8.416667
std       21.596933
min        0.000000
25%        0.000000
50%        1.500000
75%        5.250000
max      101.000000
Name: storage, dtype: float64

In [34]:
import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

data = [go.Bar(
    x=ws_sorted.loc['fashion'].index,
    y=ws_sorted.loc['fashion']
)]

plotly.offline.iplot(data)