<center>
# Category prediction: data preprocessing, visualization and validation
</center>

### Categories tree processing

In [None]:
import json

# Location of the categories tree JSON file read by build_category_arr().
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
categories_path = '/Users/dzendmitry/dev/lab/category_prediction/categories.json'

def dive_into_category(objects, arr):
    """Flatten a category tree into ``arr`` in depth-first pre-order.

    Every object in ``objects`` is appended to ``arr``; when its
    ``"child_count"`` is positive, the objects under ``"children"`` are
    appended right after it, before its next sibling. ``arr`` is modified
    in place and nothing is returned.
    """
    # Stack of partially consumed iterators, one per tree level being walked.
    pending = [iter(objects)]
    while pending:
        for node in pending[-1]:
            arr.append(node)
            if node["child_count"] > 0:
                # Descend; the current level resumes once the children are done.
                pending.append(iter(node["children"]))
                break
        else:
            # Current level exhausted.
            pending.pop()
            
def build_category_arr():
    """Load the categories tree from ``categories_path`` and flatten it.

    Returns:
        list: every category object of the tree, in the order produced by
        ``dive_into_category``.
    """
    # Context manager closes the handle; JSON text is UTF-8 by specification,
    # so do not depend on the platform's default encoding.
    with open(categories_path, encoding="utf-8") as categories_file:
        categories_tree = json.load(categories_file)
    arr = []
    dive_into_category(categories_tree, arr)
    return arr

def get_subtree(categories, name_en):
    """Return the chain from the category named ``name_en`` up to its root.

    Args:
        categories: flat list of category dicts carrying ``"name_en"``,
            ``"id_catalog_category"`` and ``"parent"`` keys.
        name_en: English name to look up; the first match wins.

    Returns:
        list: ``[category, parent, grandparent, ...]``; the last element has
        ``"parent"`` set to ``None`` for a well-formed tree. Returns ``[]``
        when no category has that name. If a parent id cannot be resolved or
        the parent links loop back onto the chain, the chain built so far is
        returned instead of looping forever.
    """
    start = next((c for c in categories if c["name_en"] == name_en), None)
    if start is None:
        return []

    subtree = [start]
    # Identity of every object already on the chain, used to detect cycles.
    seen = {id(start)}
    while subtree[-1]["parent"] is not None:
        parent_id = subtree[-1]["parent"]
        parent = next(
            (c for c in categories if c["id_catalog_category"] == parent_id),
            None,
        )
        # Dangling parent reference or cyclic link: stop climbing.
        if parent is None or id(parent) in seen:
            break
        seen.add(id(parent))
        subtree.append(parent)
    return subtree
        
def get_category_lvl(categories, name_en, level):
    """Pick the ancestor at ``level`` on the path of the category ``name_en``.

    The path comes from ``get_subtree`` and runs from the named category
    (index 0) up to the root (index -1), so ``level`` 1 is the root and
    ``level`` 2 the root's child on that path.

    Returns:
        ``[]`` when the name is unknown; the named category itself when
        ``level`` is 0 or not smaller than the path length; otherwise the
        category dict ``level`` steps from the end of the path.
    """
    chain = get_subtree(categories, name_en)
    if not chain:
        return []
    return chain[0] if level >= len(chain) else chain[-level]

# Flat list of every category object; searched by get_category_lvl().
categories_arr = build_category_arr()

# map: category -> color
# Filled lazily by get_category(); the integer also serves as the class id.
naive_categories = {}

def get_category(categories, max_color):
    """Map a ``|``-separated category string to a top-level category name.

    The names are tried in order; the first one found in ``categories_arr``
    determines the result (its level-1 ancestor). A category seen for the
    first time is registered in ``naive_categories`` with a fresh integer id.

    Args:
        categories: category names joined with ``|``.
        max_color: next id the caller intends to hand out.

    Returns:
        tuple: ``(name, next_max_color)``; ``name`` is ``"none"`` when no
        listed category is known, in which case ``max_color`` is unchanged.
    """
    target_category = "none"
    for nc in categories.split('|'):
        c = get_category_lvl(categories_arr, nc, 1)
        if len(c) == 0:
            continue
        target_category = c["name_en"]
        break

    if target_category != "none" and target_category not in naive_categories:
        # Several cells restart their counter at 1 while naive_categories
        # persists; never hand out an id that is already in use.
        next_free = max(naive_categories.values(), default=0) + 1
        color = max(max_color, next_free)
        naive_categories[target_category] = color
        max_color = color + 1
    return (target_category, max_color)

In [None]:
# Path of the fastText binary model to load; exactly one of the alternatives
# below is active, the others are kept commented out for manual switching.
# NOTE(review): absolute, machine-specific paths.
#modelFile = '/Users/dzendmitry/dev/lab/category_prediction/fasttext_models/model_wiki_en/model.bin'
modelFile = '/Users/dzendmitry/dev/lab/category_prediction/fasttext_models/model_full_cbow_v1_minn3_maxn10_wordNgrams2_lr0.5_dim100_ws3/model.bin'
#modelFile = '/Users/dzendmitry/dev/lab/category_prediction/fasttext_models/model_full_skipgram_v1_minn3_maxn10_wordNgrams2_lr0.5_dim100_ws3/model.bin'

In [None]:
import fastText
# Load the fastText model from modelFile; used later via model.get_sentence_vector().
model = fastText.load_model(modelFile)

In [None]:
# Interactive exploration only: prints the help text of the loaded model object.
help(model)

### Regular expressions for data preprocessing

In [None]:
import re
# Any character that is not an ASCII letter, a digit or whitespace.
extra_symbols_pattern = re.compile(r"[^A-Za-z0-9\s]")
# A token of one or two non-space characters with whitespace on both sides.
# Lookarounds keep the surrounding whitespace out of the match, so replacing
# the match with "" no longer glues the neighbouring words together
# ("buy a car" used to become "buycar"). Tokens at the very start or end of
# the string have no whitespace on one side and are therefore not matched.
small_phrases_pattern = re.compile(r"(?<=\s)\S{1,2}(?=\s)")
# Runs of two or more whitespace characters, to be collapsed into one space.
extra_spaces_pattern  = re.compile(r'\s{2,}')

### Preprocessing data with category unification

In [None]:
# Raw input CSV and the preprocessed CSV written by the next cell.
# NOTE(review): absolute, machine-specific paths.
inFile    = '/Users/dzendmitry/dev/lab/category_prediction/1000000.csv'
outFile   = '/Users/dzendmitry/dev/lab/category_prediction/1000000_out.csv'

In [None]:
import csv

# Read (text, categories) rows from inFile and write
# (normalized text, sentence vector, sorted categories) rows to outFile.
with open(outFile, "w", newline='') as wfile:
    wtr = csv.writer(wfile)
    with open(inFile, newline='') as rfile:
        rdr = csv.reader(rfile, delimiter=',', quotechar='"')
        i = 1
        for row in rdr:
            # Need at least the text and the categories column.
            if len(row) < 2:
                continue
            categories = row[1].split('|')
            # Text normalization: single line, strip symbols, drop short
            # tokens, collapse whitespace, lowercase.
            sentence = ' '.join(row[0].splitlines())
            sentence = re.sub(extra_symbols_pattern, "", sentence)
            sentence = re.sub(small_phrases_pattern, "", sentence)
            sentence = re.sub(extra_spaces_pattern, " ", sentence)
            sentence = sentence.lower()
            sv = model.get_sentence_vector(sentence)
            # Serialize the vector explicitly instead of relying on str(ndarray),
            # whose output depends on numpy print options (precision, wrapping,
            # truncation). The "[v1 v2 ...]" layout is what the later cells
            # parse with np.fromstring(row[1][1:-1], sep=' ').
            sv_text = '[' + ' '.join(repr(float(value)) for value in sv) + ']'
            # category unification: case-insensitive sort makes the joined
            # string independent of the input order
            unified_categories = '|'.join(sorted(categories, key=str.lower))
            wtr.writerow((sentence, sv_text, unified_categories))
            if i % 100000 == 0:
                print("done: ", i)
            i += 1

### Generating datasets from preprocessed data

In [None]:
import numpy as np
import csv

# Feature vectors, top-level category names, and their integer ids.
X = []
Y = []
colors = []
Y_nums = []

with open(outFile, newline='') as rfile:
    max_color = 1
    for row in csv.reader(rfile, delimiter=',', quotechar='"'):
        # Column 1 holds the bracketed, space-separated sentence vector.
        vector = np.fromstring(row[1][1:-1], dtype=np.float64, sep=' ')
        category, max_color = get_category(row[2], max_color)
        # Rows whose categories are all unknown are skipped.
        if category != "none":
            X.append(vector)
            Y.append(category)
            colors.append(naive_categories[category])

X = np.asarray(X)
colors = np.asarray(colors)
# Numeric class labels for the classifiers: an independent copy of colors.
Y_nums = colors.copy()

In [None]:
# Sanity check: sizes of the feature matrix, label list, id array and category map.
print("X:", X.shape, "Y:", len(Y), "colors:", colors.shape, "naive_categories:", len(naive_categories))

### Calculating centers of mass from raw data

In [None]:
import pandas as pd

# One row per sample, one integer-named column per vector component.
mass_centers_df = pd.DataFrame(data=X, columns=list(range(X.shape[1])))
mass_centers_df['text_class'] = Y
mass_centers_gb = mass_centers_df.groupby('text_class')
mass_center_counts = mass_centers_gb.size()
mass_centers = mass_centers_gb.sum()
print('mass_centers:', mass_centers.shape, 'mass_center_counts:', mass_center_counts.shape)
# Center of mass = per-class component sums divided by the class size.
mass_centers = mass_centers.div(mass_center_counts, axis=0)
print("mass_centers result:", mass_centers.shape)

In [None]:
# Class names in groupby iteration order, and the matching integer ids.
gb_labels = [label for label, _ in mass_centers_gb]
gb_colors = [naive_categories[label] for label in gb_labels]
print('gb_colors:', len(gb_colors), 'gb_labels:', len(gb_labels))

### PCA [ principal component analysis ] - for data

In [None]:
from sklearn import decomposition
# Project the sentence vectors onto their first three principal components
# so they can be drawn in a 3-D scatter plot.
pca_X_obj = decomposition.PCA(n_components=3)
pca_X_obj.fit(X)
pca_X = pca_X_obj.transform(X)

In [None]:
import pandas as pd
# PCA coordinates plus the numeric ('class') and textual ('text_class') labels.
pca_X_df = pd.DataFrame(pca_X, columns=['X', 'Y', 'Z'])
pca_X_df = pca_X_df.assign(**{'class': colors, 'text_class': Y})
pca_X_df.head()

### Calculating centers of mass from PCA-processed data for visualization

In [None]:
import pandas as pd

pca_X_gb = pca_X_df.groupby('text_class')
pca_X_mass_centers_size = pca_X_gb.size()
pca_X_mass_centers = pca_X_gb.sum()
print('pca_X_mass_centers:', pca_X_mass_centers.shape, 'pca_X_size:', pca_X_mass_centers_size.shape)
# Per-class sums divided by the class size give each class's mean position.
pca_X_mass_centers = pca_X_mass_centers.div(pca_X_mass_centers_size, axis=0)
print("pca_X_mass_centers result:", pca_X_mass_centers.shape)

### Plot titles in category classes

In [None]:
import plotly
from plotly.graph_objs import Scatter, Layout
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np

plotly.offline.init_notebook_mode(connected=True)

import random
random.seed()


def _marker_style(color, size, opacity):
    """Marker settings shared by all traces: fill and outline use one color."""
    return dict(
        size=size,
        color=color,
        line=dict(color=color, width=0.1),
        opacity=opacity
    )


# One random RGB color per category; differs between runs because the
# generator is seeded without a fixed value.
colormap = {
    label: 'rgb({0}, {1}, {2})'.format(
        random.randint(0, 255),
        random.randint(0, 255),
        random.randint(0, 255)
    )
    for label in naive_categories
}

# One semi-transparent point cloud per category.
traces = [
    go.Scatter3d(
        name=label,
        x=points['X'],
        y=points['Y'],
        z=points['Z'],
        mode='markers',
        marker=_marker_style(colormap[label], 5, 0.2)
    )
    for label, points in pca_X_df.groupby('text_class')
]

# One larger marker per category at its center of mass.
# NOTE(review): opacity=0 presumably renders these markers fully
# transparent; confirm this is intended.
for label, center in pca_X_mass_centers.iterrows():
    traces.append(go.Scatter3d(
        name="MC === "+label,
        x=[center['X']],
        y=[center['Y']],
        z=[center['Z']],
        mode='markers',
        marker=_marker_style(colormap[label], 8, 0)
    ))

layout = go.Layout(
    margin=dict(l=0, r=0, b=0, t=0),
    legend=dict(x=-.1, y=1.2)
)

plotly.offline.iplot(dict(data=traces, layout=layout))

### Preprocessing data for cbow and skipgram models

In [None]:
# NOTE(review): absolute, machine-specific paths.
inFileTrain  = '/Users/dzendmitry/dev/lab/category_prediction/10000.csv'
outFileTrain = '/Users/dzendmitry/dev/lab/category_prediction/10000_train.csv'


def _normalize_text(text):
    """Apply the three module-level regex substitutions, then lowercase."""
    text = re.sub(extra_symbols_pattern, "", text)
    text = re.sub(small_phrases_pattern, "", text)
    text = re.sub(extra_spaces_pattern, " ", text)
    return text.lower()


# Write "<title> <description>" for every input row as plain training text.
with open(outFileTrain, "w", newline='') as wfile:
    with open(inFileTrain, newline='') as rfile:
        rdr = csv.reader(rfile, delimiter=',', quotechar='"')
        i = 1
        for row in rdr:
            # row[2] is read below, so three columns are required; the old
            # "< 2" guard let two-column rows through to an IndexError.
            if len(row) < 3:
                continue
            title = _normalize_text(row[0])
            description = _normalize_text(row[2])
            # Terminate each record; without a separator the last word of one
            # record fused with the first word of the next.
            wfile.write(title + " " + description + "\n")
            if i % 100000 == 0:
                print("done: ", i)
            i += 1

### Calculating word frequencies, overall and per category, for statistical analysis

In [None]:
%%time
import csv

#import time
#def current_milli_time():
#    return int(round(time.time() * 1000))
#start_time = current_milli_time()
#time_diff = current_milli_time() - start_time
#print("get_category time: ", time_diff)

inStatisticsFile = '/Users/dzendmitry/dev/lab/category_prediction/10000.csv'

# word -> {top-level category -> number of occurrences}
wsDict = dict()

with open(inStatisticsFile, newline='') as rfile:
    rdr = csv.reader(rfile, delimiter=',', quotechar='"')
    i = 1
    max_color = 1

    for row in rdr:
        # Skip short rows and rows with an empty text column.
        if len(row) < 3 or len(row[0]) == 0:
            continue

        (category, max_color) = get_category(row[1], max_color)
        if category == "none":
            continue

        # Same normalization as in the preprocessing cell.
        sentence = ' '.join(row[0].splitlines())
        sentence = re.sub(extra_symbols_pattern, "", sentence)
        sentence = re.sub(small_phrases_pattern, "", sentence)
        sentence = re.sub(extra_spaces_pattern, " ", sentence)
        sentence = sentence.lower()

        if not sentence:
            continue

        words = sentence.split()
        try:
            for word in words:
                per_category = wsDict.setdefault(word, dict())
                per_category[category] = per_category.get(category, 0) + 1
        except Exception as e:
            # Report the offending row and stop processing the file.
            print(e)
            print(sentence)
            print(words)
            break

        if i % 100000 == 0:
            print("done: ", i)
        i += 1

In [None]:
#import pickle
#def save_obj(obj, name):
#    with open('obj/'+ name + '.pkl', 'wb') as f:
#        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [None]:
#save_obj(wsDict, 'data_classes_distributions')

In [None]:
import pandas as pd
# Rows are words, columns are categories; missing combinations count as 0.
wordStatistics = pd.DataFrame(wsDict).transpose().fillna(0)
# Short alias; both names refer to the same DataFrame object.
ws = wordStatistics
wordStatistics.head()

In [None]:
# Total occurrences of each word across all categories. An existing
# 'count_all' column is left out of the sum so that re-running this cell
# does not add the previous total to the new one.
ws['count_all'] = ws.drop(columns=['count_all'], errors='ignore').sum(axis=1, numeric_only=True)

In [None]:
# Preview the first rows of the word statistics table.
ws.head()

In [None]:
# Most frequent words first.
ws_sorted = ws.sort_values('count_all', ascending=False)

In [None]:
# Preview the most frequent words.
ws_sorted.head()

In [None]:
#ws_sorted.to_csv('obj/data_classes_distributions_df')

In [None]:
import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

# Bar chart of the total counts of the 1000 most frequent words.
top_words = ws_sorted.iloc[:1000]
data = [go.Bar(x=top_words.index, y=top_words['count_all'])]

plotly.offline.iplot(data)

In [None]:
# Summary statistics of the per-category counts for the word 'storage'
# (raises KeyError if that word is not in the table).
ws_sorted.loc['storage'].describe()

In [None]:
import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

# Bar chart of every column value in the row for the word 'fashion'.
fashion_counts = ws_sorted.loc['fashion']
data = [go.Bar(x=fashion_counts.index, y=fashion_counts)]

plotly.offline.iplot(data)

### Validation procedure (The Euclidean metric between mass centers)

In [None]:
from pandas import merge
import numpy as np
import matplotlib.pylab as plt
import pandas as pd

def calc_cartesian_matrix(mass_centers):
    """Pairwise Euclidean distances between the rows of ``mass_centers``.

    Args:
        mass_centers: DataFrame (or other 2-D array-like) with one row per
            class and one numeric column per vector component.

    Returns:
        numpy.ndarray: ``(n, n)`` matrix whose entry ``[i, j]`` is the
        distance between rows ``i`` and ``j`` for ``j >= i``; entries below
        the diagonal are set to 0 because the matrix is symmetric.
    """
    # np.asarray replaces DataFrame.as_matrix(), which newer pandas no longer has.
    points = np.asarray(mass_centers, dtype=np.float64)
    # Broadcasting builds every (row i - row j) difference without a cross join.
    deltas = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    distances = np.sqrt((deltas ** 2).sum(axis=2))
    # Keep the diagonal and the upper triangle, zero the rest.
    return np.triu(distances)

def draw_matrix(m):
    """Show matrix ``m`` as a heat map with square cells and a color bar."""
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_aspect('equal')
    image = ax.imshow(m, interpolation='nearest', cmap=plt.cm.ocean)
    fig.colorbar(image, ax=ax)
    plt.show()
    
def matrix_to_df_table(m):
    """Return matrix ``m`` as a DataFrame rounded to two decimals.

    Columns are numbered from 1. As a side effect the pandas display option
    ``display.max_columns`` is set to 23.
    """
    pd.set_option("display.max_columns", 23)
    column_labels = list(range(1, m.shape[1] + 1))
    table = pd.DataFrame(data=m, columns=column_labels)
    return table.round(2)

### wiki en model

In [None]:
# Snapshot of the current mass_centers under a model-specific name.
# NOTE(review): presumably run while modelFile points at the wiki-en model - confirm.
mass_centers_wiki_en = mass_centers.copy()

In [None]:
# Distance matrix between class centers for this snapshot: heat map, then table.
cartesian_matrix_wiki_en = calc_cartesian_matrix(mass_centers_wiki_en)
draw_matrix(cartesian_matrix_wiki_en)
matrix_to_df_table(cartesian_matrix_wiki_en)

### cbow model

In [None]:
# Snapshot of the current mass_centers under a model-specific name.
# NOTE(review): presumably run while modelFile points at the cbow model - confirm.
mass_centers_cbow = mass_centers.copy()

In [None]:
# Distance matrix between class centers for this snapshot: heat map, then table.
cartesian_matrix_cbow = calc_cartesian_matrix(mass_centers_cbow)
draw_matrix(cartesian_matrix_cbow)
matrix_to_df_table(cartesian_matrix_cbow)

### skipgram model

In [None]:
# Snapshot of the current mass_centers under a model-specific name.
# NOTE(review): presumably run while modelFile points at the skipgram model - confirm.
mass_centers_skipgram = mass_centers.copy()

In [None]:
# Distance matrix between class centers for this snapshot: heat map, then table.
cartesian_matrix_skipgram = calc_cartesian_matrix(mass_centers_skipgram)
draw_matrix(cartesian_matrix_skipgram)
matrix_to_df_table(cartesian_matrix_skipgram)

### models diff

In [None]:
# First operand of the comparison below (minuend).
model_1 = cartesian_matrix_cbow

In [None]:
# Second operand of the comparison below (subtrahend).
model_2 = cartesian_matrix_skipgram

In [None]:
# Element-wise difference of the two distance matrices: heat map, then table.
cartesian_matrix_diff = model_1 - model_2
draw_matrix(cartesian_matrix_diff)
matrix_to_df_table(cartesian_matrix_diff)

## SVM classifier

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y_nums, test_size=0.2, random_state=40)

In [None]:
# Display the shapes of the four split arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
import time
from sklearn.svm import SVC

# Alternative kernels are kept commented out for manual switching.
#clf = SVC(kernel='linear', C=30.0)
clf = SVC(kernel='poly', C=100.0, gamma=0.9, coef0=0.1, degree=3) # 0.8048
#clf = SVC(kernel='sigmoid', C=10.0, gamma=0.9, coef0=0.1)
# Only the fit is timed.
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
# Predict once and derive the accuracy from it; calling clf.score() and then
# clf.predict() would run the test set through the classifier twice.
y_predicted = clf.predict(X_test)
# The active kernel is 'poly', so the label says so.
print("Poly SVM", end - start, (y_predicted == y_test).mean())

In [None]:
import time
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier

start = time.time()
#svm = SVC(kernel='linear', probability=True)
# No estimator is passed, so this bags the library's default base estimator,
# not an SVC; the printed label reflects that.
clf = BaggingClassifier(n_jobs=-1)
clf.fit(X_train, y_train)
end = time.time()
# Predict once and derive the accuracy from it instead of score() + predict().
y_predicted = clf.predict(X_test)
print("Bagging (default base estimator)", end - start, (y_predicted == y_test).mean())

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Precision, recall and F1 are computed with ``average='weighted'``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``.
    """
    # Shared keyword arguments of the three averaged scores.
    weighted = dict(pos_label=None, average='weighted')

    # fraction of predictions that match the true label
    accuracy = accuracy_score(y_test, y_predicted)
    # true positives / (true positives + false positives)
    precision = precision_score(y_test, y_predicted, **weighted)
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, **weighted)
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, **weighted)
    return accuracy, precision, recall, f1

# Scores the current y_predicted, i.e. the output of whichever classifier
# cell above assigned it most recently.
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

### Generate data for libsvm

In [None]:
# Paths for the libsvm export: input is the preprocessed CSV, output a .svm file.
# NOTE(review): this rebinds inFile/outFile, which earlier cells also read;
# re-running those cells after this one would pick up these paths instead.
inFile  = '/Users/dzendmitry/dev/lab/category_prediction/1000000_out.csv'
outFile = '/Users/dzendmitry/dev/lab/category_prediction/1000000.svm'

In [None]:
import numpy as np
import csv

# Convert every preprocessed row into one "<label> <index>:<value> ..." line.
with open(outFile, 'w') as wfile, open(inFile, newline='') as rfile:
    max_color = 1
    for row in csv.reader(rfile, delimiter=',', quotechar='"'):
        v = np.fromstring(row[1][1:-1], dtype=np.float64, sep=' ')
        (category, max_color) = get_category(row[2], max_color)
        # Rows without a known category are not exported.
        if category == "none":
            continue
        c = naive_categories[category]
        # Feature indices start at 1.
        features = " ".join(
            "{}:{}".format(position, value)
            for position, value in enumerate(v, 1)
        )
        wfile.write("{} {}\n".format(c, features))

In [None]:
def matriceToSvmFile(outFile, X, y):
    """Write samples ``X`` with labels ``y`` to ``outFile``, one per line.

    Each line is ``<label> 1:<v1> 2:<v2> ...`` with feature indices starting
    at 1. The file is created or overwritten.
    """
    with open(outFile, 'w') as wfile:
        for label, features in zip(y, X):
            pairs = (
                "{}:{}".format(position, value)
                for position, value in enumerate(features, 1)
            )
            wfile.write("{} {}\n".format(label, " ".join(pairs)))

In [None]:
# Export the train/test split; these relative paths resolve against the
# notebook's current working directory.
matriceToSvmFile("1000000_train.svm", X_train, y_train)
matriceToSvmFile("1000000_test.svm", X_test, y_test)