# 26. Balancing by copying - Word2Vec
We want to try sample balancing on w2v as well.

## Preprocessing

In [1]:
import pandas as pd
from preprocessing import PreProcessor

pp = PreProcessor()

df = pd.read_csv('../Data/Structured_DataFrame_Main_Categories.csv', index_col=0)
df['Item Description'] = df['Item Description'].apply(lambda d: pp.preprocess(str(d)))
df

Unnamed: 0,Category,Item Description,category_id
0,Services,month huluplu gift code month huluplu code wor...,0
1,Services,pay tv sky uk sky germani hd tv much cccam ser...,0
2,Services,offici account creator extrem tag submiss fix ...,0
3,Services,vpn tor sock tutori setup vpn tor sock super s...,0
4,Services,facebook hack guid guid teach hack facebook ac...,0
...,...,...,...
109585,Drugs,gr purifi opium list gramm redefin opium pefec...,1
109586,Weapons,ship ticket order ship one gun bought must bou...,11
109587,Drugs,gram white afghani heroin full escrow gram whi...,1
109588,Drugs,gram white afghani heroin full escrow gram whi...,1


## Splitting

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["Item Description"], df.Category, test_size=0.33, random_state=0)

data_train = {'Category': y_train, 'Item_Description': X_train}
df_train = pd.DataFrame(data_train)
print(df_train.shape)

data_test = {'Category': y_test, 'Item_Description': X_test}
df_test = pd.DataFrame(data_test)
print(df_test.shape)

(73407, 2)
(36156, 2)


## Sampling

In [3]:
grouped = df_train.groupby('Category', group_keys=False)
df_train_balanced = pd.DataFrame(grouped.apply(lambda x: x.sample(grouped.size().max(), replace=True))).reset_index(drop=True)
df_train_balanced

Unnamed: 0,Category,Item_Description
0,Chemicals,iodin pure g iodin pure g pleas note reship av...
1,Chemicals,chemistri advic provid high level chemistri su...
2,Chemicals,glacial acet acid gaa ml glacial acet acid ar
3,Chemicals,aeroslab lightweight test plate aeroslab light...
4,Chemicals,kg red phosphoru reagent grade free em ship re...
...,...,...
870879,Weapons,tauru pt mm full escrow list weapon shown come...
870880,Weapons,list daza gun btc
870881,Weapons,otf mini microtech daytona tanto point plane e...
870882,Weapons,black flat kubotan self defens keyr fast ship ...


## Vectorizing

In [4]:
from gensim.models import Word2Vec
from nltk import word_tokenize
import numpy as np

def word2vec(corpus, size):
    tokenized = [word_tokenize(row) for row in corpus]
    model = Word2Vec(tokenized, size=size, workers=8)
    vectors = []
    for i, row in enumerate(tokenized):
        sentence_vectors = [model.wv[word] for word in row if word in model.wv]
        if len(sentence_vectors) == 0:
            vectors.append(np.random.uniform(low=-1, high=1, size=(128,)))
        else:
            sentence_vector = np.average(sentence_vectors, axis=0)
            vectors.append(sentence_vector)
    return vectors, model

def w2vTransform(sentence, model):
    sentence_vectors = [model.wv[word] for word in word_tokenize(sentence) if word in model.wv]
    if len(sentence_vectors) == 0:
        return np.random.uniform(low=-1, high=1, size=(128,))
    return np.average(sentence_vectors, axis=0)

X_train, model = word2vec(df_train_balanced.Item_Description, 128)
y_train = df_train_balanced.Category.values
X_test = df_test.Item_Description.apply(lambda x: w2vTransform(x, model)).tolist()
y_test = df_test.Category.values

## Training

In [5]:
from sklearn.svm import LinearSVC

model = LinearSVC()

model.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [6]:
y_pred = model.predict(X_test)

## Results

In [9]:
from sklearn import metrics

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print()
print(metrics.classification_report(y_test, y_pred, target_names=df['Category'].unique()))

Accuracy:  0.8659420289855072



ValueError: Number of classes, 14, does not match size of target_names, 109563. Try specifying the labels parameter

## Conclusion