# 26. Main Categories - Word2Vec
For comparison, we want to check the result without balancing with word2vec and main categories, since we didn't know the score of this combination yet.

## Preprocessing

In [54]:
import pandas as pd
from preprocessing import PreProcessor
import warnings

warnings.simplefilter("ignore")

pp = PreProcessor()

df = pd.read_csv('../Data/Structured_DataFrame_Main_Categories.csv', index_col=0)
df['Item Description'] = df['Item Description'].apply(lambda d: pp.preprocess(str(d)))
df

Unnamed: 0,Category,Item Description,category_id
0,Services,month huluplu gift code month huluplu code wor...,0
1,Services,pay tv sky uk sky germani hd tv much cccam ser...,0
2,Services,offici account creator extrem tag submiss fix ...,0
3,Services,vpn tor sock tutori setup vpn tor sock super s...,0
4,Services,facebook hack guid guid teach hack facebook ac...,0
...,...,...,...
109585,Drugs,gr purifi opium list gramm redefin opium pefec...,1
109586,Weapons,ship ticket order ship one gun bought must bou...,11
109587,Drugs,gram white afghani heroin full escrow gram whi...,1
109588,Drugs,gram white afghani heroin full escrow gram whi...,1


## Splitting

In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["Item Description"], df.Category, test_size=0.33, random_state=0)

data_train = {'Category': y_train, 'Item_Description': X_train}
df_train = pd.DataFrame(data_train)
print(df_train.shape)

data_test = {'Category': y_test, 'Item_Description': X_test}
df_test = pd.DataFrame(data_test)
print(df_test.shape)

(73407, 2)
(36156, 2)


## Vectorizing

In [56]:
from gensim.models import Word2Vec
from nltk import word_tokenize
import numpy as np

def word2vec(corpus, size):
    tokenized = [word_tokenize(row) for row in corpus]
    model = Word2Vec(tokenized, size=size, workers=8)
    vectors = []
    for i, row in enumerate(tokenized):
        sentence_vectors = [model.wv[word] for word in row if word in model.wv]
        if len(sentence_vectors) == 0:
            vectors.append(np.random.uniform(low=-1, high=1, size=(128,)))
        else:
            sentence_vector = np.average(sentence_vectors, axis=0)
            vectors.append(sentence_vector)
    return vectors, model

def w2vTransform(sentence, model):
    sentence_vectors = [model.wv[word] for word in word_tokenize(sentence) if word in model.wv]
    if len(sentence_vectors) == 0:
        return np.random.uniform(low=-1, high=1, size=(128,))
    return np.average(sentence_vectors, axis=0)

X_train, model = word2vec(df_train.Item_Description, 128)
y_train = df_train.Category.values
X_test = df_test.Item_Description.apply(lambda x: w2vTransform(x, model)).tolist()
y_test = df_test.Category.values

## Training

In [57]:
from sklearn.svm import LinearSVC

model = LinearSVC()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)



## Results

In [58]:
from sklearn import metrics

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print()
print(metrics.classification_report(y_test, y_pred, target_names=df['Category'].unique()))

Accuracy:  0.922281225799314

                    precision    recall  f1-score   support

          Services       1.00      0.10      0.18        31
             Drugs       0.82      0.79      0.81       705
         Forgeries       0.72      0.69      0.71       675
           Tobacco       0.67      0.61      0.63       275
      Counterfeits       0.96      1.00      0.98     30807
              Data       0.73      0.12      0.21       192
       Information       0.82      0.76      0.79       323
       Electronics       0.47      0.57      0.52       725
Drug paraphernalia       0.38      0.12      0.19       642
             Other       0.67      0.57      0.62       129
           Jewelry       0.44      0.01      0.02       474
           Weapons       0.49      0.53      0.51       837
              Info       0.89      0.66      0.76       134
         Chemicals       0.65      0.46      0.54       207

         micro avg       0.92      0.92      0.92     36156
        

## Conclusion