In [47]:
import pickle
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

### Read preprocessed data

In [48]:
df_sentences = pd.read_csv('data/preprocessed_data.csv')
df_sentences

Unnamed: 0,Content,Label
0,binance towel come everywhere include breakfas...,1
1,drop sol address make sure follow dont ask bac...,1
2,take one good altcoin change world sparkle,2
3,disrespectful one top 3 favorite,2
4,congressional republican call affordable conne...,1
...,...,...
838,shakira 's amazing 12th studio album 'las muje...,2
839,news tesla overtook mercedes second large sell...,2
840,ultimate guide festival celebrate latin music ...,1
841,youre invite see rogueport may 23 papermarioth...,2


### Split data into train and test set

In [49]:
train, test = train_test_split(df_sentences, test_size=0.2, random_state=42)
train['Label'].value_counts()

Label
1    351
2    181
0    142
Name: count, dtype: int64

### Split into tokens

In [50]:
train_tokenized = [[word for word in sentence.split()] for sentence in train['Content']]
test_tokenized = [[word for word in sentence.split()] for sentence in test['Content']]
train_tokenized

[['there',
  'extended',
  'episode',
  'vanderpump',
  'rule',
  'online',
  'heres',
  'watch',
  'stream',
  'show',
  'without',
  'cable'],
 ['recent',
  'article',
  'press',
  "'winning",
  "'",
  'proxy',
  'contest',
  'nelson',
  'peltz',
  'base',
  'early',
  'election',
  'return',
  'leak',
  'medium',
  "n't",
  'investment',
  'disney',
  'think',
  'useful',
  'point',
  'inappropriateness'],
 ['sunlight', 'best', 'disinfectant'],
 ['katy',
  'perry',
  'teenage',
  'dream',
  'album',
  'cover',
  'actually',
  'painting'],
 ['cry', 'face', 'hold', 'back', 'tear', 'purple', 'heart', 'love'],
 ['know', 'legacy', 'medium', 'notification'],
 ['entre',
  'parntesis',
  'shakira',
  'grupo',
  'frontera',
  'reach',
  '10',
  'million',
  'view',
  'youtube',
  'deserve',
  'waayy',
  'cowboy',
  'hat',
  'face',
  'watch'],
 ['city',
  'dominate',
  'highway',
  'car',
  "'s",
  'partly',
  'one',
  'influential',
  'people',
  "'ve",
  'never',
  'hear',
  'norman',
  'b

### Train Word2Vec model (skip-gram)

In [51]:
model_skipGram = Word2Vec(sentences=train_tokenized, vector_size=100, window=5,  min_count=1, workers=4, sg=1)
# save the trained model
model_skipGram.save('model/modelSkipGram.bin')

In [52]:
# load the trained model
model_sg = Word2Vec.load('model/modelSkipGram.bin')
print(type(model_sg.wv['binance']))
model_sg.wv['binance']

<class 'numpy.ndarray'>


array([-0.01715061,  0.0024371 , -0.00404767,  0.00152807,  0.00076108,
       -0.01306054,  0.00834662,  0.02935744, -0.00291367, -0.00995072,
       -0.00447798, -0.02028175, -0.00466269,  0.01310427, -0.00465026,
       -0.00780595,  0.0033968 , -0.02193251, -0.00644233, -0.00607408,
       -0.00101301,  0.01504509,  0.00941008, -0.00095922,  0.00543354,
        0.0048363 , -0.0138449 , -0.00268137, -0.00530938,  0.00632365,
        0.01307777,  0.00159252,  0.00789268,  0.0012772 , -0.00772045,
        0.01575958,  0.00909421, -0.01588465, -0.01566426, -0.0183822 ,
       -0.0088186 , -0.01197623,  0.00324388, -0.00476131, -0.00024461,
       -0.01185415, -0.01069439, -0.01249297,  0.0007575 ,  0.00035558,
        0.00949792, -0.01234568, -0.00860571, -0.01030694, -0.01203183,
        0.00513341,  0.01398354, -0.0063936 , -0.00528979,  0.00699421,
        0.01028918,  0.00812965, -0.01125005,  0.00111191, -0.01891954,
        0.01733762,  0.00475594, -0.00304403, -0.00749958,  0.00

### Convert all words to vectors

In [53]:
def convert_to_vector(model, tokenized_sentence):
    vector = []
    for word in tokenized_sentence:
        if word in model.wv:
            vector.append(model.wv[word])
    
    # return the average of the vectors
    output = np.mean(vector, axis=0) if vector else np.zeros(model.vector_size)
    
    return output

train_vectors = np.array([convert_to_vector(model_sg, sentence) for sentence in train_tokenized])
test_vectors = np.array([convert_to_vector(model_sg, sentence) for sentence in test_tokenized])
train_vectors.shape

(674, 100)

### Cross-validation using RandomForestClassifier

In [54]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
cross_val_score(clf, train_vectors, train['Label'], cv=5)

array([0.66666667, 0.60740741, 0.60740741, 0.62962963, 0.59701493])

### Train the classifier and get the performance on test set

In [55]:
from sklearn.metrics import accuracy_score

# train the model
clf.fit(train_vectors, train['Label'])

# predict the test set
y_pred = clf.predict(test_vectors)
accuracy_score(test['Label'], y_pred)

0.6804733727810651