In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(1, '/Users/giorgigonashvili/Desktop/Hackathon/FakeNewsDetectorBot/bot_venv/lib/python3.10/site-packages')

## Basic imports

In [3]:
import numpy as np 
import pandas as pd

## Read data

In [4]:
def read_data(dataset_name, label):
    with open(dataset_name) as f:
        lines = f.readlines()
        
    df = pd.DataFrame(lines, columns = ['text'])
    mask = df['text'] != '\n'

    df['label'] = label
    return df[mask].copy()
    

In [5]:
df = pd.concat([read_data('sample.txt', 0), read_data('fakes.txt', 1)], axis=0)
df = df.sample(frac=1)
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,text,label
0,"at almost the same moment, a russian influence...",0
1,"“over the next few months, we will begin worki...",0
2,the shelling of the zaporizhzhia nuclear power...,0
3,](https://www.washingtonpost.com/world/2022/08...,0
4,videos from the scene and an assessment by loc...,0


## Vectorise data

In [6]:
import nltk
nltk.download('punkt')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giorgigonashvili/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
data = df['text']
tagged_tex = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]


In [8]:

def get_trained_model(vec_size, tagged_data):
    max_epochs = 100
#     vec_size = 20
    alpha = 0.025

    model = Doc2Vec(vector_size=vec_size,
                  alpha=alpha, 
                  min_alpha=0.00025,
                  min_count=1,
                  dm =1)

    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                  total_examples=model.corpus_count,
                  epochs=10)
        # print(tagged_data)

        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha
  
    return model

model_tex = get_trained_model(64, tagged_tex)
model_tex

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

<gensim.models.doc2vec.Doc2Vec at 0x107b78550>

In [9]:
def predict(model, sentences):
    vectors = list()
    for x in sentences:
        tokenized = word_tokenize(x.lower())
        vector = model.infer_vector(tokenized)
        vectors.append(vector)
    return vectors
#     fin_list.append(vec)
X = predict(model_tex, data)
y = df['label']

print(len(X), y.shape)

819 (819,)


## Train model

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [11]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)

In [18]:
test_data = ['the ukrainian military extended the fight deeper into']

test_x = predict(model_tex, test_data)
neighbors = knn.kneighbors(test_x)
test_pred = knn.predict(test_x)
test_pred

array([0])

In [96]:
test_pred[1][0]

array([537, 173, 314])

In [97]:
data[test_pred[1][0]]

537           [**read more](https://nyti.ms/3vk8lnf)**\n
173    ](https://www.washingtonpost.com/world/2022/08...
314           [**read more](https://nyti.ms/3qneqz5)**\n
Name: text, dtype: object

## Save models

In [12]:
model_tex.save("text_vectorizer.model")

In [13]:
import pickle

with open('knn', 'wb') as files:
    pickle.dump(knn, files)

In [20]:

with open('knn' , 'rb') as f:
    blaa = pickle.load(f)

In [21]:
blaa.predict(test_x)

array([0])

In [23]:
test_pred[0]

0