In [2]:
import spacy

nlp = spacy.load("en_core_web_lg")

In [3]:
doc = nlp("dog cat banana afskdsd")

for token in doc:
    print(token.text, "|", token.has_vector, 'OOV:', token.is_oov)

dog | True OOV: False
cat | True OOV: False
banana | True OOV: False
afskdsd | False OOV: True


In [4]:
def print_similarity(base_word, words_to_compare):
    base_token = nlp(base_word)
    doc = nlp(words_to_compare)
    for token in doc:
        print(f"{token.text} <-> {base_token.text}: ", token.similarity(base_token))

In [7]:
print_similarity("iphone", "apple samsung iphone dog kitten oppo")

apple <-> iphone:  0.6339781147910419
samsung <-> iphone:  0.6678678666301947
iphone <-> iphone:  1.0
dog <-> iphone:  0.17431037640553934
kitten <-> iphone:  0.14685812907484028
oppo <-> iphone:  0.13549454045927614


In [8]:
import pandas as pd

df = pd.read_csv("Fake_Real_Data.csv")
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [9]:
df.label.value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [10]:
df['label_num'] = df.label.map({
    'Fake': 0,
    'Real': 1
})
df.head(5)

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [11]:
import spacy

nlp = spacy.load('en_core_web_lg')

In [12]:
df['vector'] = df.Text.apply(lambda x: nlp(x).vector)
df.head()

Unnamed: 0,Text,label,label_num,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.103623025, 0.17802684, -0.11873861, -0.034..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-0.0063406364, 0.16712041, -0.06661373, 0.017..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-0.122753024, 0.17192385, -0.024732638, -0.06..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-0.027337318, 0.12501417, -0.0073965387, -0.0..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-0.032708026, 0.093958504, -0.03287002, -0.00..."


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.vector, 
                                                    df.label_num, 
                                                    test_size = 0.2, 
                                                    random_state = 2022)

In [14]:
import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)
X_train_2d

array([[-0.02370346,  0.14819953, -0.05906299, ..., -0.06582212,
        -0.05378761,  0.08668853],
       [-0.01595326,  0.15394837, -0.10800642, ..., -0.03003666,
        -0.04334445,  0.03076661],
       [-0.04449651,  0.11169833, -0.04756551, ..., -0.10499363,
        -0.00837316,  0.06351685],
       ...,
       [ 0.02167883,  0.12635042, -0.01003216, ..., -0.08063941,
        -0.06881595,  0.04882506],
       [-0.07091133,  0.08315557, -0.06580248, ..., -0.06301989,
         0.02095402,  0.09888683],
       [-0.08993341,  0.14425951, -0.14141384, ..., -0.03444797,
         0.02387965,  0.06281336]], dtype=float32)

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.fit_transform(X_test_2d)

clf = MultinomialNB()

clf.fit(scaled_train_embed, y_train)

MultinomialNB()

In [17]:
from sklearn.metrics import classification_report
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95      1063
           1       0.93      0.97      0.95       917

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980

