In [None]:
# Mount Google Drive at /content/drive so files stored there (the dataset read
# later in this notebook lives under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import sklearn as sk
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the preprocessed news articles; lines=True reads the file as JSON Lines
# (one record per line).
news_df = pd.read_json('/content/drive/MyDrive/Machine_Learning/projekt/news_df_preprocessed.json', orient='records', lines=True)

In [None]:
# Preview the first rows to check which columns were loaded.
news_df.head()

Unnamed: 0,Headline,Body,Label,Headline+Body
0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,four way bob corker skewer donald trump image ...
1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,linklater war veteran comedy speak modern amer...
2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,trump fight corker jeopardize legislative agen...
3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,egypt cheiron win tie pemex mexican onshore oi...
4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,jason aldean open snl vegas tribute country si...


TOKENIZATION

In [None]:
# Split each 'Headline+Body' string on whitespace to get one token list per article.
# NOTE(review): assumes every cell in 'Headline+Body' is a string; a missing value
# would make x.split() fail — confirm the preprocessing step guarantees this.
news_df['tokenized_text'] = news_df['Headline+Body'].apply(lambda x: x.split())

MODEL

In [None]:
# Word2Vec hyperparameters (consumed by the training cells that follow)
vector_size = 200  # passed as Word2Vec(vector_size=...) and as the length of the zero fallback vector
window = 5  # passed as Word2Vec(window=...)
min_count = 5  # passed as Word2Vec(min_count=...)
epochs = 50  # used only by the explicit w2v_model.train(...) call, not by the constructor

In [None]:
# Train Word2Vec on the tokenized articles
# NOTE(review): no seed is passed and several workers are used, so the learned
# embeddings may differ from run to run — confirm whether reproducibility matters.
w2v_model = Word2Vec(
    sentences=news_df['tokenized_text'],
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=4  # Number of cores
)

In [None]:
# Training for additional epochs (optional)
# NOTE(review): the constructor above was already given `sentences`; if it trained
# on them, these `epochs` passes come on top of that — confirm this is intended.
w2v_model.train(news_df['tokenized_text'], total_examples=w2v_model.corpus_count, epochs=epochs)



(54695328, 57676450)

In [None]:
# Sample word vector
print(w2v_model.wv['politics'])  # vector for "politics"

[ 0.37220612  0.19615623 -0.00375452 -0.1703136  -0.17877764 -0.20223732
  2.408265    1.4184501  -0.363228   -0.8808645  -0.6583355  -0.81538635
  1.4338886   3.1924672   0.5595894  -0.49617544 -0.11942227 -1.4604161
  0.17268632 -1.5927088   0.667167   -0.2724807   0.25898042 -1.9763734
 -1.4870657  -1.0931965  -1.3909725  -0.02021848 -2.15151    -0.39372241
  0.48234928  1.5995408   0.34619525  0.5660968  -1.2551457  -1.3711506
 -0.7097035  -0.37470078 -0.85390854  1.1476412  -0.27939004  1.7269415
 -1.1979383  -2.2133038   1.6033461  -0.58214736  1.6139984  -0.04938535
  2.8783572  -2.7831779  -0.22749999 -0.12337209  1.3012351  -1.6613642
 -1.4660261   1.6859261   0.5299649   0.9530054   1.3249013  -0.03585155
  0.17712921  0.7666422  -0.40199518 -1.1283077  -0.03217662  0.88231003
  1.5188154   0.82660687  0.18123233 -1.6877174   0.01134804  0.67544055
 -0.7307195  -0.66877556  2.1009648   1.6177393  -1.5449227   0.7469645
 -1.8224967  -0.67782336 -1.3566854  -0.02647039 -1.58490

In [None]:
# Five words most similar to "trump"
print(w2v_model.wv.most_similar('trump', topn=5))

[('corker', 0.4927635192871094), ('obama', 0.44667473435401917), ('tillerson', 0.3904856741428375), ('president', 0.3881445825099945), ('clinton', 0.38262850046157837)]


In [None]:
# Function that calculates the average vector of one article's tokens
def get_average_word2vec(tokens, model, vector_size):
    """Return the mean Word2Vec embedding of one tokenized document.

    Tokens that are not in the model's vocabulary are skipped.

    Args:
        tokens: Iterable of string tokens belonging to a single document.
        model: Trained model exposing a ``wv`` keyed-vectors object that
            supports ``in`` checks and list indexing.
        vector_size: Length of the zero vector returned when none of the
            tokens is in the vocabulary.

    Returns:
        1-D NumPy array: the element-wise mean of the known tokens' vectors,
        or an all-zero vector of length ``vector_size`` if no token is known.
    """
    keyed_vectors = model.wv
    valid_tokens = [t for t in tokens if t in keyed_vectors]
    if not valid_tokens:
        # Use the embeddings' dtype for the fallback; a default float64 zero
        # vector would upcast the whole feature matrix once all article
        # vectors are stacked together.
        return np.zeros(vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(keyed_vectors[valid_tokens], axis=0)

In [None]:
# Add each article's averaged word vector as a new 'article_vector' column
news_df['article_vector'] = news_df['tokenized_text'].apply(
    lambda x: get_average_word2vec(x, w2v_model, vector_size)
)

In [None]:
# Spot-check the source text next to the averaged vector computed for it.
print(news_df[['Headline+Body', 'article_vector']].head())

                                       Headline+Body  \
0  four way bob corker skewer donald trump image ...   
1  linklater war veteran comedy speak modern amer...   
2  trump fight corker jeopardize legislative agen...   
3  egypt cheiron win tie pemex mexican onshore oi...   
4  jason aldean open snl vegas tribute country si...   

                                      article_vector  
0  [0.46033996, -0.109343626, 0.31062958, 0.13282...  
1  [0.22174864, -0.19731563, 0.11542927, -0.00832...  
2  [0.35529947, -0.051323485, 0.2759785, 0.063373...  
3  [-0.20820667, 0.15874955, -0.18405026, -0.2891...  
4  [-0.06874878, -0.62141293, 0.110453, 0.2759600...  


In [None]:
# Stack the per-article vectors into one feature matrix (one row per article),
# take the labels, and hold out 20% of the rows as the test set.
X = np.vstack(news_df['article_vector'].to_numpy())
y = news_df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report how many samples ended up in each split
print(f'Training set: {X_train.shape[0]} samples')
print(f'test set: {X_test.shape[0]} samples')

Training set: 3190 samples
test set: 798 samples


In [None]:
# Share of all samples that landed in the training split, as a percentage;
# the test share is reported as the remainder.
n_train, n_test = X_train.shape[0], X_test.shape[0]
proportion = (n_train / (n_train + n_test)) * 100
print(f'Proportion of training set: {proportion:.2f}%')
print(f'Proportion of test set: {100-proportion:.2f}%')

Proportion of training set: 79.99%
Proportion of test set: 20.01%


CLASSIFICATION MODELS

In [None]:
# Candidate classifiers, keyed by the display name used in the report.
# The random forest is seeded with the same value as the train/test split so
# that re-running the notebook reproduces its classification report.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC()
}

In [None]:
# Fit every candidate classifier on the training split and print its
# classification report for the held-out test split.
for model_name, model in models.items():
    # fit() returns the fitted estimator, so prediction is chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f"Model: {model_name}")
    print(classification_report(y_test, y_pred))
    print("\n")

Model: Random Forest
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       450
           1       0.96      0.94      0.95       348

    accuracy                           0.96       798
   macro avg       0.96      0.96      0.96       798
weighted avg       0.96      0.96      0.96       798



Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       450
           1       0.95      0.95      0.95       348

    accuracy                           0.96       798
   macro avg       0.96      0.96      0.96       798
weighted avg       0.96      0.96      0.96       798



Model: Support Vector Machine
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       450
           1       0.98      0.97      0.98       348

    accuracy                           0.98       798
   macro avg       0.98      0.98      0.98   

SAVING

In [None]:
# Persist the DataFrame, including the 'tokenized_text' and 'article_vector'
# columns added in this notebook, as JSON Lines (one record per line).
news_df.to_json("/content/drive/MyDrive/Machine_Learning/projekt/news_dataset_vectors.json", orient="records", lines=True)