In [None]:
# Mount Google Drive into the Colab filesystem; the dataset loaded later in
# this notebook is read from a path under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
import spacy
import re
from gensim.models import Word2Vec

In [None]:
# Load the news dataset from Drive. The file is read as JSON Lines
# (lines=True): one record per line.
dataset_path = '/content/drive/MyDrive/Machine_Learning/projekt/news_dataset_vectors.json'
news_df = pd.read_json(
    dataset_path,
    orient='records',
    lines=True,
)

In [None]:
# Show the loaded frame as the cell output to check its columns and rows.
news_df

Unnamed: 0,Headline,Body,Label,Headline+Body,tokenized_text,article_vector
0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,four way bob corker skewer donald trump image ...,"[four, way, bob, corker, skewer, donald, trump...","[0.4603399634, -0.10934362560000001, 0.3106295..."
1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,linklater war veteran comedy speak modern amer...,"[linklater, war, veteran, comedy, speak, moder...","[0.22174863520000002, -0.19731563330000002, 0...."
2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,trump fight corker jeopardize legislative agen...,"[trump, fight, corker, jeopardize, legislative...","[0.3552994728, -0.051323484600000004, 0.275978..."
3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,egypt cheiron win tie pemex mexican onshore oi...,"[egypt, cheiron, win, tie, pemex, mexican, ons...","[-0.2082066685, 0.1587495506, -0.1840502620000..."
4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,jason aldean open snl vegas tribute country si...,"[jason, aldean, open, snl, vegas, tribute, cou...","[-0.0687487796, -0.6214129329, 0.1104530022, 0..."
...,...,...,...,...,...,...
3983,CNN and Globalist Exposed - Steve Quayle and A...,"Vietnam Is in Great Danger, You Must Publish a...",0,cnn globalist expose steve quayle alex jones v...,"[cnn, globalist, expose, steve, quayle, alex, ...","[0.6354882121000001, 0.3110466599, 0.533089518..."
3984,Trends to Watch,Trends to Watch\n% of readers think this story...,0,trend watch trend watch reader think story fac...,"[trend, watch, trend, watch, reader, think, st...","[-0.19917035100000002, 0.0497304536, 0.0947959..."
3985,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0,trump jr soon give minute speech trump jr soon...,"[trump, jr, soon, give, minute, speech, trump,...","[0.0815322027, -0.2945052683, 0.2023320347, 0...."
3986,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1,china accept overseas trial datum bid speed dr...,"[china, accept, overseas, trial, datum, bid, s...","[-0.26024410130000003, 0.008547413200000001, 0..."


MODELS

In [None]:
# Features: stack the per-article vectors into a single 2-D array, one row
# per article. Target: the 'Label' column.
article_vectors = news_df['article_vector'].values
X = np.vstack(article_vectors)
y = news_df['Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate classifiers, each paired with the hyperparameter grid to search.
# Maps display name -> (unfitted estimator, param_grid for GridSearchCV).
#
# Estimators that use randomness during fitting are seeded so that repeated
# runs select the same best parameters and produce the same reports; 42 is
# the same seed the notebook uses for the train/test split.
models = {
    "Random Forest": (RandomForestClassifier(random_state=42), {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10]
    }),
    # liblinear shuffles the data internally, so it is seeded as well.
    "Logistic Regression": (LogisticRegression(random_state=42), {
        "penalty": ['l1', 'l2'],
        "C": [0.1, 1, 10],
        # liblinear supports both the l1 and the l2 penalty listed above.
        "solver": ['liblinear']
    }),
    # SVC only uses its random_state for probability estimates, which are
    # not enabled here, so it is left unseeded.
    "Support Vector Machine": (SVC(), {
        "C": [0.1, 1, 10],
        "kernel": ['linear', 'rbf'],
        # NOTE: gamma is ignored by the linear kernel, so the linear
        # candidates are evaluated once per gamma value with equal scores.
        "gamma": ['scale', 'auto']
    })
}

In [None]:
# Hyperparameter optimization
# For every candidate model: grid-search its hyperparameters with 5-fold
# cross-validation on the training set, record the best parameters, predict
# on the held-out test set and print a classification report.
best_params = {}   # model name -> best hyperparameters found by the search
predictions = {}   # model name -> predictions of the best estimator on X_test

for model_name, (model, param_grid) in models.items():
    # With scoring='precision' candidates are ranked by the precision of the
    # positive class (label 1), averaged over the 5 folds.
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='precision', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Save the winning hyperparameter combination.
    best_params[model_name] = grid_search.best_params_

    # GridSearchCV uses refit=True by default, so best_estimator_ has already
    # been retrained on the whole training set with the best parameters.
    # Fitting it again would only repeat that work.
    best_model = grid_search.best_estimator_

    # Predict on the test set.
    y_pred = best_model.predict(X_test)
    predictions[model_name] = y_pred

    # Print the report.
    print(f"Model: {model_name} (Najlepsze parametry: {best_params[model_name]})")
    print(classification_report(y_test, y_pred))
    print("\n")

Model: Random Forest (Najlepsze parametry: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 300})
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       450
           1       0.95      0.95      0.95       348

    accuracy                           0.96       798
   macro avg       0.96      0.96      0.96       798
weighted avg       0.96      0.96      0.96       798



Model: Logistic Regression (Najlepsze parametry: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'})
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       450
           1       0.95      0.94      0.95       348

    accuracy                           0.95       798
   macro avg       0.95      0.95      0.95       798
weighted avg       0.95      0.95      0.95       798



Model: Support Vector Machine (Najlepsze parametry: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'})
              precision    recall  f1

In [None]:
# Result analysis
# Compute each model's weighted-average precision on the test set exactly
# once, then pick the model with the highest value. On a tie, max() keeps the
# model that was inserted into `predictions` first.
weighted_precision = {
    name: classification_report(y_test, model_pred, output_dict=True)['weighted avg']['precision']
    for name, model_pred in predictions.items()
}

best_model_name = max(weighted_precision, key=weighted_precision.get)
best_precision = weighted_precision[best_model_name]

print(f"Best model for highest precision: {best_model_name}")
print(f"Precision value for best model: {best_precision}")

Best model for highest precision: Support Vector Machine
Precision value for best model: 0.9825330871037242
