# Model Training

In [3]:
import numpy as np

X = np.load('data/features.npy')
y= np.load('data/labels.npy')

In [32]:
X.shape

(71537, 100)

In [4]:
y

array([1., 1., 0., ..., 0., 0., 1.], dtype=float32)

In [42]:
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42
SUBSET_SIZE = 71537
TRAIN_SPLIT = 0.8

X_subset = X[:SUBSET_SIZE]
y_subset = y[:SUBSET_SIZE]

#X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.2, random_state=RANDOM_SEED)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

## Connect to MLFLOW

In [11]:
import mlflow

EXPERIMENT_NAME = 'fake_news_detector'

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/725549188942396702', creation_time=1699380828640, experiment_id='725549188942396702', last_update_time=1699380828640, lifecycle_stage='active', name='fake_news_detector', tags={}>


In [43]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score

run_name = 'run-fake-news-detector-SVC'

modelSVC = SVC(random_state=RANDOM_SEED)

with mlflow.start_run(
    experiment_id=experiment.experiment_id,
    run_name=run_name
) as run:
     # set the tags
    mlflow.set_tags({
        "default_model_params":False,
        "model": "SVC",
        "author": "@dcc2k",
    })
    
    # Log a parameter (key-value pair)
    # Log the model parameters
    mlflow.log_param("random_seed", RANDOM_SEED)
    mlflow.log_param("train_size", TRAIN_SPLIT)
    mlflow.log_param("subset_size", SUBSET_SIZE)
    
    modelSVC.fit(X_train,y_train)
    
    y_pred = modelSVC.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precission = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    
     # Log a metric; metrics can be updated throughout the run
    mlflow.log_metric("accuracy", round(accuracy,3))
    mlflow.log_metric("precision", round(precission,3))
    mlflow.log_metric("recall", round(recall,3))

In [49]:
run_name = 'run-fake-news-detector-SVC'

mlflow.set_tracking_uri("http://127.0.0.1:5000")

with mlflow.start_run(
    experiment_id=experiment.experiment_id,
    run_name=run_name,
) as run:
    
    # log the confusion matrix as an artifact
    mlflow.sklearn.log_model(modelSVC, "SVC_model") 
    mlflow.set_tags({"model": "SVC"})

    # Print the run ID
    print(f"Run ID: {run.info.run_id}")

Run ID: af1b4f01c5774039bbdbb00c811f612e




In [44]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score

run_name = 'run-fake-news-detector-LogisticRegression'

modelLR = LogisticRegression(random_state=RANDOM_SEED)

with mlflow.start_run(
    experiment_id=experiment.experiment_id,
    run_name=run_name
) as run:
     # set the tags
    mlflow.set_tags({
        "default_model_params":False,
        "model": "LogisticRegression",
        "author": "@dcc2k",
    })
    
    # Log a parameter (key-value pair)
    # Log the model parameters
    mlflow.log_param("random_seed", RANDOM_SEED)
    mlflow.log_param("train_size", TRAIN_SPLIT)
    mlflow.log_param("subset_size", SUBSET_SIZE)
    
    modelLR.fit(X_train,y_train)
    
    y_pred = modelLR.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precission = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    
     # Log a metric; metrics can be updated throughout the run
    mlflow.log_metric("accuracy", round(accuracy,3))
    mlflow.log_metric("precision", round(precission,3))
    mlflow.log_metric("recall", round(recall,3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, precision_score, recall_score

run_name = 'run-fake-news-detector-KMeans'

modelKM = KMeans(n_clusters=2)

with mlflow.start_run(
    experiment_id=experiment.experiment_id,
    run_name=run_name
) as run:
     # set the tags
    mlflow.set_tags({
        "default_model_params":False,
        "model": "KMeans",
        "author": "@dcc2k",
    })
    
    # Log a parameter (key-value pair)
    # Log the model parameters
    mlflow.log_param("n_clusters", 2)
    mlflow.log_param("train_size", TRAIN_SPLIT)
    mlflow.log_param("subset_size", SUBSET_SIZE)
    
    modelKM.fit(X_train,y_train)
    
    y_pred = modelKM.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precission = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    
     # Log a metric; metrics can be updated throughout the run
    mlflow.log_metric("accuracy", round(accuracy,3))
    mlflow.log_metric("precision", round(precission,3))
    mlflow.log_metric("recall", round(recall,3))

  super()._check_params_vs_input(X, default_n_init=10)
