In [1]:
import yaml
import pandas as pd
from joblib import dump
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Load the run parameters (the dataset, model and prediction paths read by the
# cells below) from params.yaml. The encoding is pinned to UTF-8 so parsing does
# not depend on the platform's locale default; safe_load only builds plain
# Python objects and never executes tags from the file.
with open("params.yaml", "r", encoding="utf-8") as stream:
    config = yaml.safe_load(stream)

In [2]:
# Read the training features and labels from the CSV paths given in params.yaml.
# read_csv parses empty fields as NaN and TfidfVectorizer rejects NaN documents,
# so missing `text_cleaned` entries are replaced with empty strings here. The
# rows are filled rather than dropped so they stay aligned with train_y.
train_X = pd.read_csv(config["train_X_dataset_path"]).fillna({"text_cleaned": ""})
train_y = pd.read_csv(config["train_y_dataset_path"])

In [3]:
# Text-classification pipeline: TF-IDF features feeding a linear SVM.
# LinearSVC uses a pseudo-random generator when it solves the dual problem, so
# the seed is fixed to make the persisted model reproducible across runs. An
# optional `random_state` key in params.yaml overrides the default of 42.
pipe = Pipeline([
    ("tf-idf", TfidfVectorizer()),
    ("linear-svc", LinearSVC(random_state=config.get("random_state", 42))),
])

In [4]:
# Train on the cleaned-text column; the label frame is flattened to a 1-D
# array before being handed to the estimator.
train_labels = train_y.values.ravel()
pipe.fit(train_X["text_cleaned"], train_labels)

# Persist the fitted pipeline to the configured path. dump() is left as the
# last expression so its return value is shown as the cell output.
dump(pipe, config["model_path"])

['artifacts/model.joblib']

In [5]:
# Score the training set with the fitted pipeline, then write the predictions
# to the configured CSV path without the row index.
train_predictions = pipe.predict(train_X["text_cleaned"])
predictions_frame = pd.DataFrame(train_predictions)
predictions_frame.to_csv(config["train_predictions_path"], index=False)