# IMDB Sentiment Classification

## Setup

In [3]:
from argparse import ArgumentParser
from dataclasses import dataclass
import os
from typing import Any, Dict, List

from datasets import load_dataset
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from examples.utils.gpu import get_balanced_devices
from fluidml.common import Resource, Task
from fluidml.flow import Flow, GridTaskSpec, TaskSpec
from fluidml.swarm import Swarm
from fluidml.swarm.storage import LocalFileStorage

In [None]:
class DatasetFetcher(Task):
    """Task that loads the IMDB training split and returns its texts and labels."""

    def __init__(self, name: str, id_: int, fetch_param: int):
        """Forward ``name`` and ``id_`` to ``Task``.

        ``fetch_param`` is accepted but not used by this task.
        """
        super().__init__(name=name, id_=id_)

    def run(self, results: Dict[str, Any], resource: Resource):
        """Load the ``"imdb"`` dataset and collect its training examples.

        Args:
            results: Not read by this task.
            resource: Not read by this task.

        Returns:
            Dict with ``"sentences"`` (the ``text`` of every training record)
            and ``"labels"`` (the matching ``label`` values), in dataset order.
        """
        dataset = load_dataset("imdb")
        sentences = []
        labels = []
        # IMDB records carry a `text` string and a single `label`; they have
        # no `topics` field, so reading `item["topics"]` raised a KeyError.
        for item in dataset["train"]:
            sentences.append(item["text"])
            labels.append(item["label"])
        return {"sentences": sentences, "labels": labels}

In [None]:
class Preprocessor(Task):
    """Task that returns the IMDB training texts and labels.

    NOTE(review): this task loads the dataset itself and does not read any
    upstream output from ``results``; no text transformation is applied yet.
    """

    def __init__(self, name: str, id_: int, fetch_param: int):
        """Forward ``name`` and ``id_`` to ``Task``.

        ``fetch_param`` is accepted but not used by this task.
        """
        super().__init__(name=name, id_=id_)

    def run(self, results: Dict[str, Any], resource: Resource):
        """Load the ``"imdb"`` dataset and collect its training examples.

        Args:
            results: Not read by this task.
            resource: Not read by this task.

        Returns:
            Dict with ``"sentences"`` (the ``text`` of every training record)
            and ``"labels"`` (the matching ``label`` values), in dataset order.
        """
        dataset = load_dataset("imdb")
        sentences = []
        labels = []
        # IMDB records carry a `text` string and a single `label`; they have
        # no `topics` field, so reading `item["topics"]` raised a KeyError.
        for item in dataset["train"]:
            sentences.append(item["text"])
            labels.append(item["label"])
        return {"sentences": sentences, "labels": labels}

In [None]:
class TfIdfFeaturizer(Task):
    """Task that turns the pre-processed sentences into dense TF-IDF vectors."""

    def __init__(self, name: str, id_: int, tfidf_param: int):
        """Forward ``name`` and ``id_`` to ``Task``; ``tfidf_param`` is unused."""
        super().__init__(name, id_)

    def run(self, results: Dict[str, Any], resource: Resource):
        """Fit a ``TfidfVectorizer`` on the upstream sentences.

        Reads ``results["pre_process"]["sentences"]`` and returns a dict whose
        ``"vectors"`` entry is the fitted TF-IDF matrix converted with
        ``toarray()``.
        """
        vectorizer = TfidfVectorizer()
        corpus = results["pre_process"]["sentences"]
        sparse_matrix = vectorizer.fit_transform(corpus)
        return {"vectors": sparse_matrix.toarray()}

In [None]:
class GloveFeaturizer(Task):
    """Task that embeds the pre-processed sentences with pooled GloVe vectors."""

    def __init__(self, name: str, id_: int, glove_param: int):
        """Forward ``name`` and ``id_`` to ``Task``; ``glove_param`` is unused."""
        super().__init__(name, id_)

    def run(self, results: Dict[str, Any], resource: Resource):
        """Embed every upstream sentence and stack the results row-wise.

        Reads ``results["pre_process"]["sentences"]``, wraps each entry in a
        flair ``Sentence``, embeds them with ``DocumentPoolEmbeddings`` over
        ``WordEmbeddings("glove")`` and returns a dict whose ``"vectors"``
        entry is a 2-D numpy array with one row per sentence.
        """
        flair_sentences = []
        for text in results["pre_process"]["sentences"]:
            flair_sentences.append(Sentence(text))

        pooled_glove = DocumentPoolEmbeddings([WordEmbeddings("glove")])
        pooled_glove.embed(flair_sentences)

        # Move each embedding to host memory before converting it to numpy.
        per_document = [doc.embedding.cpu().numpy() for doc in flair_sentences]
        matrix = np.array(per_document).reshape(len(per_document), -1)
        return {"vectors": matrix}

In [None]:
class Trainer(Task):
    """Task that fits a logistic regression on the concatenated feature sets."""

    def __init__(self, name: str, id_: int, train_param: int):
        """Forward ``name`` and ``id_`` to ``Task``; ``train_param`` is unused."""
        super().__init__(name, id_)

    def run(self, results: Dict[str, Any], resource: Resource):
        """Train the classifier and hand model, features and labels downstream.

        Reads ``results["tfidf_featurize"]["vectors"]``,
        ``results["glove_featurize"]["vectors"]`` and
        ``results["dataset"]["labels"]``.

        Returns:
            Dict with ``"model"`` (the fitted ``LogisticRegression``),
            ``"vectors"`` (the horizontally stacked features) and ``"labels"``.
        """
        classifier = LogisticRegression(max_iter=50)

        tfidf_part = results["tfidf_featurize"]["vectors"]
        glove_part = results["glove_featurize"]["vectors"]
        # Column-wise concatenation: both feature sets describe the same rows.
        features = np.hstack((tfidf_part, glove_part))

        targets = results["dataset"]["labels"]
        classifier.fit(features, targets)

        return {
            "model": classifier,
            "vectors": features,
            "labels": targets,
        }

In [None]:
class Evaluater(Task):
    """Task that scores the trained model on the vectors it was trained on."""

    def __init__(self, name: str, id_: int, eval_param: int):
        """Forward ``name`` and ``id_`` to ``Task``; ``eval_param`` is unused."""
        super().__init__(name, id_)

    def run(self, results: Dict[str, Any], resource: Resource):
        """Predict with the upstream model and build a classification report.

        Reads ``"model"``, ``"vectors"`` and ``"labels"`` from
        ``results["train"]`` and returns a dict whose
        ``"classification_report"`` entry is the output of
        ``sklearn.metrics.classification_report``.
        """
        predicted = results["train"]["model"].predict(results["train"]["vectors"])
        expected = results["train"]["labels"]
        return {"classification_report": classification_report(expected, predicted)}

In [None]:
# create all task specs
# NOTE: downstream tasks index `results` by these names (e.g. Evaluater reads
# results["train"]), so every `name` must equal the key the consuming Task
# classes look up: "dataset", "pre_process", "glove_featurize", "tfidf_featurize".
fetch = TaskSpec(task=DatasetFetcher, name="dataset", task_kwargs={"fetch_param": 1})
# Preprocessor.__init__ takes `fetch_param`; passing `process_param` would fail
# with an unexpected-keyword TypeError when the task is instantiated.
preprocess = GridTaskSpec(task=Preprocessor, name="pre_process", gs_config={"fetch_param": 1})
featurize_glove = GridTaskSpec(task=GloveFeaturizer, name="glove_featurize", gs_config={"glove_param": [5, 10]})
featurize_tfidf = GridTaskSpec(task=TfIdfFeaturizer, name="tfidf_featurize", gs_config={"tfidf_param": 10})
train = GridTaskSpec(task=Trainer, name="train", gs_config={"train_param": 10})
evaluate = GridTaskSpec(task=Evaluater, name="evaluate", gs_config={"eval_param": 5})

In [None]:
# register dependencies between tasks
# Each pair is (task spec, list of task specs it depends on).
for _spec, _upstream in (
    (preprocess, [fetch]),
    (featurize_glove, [preprocess]),
    (featurize_tfidf, [preprocess]),
    (train, [fetch, featurize_glove, featurize_tfidf]),
    (evaluate, [train]),
):
    _spec.requires(_upstream)

In [None]:
# all tasks
# Listed in pipeline order: fetch -> preprocess -> featurize -> train -> evaluate.
tasks = [
    fetch,
    preprocess,
    featurize_glove,
    featurize_tfidf,
    train,
    evaluate,
]

In [None]:
# Run the whole pipeline inside a Swarm context and print the evaluation output.
# NOTE(review): `n_dolphins` is presumably the number of parallel workers and
# `refresh_every` a polling interval — confirm against the fluidml docs.
with Swarm(n_dolphins=3, refresh_every=5) as swarm:
    flow = Flow(swarm=swarm)
    # `results` is what Flow.run returns for the submitted task specs.
    results = flow.run(tasks)
# Printed after the `with` block, i.e. once the swarm context has exited.
print(results["evaluate"])