In [None]:
# !pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus; the three "house" entries differ only in case and plural form.
sentences = [
    "EEBDA is fun!",
    "EEBDA is a great course.",
    "house",
    "House",
    "Houses",
]

# num_words: the maximum number of words to keep, based on word frequency.
tokenizer = Tokenizer(num_words=11)
tokenizer.fit_on_texts(sentences)

# Print the word index that the tokenizer built from the corpus.
word_index = tokenizer.word_index
print(word_index)

In [None]:
# !pip install spacy

import spacy
import pandas as pd

# spacy.cli.download("en_core_web_md")

# Load the English spaCy model "en_core_web_md".
nlp = spacy.load("en_core_web_md")

# Run the pipeline on one example sentence.
document = nlp("EEBDA is SO much fun!")

# One row per token: the token text next to its lemma ("Base").
pd.DataFrame(
    {
        "Token": [token.text for token in document],
        "Base": [token.lemma_ for token in document],
    }
)

In [None]:
embed = nlp("dog")

# Show the first 10 entries of the embedding vector.
embed.vector[:10]

In [None]:
# Process the two words whose similarity is to be compared.
doc1, doc2 = nlp("dog"), nlp("cat")

# Similarity score of the two single-word documents.
doc1.similarity(doc2)

In [None]:

# Load the case-study data. The files are read without a header row
# (header=None), and only the first 1000 rows of the training file are used.
# Plain relative file names replace the former ".\\..." spelling so the cell
# also runs on Linux/macOS, where a backslash is not a path separator.
df_train = pd.read_csv("case4_train.csv", header=None, nrows=1000)
df_test = pd.read_csv("case4_test.csv", header=None)

# Preview the first rows of the test set.
df_test.head()

In [None]:
# Call info() so the summary is printed; without the parentheses the cell
# only displays the bound method object instead of running it.
df_train.info()

In [None]:
# !pip install torch
# !pip install farm

import torch # machine learning "framework"
from farm.modeling.tokenization import Tokenizer
from farm.data_handler.processor import TextClassificationProcessor
from farm.data_handler.data_silo import DataSilo
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.optimization import initialize_optimizer
from farm.train import Trainer


In [None]:

# Load the tokenizer for the pretrained "bert-base-uncased" model.
# Note: `Tokenizer` is FARM's class here; the FARM import replaced the Keras
# `Tokenizer` imported in the first cell, and `tokenizer` is rebound as well.
tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-uncased")



In [None]:
# Labels corresponding to "negative", "neutral" and "positive".
LABEL_LIST = ["0", "2", "4"]

# Define the classification task.
processor = TextClassificationProcessor(
    tokenizer=tokenizer,    # here: the "bert-base-uncased" tokenizer
    max_seq_len=128,        # samples are truncated after this many tokens
    # Directory with the stored tweets. "." replaces the Windows-only r".\\"
    # so the current directory is also found on Linux/macOS.
    data_dir=".",
    label_list=LABEL_LIST,  # "0", "2", "4"
    metric="f1_macro",      # used for evaluation
    # Column holding the training labels (0, 2, 4).
    # NOTE(review): "lable" looks like a misspelling of "label", but it must
    # match the column header in the data file -- confirm before changing.
    label_column_name="lable",
)

In [None]:
# Batch size: the number of samples processed before the model is updated.
BATCH_SIZE = 32

# The DataSilo generates and stores the PyTorch DataLoader objects for the
# train, dev and test datasets, based on the processor defined earlier.
data_silo = DataSilo(processor=processor, batch_size=BATCH_SIZE)

# Load the language model under the same name as the tokenizer.
language_model = LanguageModel.load("bert-base-uncased")


In [None]:

# Text-classification head sized to the number of entries in LABEL_LIST.
prediction_head = TextClassificationHead(
    num_labels=len(LABEL_LIST),
)


In [None]:

# Probability that a value in the embeddings returned by the language model
# is zeroed. Helps prevent the model from overfitting.
EMBEDS_DROPOUT_PROB = 0.1

# Use the CPU, since not everybody has a GPU installed.
device = "cpu"

# Combine the language model and the prediction head into a single model.
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
    # How to extract the embeddings from the final layer of the language
    # model: with "per_sequence", a single embedding is extracted to
    # represent the full input sequence.
    lm_output_types=["per_sequence"],
    device=device,
)


In [None]:

# Learning rate: determines the step size at each iteration while moving
# towards a minimum of the loss function.
LEARNING_RATE = 2e-5

# Number of complete passes over the entire training dataset.
N_EPOCHS = 1

# Set up the optimizer and learning-rate schedule; n_batches is taken from
# the length of the "train" loader held by the data silo.
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    device=device,
    learning_rate=LEARNING_RATE,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=N_EPOCHS,
)




In [None]:

# Zero, because no GPU is used.
N_GPU = 0

# Bundle the model, optimizer, data and schedule for training.
trainer = Trainer(
    model=model, optimizer=optimizer, data_silo=data_silo,
    epochs=N_EPOCHS, n_gpu=N_GPU, lr_schedule=lr_schedule, device=device,
)

# Run the training; `model` is rebound to the value returned by train().
model = trainer.train()

In [None]:
from farm.infer import Inferencer
from pprint import PrettyPrinter

# Wrap the trained model and its processor for text classification on raw
# text, without a GPU (gpu=False).
infer_model = Inferencer(
    processor=processor,
    model=model,
    task_type="text_classification",
    gpu=False,
)

# A single example, passed as a list of {"text": ...} dicts.
basic_texts = [{"text": "EEBDA is such a great course!"}]
result = infer_model.inference_from_dicts(dicts=basic_texts)

# Pretty-print the inference result.
PrettyPrinter().pprint(result)


```{bibliography}
```