<b> Code example of using the fastText library. For more details, consult the documentation: </b> https://fasttext.cc/docs/en/support.html

In [None]:
import fasttext
import pprint
import inspect

<b> fasttext.train_unsupervised takes a plain-text (.txt) file as input and has good default arguments. You can change the parameters, as shown in the commented-out example below. The output is a model object that can produce word vectors and find neighboring words.
For text classification, also check out fastText's supervised training (fasttext.train_supervised).</b>

In [None]:
# Training is a single high-level call. Here we keep the default settings
# and only choose the skip-gram architecture.
model = fasttext.train_unsupervised('data/training_data.txt', model='skipgram')
# Playing with the parameters (commented-out example):
#model = fasttext.train_unsupervised('data/training_data.txt', model='skipgram',
#                                    minn=2, maxn=5, dim=300, epoch=1, lr=0.5, thread=4)
# What the parameters in the example mean (defaults as quoted from the fastText tutorial):
#   dim=300        - number of dimensions of each word vector (default 100; the 100-300 range is popular)
#   minn=2, maxn=5 - subwords are all the substrings of a word whose length lies between
#                    minn and maxn (defaults: 3 and 6)
#   epoch=1        - number of passes over the training data (default 5)
#   lr=0.5         - learning rate (default 0.05); the higher it is, the faster the model converges
#                    to a solution, but at the risk of overfitting to the dataset
#   thread=4       - number of worker threads; the tutorial cites 12 as the default, so lower it
#                    if your machine has fewer CPU cores (say 4)

In [None]:
# Collect the model object's public API: every name reported by dir()
# that does not start with an underscore.
methods_attr = list(filter(lambda name: not name.startswith("_"), dir(model)))
pprint.pprint(methods_attr)

In [None]:
# Split the public names into bound methods and everything else
# (properties / plain attributes). Each attribute is looked up only once,
# so property getters are not evaluated twice.
methods, attributes = [], []
for item in methods_attr:
    target = methods if inspect.ismethod(getattr(model, item)) else attributes
    target.append(item)

# Print a plain label, then pretty-print the list itself. Handing an f-string
# to pprint would only echo the quoted string, with no list formatting at all.
print("Methods:")
pprint.pprint(methods)
print("Attributes:")
pprint.pprint(attributes)

In [None]:
# Show the call signature (argument names and defaults) of a few model methods
for method_name in ("get_dimension", "get_analogies", "get_subwords"):
    print(inspect.signature(getattr(model, method_name)))

In [None]:
# Look up the embedding vector the model holds for a single word
model.get_word_vector("father")

In [None]:
# Words whose vectors are nearest to "father"; k is not passed, so the library default applies
model.get_nearest_neighbors("father")

In [None]:
# Inspect how "university" is broken into subwords (character n-grams);
# per the fastText Python API this should return the subwords together with their indices
model.get_subwords("university")

In [None]:
# Analogy query. Per the fastText docs the query vector is A - B + C,
# i.e. father - mother + daughter here; k=1 keeps only the best match.
model.get_analogies("father", "mother", "daughter", k=1)

In [None]:
# Same analogy form (A - B + C per the fastText docs): Berlin - Germany + France; k=1 keeps only the best match
model.get_analogies("Berlin", "Germany", "France", k=1)

In [None]:
# Save the trained model to disk so it can be reloaded later with fasttext.load_model
model.save_model("embedding_1m_word.bin")

<b> Let's try a pretrained model that is much larger.
Please note that it might not fit into your machine's memory.</b>

In [None]:
#download pretrained model
import fasttext.util

In [None]:
# Fetch the pretrained English vectors into the working directory.
# if_exists='ignore' skips the download when cc.en.300.bin is already present.
# NOTE: the file is several GB and has to fit in RAM once loaded.
fasttext.util.download_model('en', if_exists='ignore')
# Load the pretrained model (not the small model saved earlier), so the
# queries below can be compared against the self-trained embeddings.
lg_model = fasttext.load_model('cc.en.300.bin')

In [None]:
# Repeat the nearest-neighbor query on lg_model to compare with the earlier result
lg_model.get_nearest_neighbors('father')

In [None]:
# Repeat the Berlin / Germany / France analogy query on lg_model (best match only, k=1)
lg_model.get_analogies("Berlin", "Germany", "France", k=1)

In [None]:
# Repeat the father / mother / daughter analogy query on lg_model (best match only, k=1)
lg_model.get_analogies("father", "mother", "daughter", k=1)

In [None]:
# Drop the lg_model reference so the memory it holds can be reclaimed before the next section
del lg_model

<b> Let's use fastText word embeddings (here loaded from the cc.en.300.bin model) instead of the tf-idf features we used previously with the random forest classifier </b>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import fasttext
import numpy as np
import joblib

In [None]:
# Load the data from the JSON file. Passing the path lets pandas open the file
# itself and decode it as UTF-8, instead of relying on the platform's locale
# encoding that a bare open(..., 'r') would use.
data = pd.read_json('train_data.json', encoding='utf-8')

# Use only 4400 examples (4000 for training and 400 for testing);
# the fixed random_state keeps the sampled subset reproducible.
data = data.sample(4400, random_state=42)

In [None]:
# Hold out 400 examples for testing; the remaining sampled rows are used for training
texts, labels = data['text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=400,
    random_state=42,
)

# Load the FastText model from disk (this rebinds the name `model`)
model = fasttext.load_model("cc.en.300.bin")

In [None]:
# Function to vectorize text using FastText
def vectorize_text(texts, ft_model=None):
    """Turn each text into one vector by averaging its word vectors.

    Args:
        texts: Iterable of strings. Each string is split on whitespace.
        ft_model: Object providing ``get_word_vector(word)`` and
            ``get_dimension()``. Defaults to the notebook-level ``model``.

    Returns:
        A 2-D numpy array with one row per text. A text without any words
        (empty or whitespace-only) becomes an all-zero row, and an empty
        ``texts`` yields an array of shape ``(0, dimension)``.
    """
    if ft_model is None:
        # Fall back to the model loaded earlier in the notebook.
        ft_model = model
    dim = ft_model.get_dimension()

    vectorized_texts = []
    for text in texts:
        words = text.split()
        if words:
            word_vectors = [ft_model.get_word_vector(word) for word in words]
            text_vector = np.mean(word_vectors, axis=0)
        else:
            # np.mean over an empty list would give a scalar NaN (plus a
            # warning), which breaks np.vstack below and the classifier later.
            text_vector = np.zeros(dim, dtype=np.float32)
        vectorized_texts.append(text_vector)

    if not vectorized_texts:
        # np.vstack raises on an empty sequence; return an empty matrix instead.
        return np.empty((0, dim), dtype=np.float32)
    return np.vstack(vectorized_texts)

In [None]:
# Convert both splits into feature matrices (one averaged word-vector row per text)
X_train_vec, X_test_vec = (vectorize_text(split) for split in (X_train, X_test))

In [None]:
# Fit a 100-tree random forest on the embedding features; the seed makes the run repeatable
forest_params = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train_vec, y_train)

In [None]:
# Predict a label for every held-out text from its embedding features
y_pred = clf.predict(X_test_vec)

In [None]:
# Score the predictions: overall accuracy first, then the per-class report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)