<b> Code example of using the fastText library. For more details, consult the documentation: </b> https://fasttext.cc/docs/en/support.html

In [1]:
import fasttext
import pprint
import inspect
import os

<b> fasttext.train_unsupervised takes a plain-text file as input and has good default arguments. You can change the parameters as shown in the example below. The output is a model object which can produce word vectors and find neighboring words.
Also check out the supervised training mode of fastText (fasttext.train_supervised).</b>

In [2]:
# Directory containing your text files
data_dir = 'data_2'

# Output file to store the combined data
output_file = 'combined_training_data.txt'

# Collect the .txt files up front and sort them: os.listdir() returns entries
# in arbitrary order, so sorting keeps the combined corpus reproducible.
# Listing first also means a missing data_dir fails before the output file
# is truncated.
txt_filenames = sorted(name for name in os.listdir(data_dir) if name.endswith('.txt'))

# Open the output file in write mode (an existing combined file is overwritten)
with open(output_file, 'w', encoding='utf-8') as outfile:
    for filename in txt_filenames:
        file_path = os.path.join(data_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as infile:
            # Stream the file line by line instead of read()-ing it whole,
            # so large corpus files are never held fully in memory.
            outfile.writelines(infile)
        # Newline separator so the last line of one file does not run into
        # the first line of the next.
        outfile.write("\n")

print(f"All text files combined into {output_file}")


All text files combined into combined_training_data.txt


In [3]:
# Training is a single high-level call. Everything is left at its default
# except the model type (skipgram) and the vector size (dim=200).
model = fasttext.train_unsupervised('combined_training_data.txt', model='skipgram', dim=200)
# Playing with the parameters -- an alternative call to experiment with:
#model = fasttext.train_unsupervised('data/training_data.txt', model='skipgram', 
#                                    minn=2, maxn=5, dim=300, epoch=1, lr=0.5, thread=4)
# What the parameters mean:
#   dim    : number of dimensions of each word vector (default 100; the 100-300 range is popular).
#   minn,
#   maxn   : subwords are all the substrings of a word whose length lies between
#            minn and maxn (default: between 3 and 6).
#   epoch  : number of passes over the training data (default 5).
#   lr     : learning rate (default 0.05). The higher it is, the faster the model
#            converges to a solution, but at the risk of overfitting to the dataset.
#   thread : number of training threads. fastText is multi-threaded; its docs cite
#            12 threads as the default. If you have fewer CPU cores (say 4), set it
#            explicitly with thread=4.
#            NOTE(review): the Python binding may choose a different default than
#            12 -- check model.thread after training to confirm.

Read 450M words
Number of words:  996767
Number of labels: 0
Progress: 100.0% words/sec/thread:   32813 lr:  0.000000 avg.loss:  0.508162 ETA:   0h 0m 0s


In [4]:
# Explore the model object: keep every public name (no leading underscore)
# reported by dir(), i.e. the methods and attributes it exposes.
methods_attr = list(filter(lambda name: not name.startswith("_"), dir(model)))
pprint.pprint(methods_attr)

['bucket',
 'dim',
 'epoch',
 'f',
 'get_analogies',
 'get_dimension',
 'get_input_matrix',
 'get_input_vector',
 'get_label_id',
 'get_labels',
 'get_line',
 'get_meter',
 'get_nearest_neighbors',
 'get_output_matrix',
 'get_sentence_vector',
 'get_subword_id',
 'get_subwords',
 'get_word_id',
 'get_word_vector',
 'get_words',
 'is_quantized',
 'label',
 'labels',
 'loss',
 'lr',
 'lrUpdateRate',
 'maxn',
 'minCount',
 'minCountLabel',
 'minn',
 'neg',
 'predict',
 'pretrainedVectors',
 'quantize',
 'save_model',
 'set_args',
 'set_matrices',
 't',
 'test',
 'test_label',
 'thread',
 'verbose',
 'wordNgrams',
 'words',
 'ws']


In [5]:
# Separate methods and attributes
methods, attributes = [], []
for name in methods_attr:
    # Look each member up once: bound methods go to `methods`,
    # everything else (plain values, properties) to `attributes`.
    target = methods if inspect.ismethod(getattr(model, name)) else attributes
    target.append(name)

# Pretty print the methods and attributes.
# The label is printed separately and pprint receives the list itself:
# calling pprint on an f-string only shows the repr of one long string,
# wrapped and quoted, which is hard to read.
print("Methods:")
pprint.pprint(methods, compact=True)
print("Attributes:")
pprint.pprint(attributes, compact=True)

("Methods: ['get_analogies', 'get_dimension', 'get_input_matrix', "
 "'get_input_vector', 'get_label_id', 'get_labels', 'get_line', 'get_meter', "
 "'get_nearest_neighbors', 'get_output_matrix', 'get_sentence_vector', "
 "'get_subword_id', 'get_subwords', 'get_word_id', 'get_word_vector', "
 "'get_words', 'is_quantized', 'predict', 'quantize', 'save_model', "
 "'set_args', 'set_matrices', 'test', 'test_label']")
("Attributes: ['bucket', 'dim', 'epoch', 'f', 'label', 'labels', 'loss', 'lr', "
 "'lrUpdateRate', 'maxn', 'minCount', 'minCountLabel', 'minn', 'neg', "
 "'pretrainedVectors', 't', 'thread', 'verbose', 'wordNgrams', 'words', 'ws']")


In [6]:
# Show the call signature (argument names and defaults) of a few model methods
for bound_method in (model.get_dimension, model.get_analogies, model.get_subwords):
    print(inspect.signature(bound_method))

()
(wordA, wordB, wordC, k=10, on_unicode_error='strict')
(word, on_unicode_error='strict')


In [7]:
# Look up the embedding of a single word; the result is a NumPy array of floats.
model.get_word_vector("father")

array([-3.62398893e-01,  8.12202916e-02, -2.67764688e-01, -7.68128037e-02,
        1.57742396e-01, -1.13313183e-01,  1.52611107e-01,  4.86544907e-01,
       -2.40758568e-01, -4.08467144e-01,  7.24291280e-02, -2.09741294e-01,
        1.19522475e-01, -3.60782683e-01, -3.87418061e-01,  1.45134017e-01,
        4.32168096e-01, -1.11449845e-01,  3.60231032e-03, -1.00929327e-01,
        3.43433142e-01, -1.22957811e-01,  2.84534067e-01,  4.78930622e-01,
        2.55102187e-01,  3.03595923e-02, -6.45152181e-02, -1.30704701e-01,
        1.27844587e-01,  3.29458326e-01,  1.25470012e-01, -4.78381440e-02,
       -3.62820864e-01, -3.10503423e-01, -2.59261847e-01, -2.33823508e-01,
        1.03295691e-01, -2.26096004e-01,  2.47430429e-01,  3.84607971e-01,
       -1.59989163e-01,  1.24242231e-02, -5.98286450e-01, -2.87795991e-01,
       -4.01901960e-01, -1.28270537e-01, -3.98319475e-02,  6.53634012e-01,
       -1.91364170e-03, -3.09041934e-03, -9.84952450e-02,  1.21091910e-01,
       -3.34523976e-01,  

In [8]:
# Words closest to "teacher" in the embedding space, returned as
# (similarity score, word) pairs with the best match first.
# NOTE(review): neighbours such as 'teacher,' or 'teacher.' suggest punctuation
# was not stripped from the training text -- consider cleaning the corpus.
model.get_nearest_neighbors("teacher")

[(0.8769745230674744, 'schoolteacher'),
 (0.8637434840202332, 'teacher,'),
 (0.8260265588760376, 'schoolteacher,'),
 (0.8106196522712708, 'pupil-teacher'),
 (0.8057887554168701, 'teacher.'),
 (0.799347996711731, 'teacher;'),
 (0.7882447242736816, 'teacher"'),
 (0.7877546548843384, 'teacher:'),
 (0.7810304164886475, 'teacher)'),
 (0.7784423232078552, 'schoolteacher.')]

In [9]:
# Character n-grams fastText uses for "university". The result is a pair:
# the list of subword strings ('<' and '>' mark the word boundaries) and an
# array with the integer id of each subword.
model.get_subwords("university")

(['university',
  '<un',
  '<uni',
  '<univ',
  '<unive',
  'uni',
  'univ',
  'unive',
  'univer',
  'niv',
  'nive',
  'niver',
  'nivers',
  'ive',
  'iver',
  'ivers',
  'iversi',
  'ver',
  'vers',
  'versi',
  'versit',
  'ers',
  'ersi',
  'ersit',
  'ersity',
  'rsi',
  'rsit',
  'rsity',
  'rsity>',
  'sit',
  'sity',
  'sity>',
  'ity',
  'ity>',
  'ty>'],
 array([  10637, 2006539, 1369982, 2221226, 2277193, 2330984, 2734796,
        2460663, 2611517, 2064761, 2896076, 2752556, 2348153, 2161736,
        2493280, 1624789, 2268716, 1207329, 2911362, 2990173, 2760605,
        1639134, 2861953, 1927873, 2686432, 2186816, 2464494, 1913761,
        1832339, 2031614, 1754993, 1826723, 1425642, 1717838, 1060559]))

In [10]:
# Analogy query returning the k=3 best candidates as (score, word) pairs.
# NOTE(review): fastText presumably evaluates wordA - wordB + wordC, i.e.
# father - mother + daughter, for which "son" would be the expected answer --
# confirm the argument order against the fastText documentation.
model.get_analogies("father", "mother", "daughter", k=3)

[(0.8165403604507446, '“daughter'),
 (0.8020674586296082, 'daughter,'),
 (0.799014151096344, 'goddaughter')]

In [11]:
# Capital-city analogy with the k=4 best candidates:
# Berlin is to Germany as ? is to France (expected answer: Paris).
model.get_analogies("Berlin", "Germany", "France", k=4)

[(0.7275875806808472, 'Paris'),
 (0.7073453664779663, 'Berlin_'),
 (0.6804660558700562, 'Francfort'),
 (0.6751400232315063, 'Berline')]

In [12]:
# Save the trained model to disk in fastText's binary format so it can be
# reloaded later with fasttext.load_model instead of retraining.
model.save_model("embedding_9m_word.bin")

<b> Let's try a pretrained model that is much larger.
Please note that it might not fit into your machine's memory.</b>

In [14]:
#download pretrained model
import fasttext.util

In [15]:
# Uncomment the next line on the first run to download the pretrained English
# vectors; if_exists='ignore' skips the download when the file is already there.
#fasttext.util.download_model('en', if_exists='ignore')
# Load the pretrained model from the working directory (this needs a lot of RAM).
lg_model = fasttext.load_model('cc.en.300.bin')



In [None]:
# Nearest neighbours of "father" according to the large pretrained model.
lg_model.get_nearest_neighbors('father')

In [None]:
# Same capital-city analogy as with our own model, now on the pretrained
# vectors (top k=3 candidates), so the two models can be compared.
lg_model.get_analogies("Berlin", "Germany", "France", k=3)

In [None]:
# Same family analogy as with our own model, on the pretrained vectors
# (top k=5 candidates).
lg_model.get_analogies("father", "mother", "daughter", k=5)

In [None]:
# Nearest neighbours of "color" according to the large pretrained model.
lg_model.get_nearest_neighbors('color')

In [29]:
# Analogy over "doctor", "man" and "woman" (top k=5 candidates).
# NOTE(review): presumably meant to probe the gender associations the
# pretrained embeddings have absorbed from their training data.
lg_model.get_analogies("doctor", "man", "woman", k=5)

[(0.7013290524482727, 'gynecologist'),
 (0.6668133735656738, 'physician'),
 (0.6592792272567749, 'gynocologist'),
 (0.6535542011260986, 'gynaecologist'),
 (0.6520789861679077, 'OBGYN')]

In [None]:
# Drop the reference to the large pretrained model so its memory can be
# reclaimed before the classification part of the notebook.
del lg_model

<b> Let's use the embedding model we trained as the text representation for the random forest classifier, instead of the TF-IDF features we used previously. </b>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import fasttext
import numpy as np
import joblib

In [None]:
# Load the data from the JSON file.
# pandas opens the path itself and decodes it explicitly as UTF-8, so the
# result no longer depends on the platform's default text encoding and no
# file-handle variable is left behind in the notebook namespace.
data = pd.read_json('train_data.json', encoding='utf-8')

# Use only 4400 examples (4000 for training and 400 for testing);
# random_state makes the subsample reproducible.
data = data.sample(4400, random_state=42)

In [None]:
# Split data into training and testing sets (400 held-out examples, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=400, random_state=42)

# Load the FastText model trained and saved earlier in this notebook.
# The file name must match the one passed to model.save_model(...):
# "embedding_1m_word.bin" is never written by this notebook, so loading it
# fails on a fresh Restart & Run All.
model = fasttext.load_model("embedding_9m_word.bin")

In [None]:
# Function to vectorize text using FastText
def vectorize_text(texts, embedding_model=None):
    """Turn each text into one fixed-size vector by averaging its word vectors.

    Parameters
    ----------
    texts : iterable of str
        Documents to embed; each one is split on whitespace.
    embedding_model : optional
        Object providing ``get_word_vector(word)`` and ``get_dimension()``.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per text and one column per embedding
        dimension. A text without any words yields an all-zero row; an empty
        ``texts`` yields an array of shape ``(0, dim)``.
    """
    embedder = model if embedding_model is None else embedding_model
    dim = embedder.get_dimension()

    vectorized_texts = []
    for text in texts:
        words = text.split()
        if words:
            word_vectors = [embedder.get_word_vector(word) for word in words]
            text_vector = np.mean(word_vectors, axis=0)
        else:
            # np.mean([]) is a NaN scalar (plus a RuntimeWarning) and would
            # make np.vstack fail on mismatched shapes, so fall back to zeros.
            text_vector = np.zeros(dim, dtype=np.float32)
        vectorized_texts.append(text_vector)

    if not vectorized_texts:
        # np.vstack raises on an empty sequence; return an empty matrix instead.
        return np.empty((0, dim), dtype=np.float32)
    return np.vstack(vectorized_texts)

In [None]:
# Vectorize the training and testing text data: each text becomes the mean of
# its word vectors, giving one row per text in the resulting 2-D arrays.
X_train_vec = vectorize_text(X_train)
X_test_vec = vectorize_text(X_test)

In [None]:
# Train a Random Forest classifier on the averaged word vectors
# (100 trees; random_state is fixed so the run is reproducible).
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_vec, y_train)

In [None]:
# Predict a label for every held-out test example
y_pred = clf.predict(X_test_vec)

In [None]:
# Evaluate the model: overall accuracy first, then the per-class report
# produced by classification_report.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))