# Тест библиотеки fastText

In [None]:
!pip install fasttext

In [None]:
import pandas as pd
import fasttext
from sklearn.model_selection import train_test_split
import re
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords

# Let pandas show up to 1000 characters per cell when displaying frames.
pd.set_option('display.max_colwidth', 1000)

In [None]:
# Columns to read from the dataset file.
col_df = [
    'article_link',
    'headline',
    'is_sarcastic',
]

In [None]:
# Read the dataset CSV, keeping only the columns named in col_df.
df = pd.read_csv(
    '/content/drive/MyDrive/dataset/Sarcasm_Headlines_Dataset.csv',
    usecols=col_df,
)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Patterns are compiled once at definition time; clean_text is applied to
# every row of a column, so the pattern lookup is hoisted out of the call.
# The input is lower-cased first, so only a-z needs to be allowed here.
_DISALLOWED_CHARS = re.compile(r'[^\sa-z0-9@\[\]]')
_TOKENS_WITH_DIGITS = re.compile(r'\w*\d+\w*')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Normalize a piece of text for training.

    Steps, in order:
      1. lower-case the text;
      2. replace every character that is not whitespace, a-z, 0-9, ``@``,
         ``[`` or ``]`` with a space (this removes punctuation);
      3. drop every token that contains at least one digit (the whole
         token is removed, not only the digits);
      4. collapse every run of whitespace -- including single tabs and
         newlines -- into one space and trim both ends.

    Step 4 guarantees the result is a single line, which matters when the
    text is later written one example per line.

    Args:
        text: the string to clean.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    text = text.lower()
    text = _DISALLOWED_CHARS.sub(' ', text)
    text = _TOKENS_WITH_DIGITS.sub('', text)
    return _WHITESPACE_RUN.sub(' ', text).strip()

In [None]:
# Clean every headline in place using clean_text.
df['headline'] = df['headline'].apply(func=clean_text)

In [None]:
# Training and test sets: 20% of the rows are held out for testing,
# with a fixed random_state so the split is the same on every run.
train, test = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Convert both splits into the text format used for training below:
# one example per line, written as "__label__<label> <text>".
def _write_fasttext_file(path, frame):
    """Write the headline/is_sarcastic pairs of ``frame`` to ``path``.

    Each row becomes one line of the form ``__label__<is_sarcastic> <headline>``.
    """
    # The encoding is explicit so the file does not depend on the platform's
    # default encoding; fastText reads its input as UTF-8.
    with open(path, 'w', encoding='utf-8') as f:
        # writelines takes an iterable of lines, so the whole split is
        # handed over in a single call.
        f.writelines(
            f'__label__{each_label} {each_text}\n'
            for each_text, each_label in zip(frame['headline'], frame['is_sarcastic'])
        )


_write_fasttext_file('train.txt', train)
_write_fasttext_file('test.txt', test)

In [None]:
!head -n 10 train.txt

### Модель 1

In [None]:
# Baseline: supervised model trained on train.txt with default settings.
model1 = fasttext.train_supervised(input='train.txt')

In [None]:
def print_results(sample_size, precision, recall):
    """Print evaluation metrics, one ``name = value`` pair per line.

    In this notebook it is called with the unpacked result of
    ``model.test(...)``.

    Args:
        sample_size: number of evaluated examples; printed unchanged.
        precision: precision score; printed rounded to two decimals.
        recall: recall score; printed rounded to two decimals.
    """
    print(f'sample_size = {sample_size}')
    print(f'precision = {round(precision, 2)}')
    # The label used to be misspelled as "recal".
    print(f'recall = {round(recall, 2)}')

In [None]:
# Evaluate model 1 on test.txt and print the returned metrics.
model1_metrics = model1.test('test.txt')
print_results(*model1_metrics)

sample_size = 5342
precision = 0.85
recal = 0.85


### Модель 2

In [None]:
# Same as the baseline, but trained for 25 epochs.
model2 = fasttext.train_supervised(input='train.txt', epoch=25)

In [None]:
# Evaluate model 2 on test.txt and print the returned metrics.
model2_metrics = model2.test('test.txt')
print_results(*model2_metrics)

sample_size = 5342
precision = 0.83
recal = 0.83


### Модель 3

In [None]:
# 10 epochs with the learning rate set to 1.0.
model3 = fasttext.train_supervised(input='train.txt', lr=1.0, epoch=10)

In [None]:
# Evaluate model 3 on test.txt and print the returned metrics.
model3_metrics = model3.test('test.txt')
print_results(*model3_metrics)

sample_size = 5342
precision = 0.83
recal = 0.83


### Модель 4

In [None]:
# Same settings as model 3, additionally using word n-grams of length 2.
model4 = fasttext.train_supervised(
    input='train.txt',
    lr=1.0,
    epoch=10,
    wordNgrams=2,
)

In [None]:
# Evaluate model 4 on test.txt and print the returned metrics.
model4_metrics = model4.test('test.txt')
print_results(*model4_metrics)

sample_size = 5342
precision = 0.86
recal = 0.86


### Модель 5

In [None]:
# Let fastText search for hyperparameters automatically.
# NOTE(review): the autotune validation file is test.txt, the same file the
# next cell evaluates on, so the reported score is tuned on the evaluation
# data and is likely optimistic. A separate validation split would avoid this.
model5 = fasttext.train_supervised(
    input='train.txt',
    autotuneValidationFile='test.txt',
)

In [None]:
# Evaluate model 5 on test.txt and print the returned metrics.
model5_metrics = model5.test('test.txt')
print_results(*model5_metrics)

### Модель 6

In [None]:
# Automatic hyperparameter search, optimizing the F1 score of __label__1.
# NOTE(review): the autotune validation file is test.txt, the same file the
# next cell evaluates on, so the reported score is tuned on the evaluation
# data and is likely optimistic. A separate validation split would avoid this.
model6 = fasttext.train_supervised(
    input='train.txt',
    autotuneMetric="f1:__label__1",
    autotuneValidationFile='test.txt',
)

In [None]:
# Evaluate model 6 on test.txt and print the returned metrics.
model6_metrics = model6.test('test.txt')
print_results(*model6_metrics)

In [None]:
# Save the result: write model 6 to the file 'optimized.model'
# in the current working directory.
model6.save_model('optimized.model')

In [None]:
# Load the model back from the file written by the save step above.
model = fasttext.load_model("optimized.model")
# "..." is a placeholder input; replace it with the text to classify.
# As the last expression of the cell, the prediction is displayed as output.
model.predict("...")