### Install Packages

In [None]:
# ! pip install fasttext

In [None]:
import os

# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
import numpy as np
import pandas as pd
import os
import re
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils import resample

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)

from IPython.core.debugger import set_trace
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# rather than only the last one.
InteractiveShell.ast_node_interactivity = "all"

### Clean & Pre-process Data

In [None]:
# Load the IMDB movie-review CSV from the Kaggle input directory.
df_data = pd.read_csv(
    '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',
)

In [None]:
# Peek at the first row to check which columns were loaded.
df_data.head(1)

### Create FastText Training Data

In [None]:
# Prefix each sentiment value with the ``__label__`` marker that fastText uses
# to recognise labels in a supervised training file.
# Values that already carry the marker are left untouched, so re-running this
# cell does not produce ``__label____label__...`` labels.
df_data['sentiment'] = df_data['sentiment'].map(
    lambda x: str(x) if str(x).startswith('__label__') else '__label__' + str(x)
)

In [None]:
# Show the first row again to check the sentiment column after the label rewrite.
df_data.head(1)

In [None]:
# Train/test split: hold out 20% of the rows for testing (no separate
# validation set is created); the fixed seed keeps the split reproducible.
df_data_train, df_data_test = train_test_split(
    df_data,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Dump both splits into the current working directory as tab-separated text,
# without the header row and without the index column.
df_data_train.to_csv(
    path_or_buf='data_train.txt',
    sep='\t',
    header=False,
    index=False,
)
df_data_test.to_csv(
    path_or_buf='data_test.txt',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Report the (rows, columns) shape of the train and test frames.
print(
    'Shapes:',
    f'Launch: {df_data_train.shape, df_data_test.shape}',
    sep='\n',
)

### Fit Model

In [None]:
# Train a supervised fastText classifier on the training split.
# NOTE(review): '../working/' only points at the file written by the notebook
# when the current directory is itself named 'working' (as on Kaggle) - confirm
# before running elsewhere.
model = fasttext.train_supervised(
    input='../working/data_train.txt',
    lr=0.9,
    epoch=25,
)

In [None]:
# calculate accuracy by predicting individual labels
def get_model_metrics(model, testfile):
    """Score a classifier on a labelled text file, one line at a time.

    Every line of ``testfile`` that contains ``__label__positive`` or
    ``__label__negative`` is treated as one sample: the marker is removed,
    the remaining text is passed to ``model.predict`` and the top predicted
    label is compared with the label found on the line. Lines that carry
    neither marker are skipped.

    Args:
        model: Object whose ``predict(text)`` returns a pair
            ``(labels, probabilities)`` where each label has the form
            ``__label__<name>``.
        testfile: Path of the text file to evaluate; it is read as UTF-8.

    Returns:
        dict: ``'Accuracy'``, ``'Precision'`` and ``'Recall'``. Precision and
        recall are computed for the ``'positive'`` class.
    """
    # (marker as it appears in the file, plain class name); the positive
    # marker is tested first, matching the original if/elif order.
    label_markers = (
        ('__label__positive', 'positive'),
        ('__label__negative', 'negative'),
    )
    results = {}
    y_true = []
    y_pred = []

    # Decode explicitly as UTF-8 so reading does not depend on the platform's
    # locale encoding.
    with open(testfile, encoding='utf-8') as f:
        for sample in f:
            sample = sample.replace('\n', '')

            for marker, name in label_markers:
                if marker in sample:
                    label_actual = name
                    sample = sample.replace(marker, '')
                    break
            else:
                # No label on this line: skip it instead of silently reusing
                # the previous line's label (or failing with a NameError when
                # it is the very first line).
                continue

            predicted_labels, _ = model.predict(sample)
            # Keep only the top prediction and drop its '__label__' prefix.
            label_pred = predicted_labels[0].split('__label__')[1]

            y_true.append(label_actual)
            y_pred.append(label_pred)

    y_true, y_pred = np.array(y_true), np.array(y_pred)
    results['Accuracy'] = accuracy_score(y_true, y_pred)
    precision, recall, _, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', pos_label='positive')
    results['Precision'] = precision
    results['Recall'] = recall
    return results

In [None]:
# Read the test split back as a headerless, tab-separated table and preview
# its first two rows.
df = pd.read_csv(
    '../working/data_test.txt',
    sep='\t',
    header=None,
)
df.head(2)

In [None]:
# Run the model on the first cell (row 0, column 0) of the reloaded test table.
model.predict(df.loc[0, 0])

In [None]:
# Evaluate the trained model on the test split file.
results = get_model_metrics(
    model=model,
    testfile='../working/data_test.txt',
)

In [None]:
# Display the metrics dictionary returned by get_model_metrics.
results

In [None]:
# model.save_model('fasttext_model.bin')