# Notebook to test FastText

### Imports

In [1]:
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split

# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed (presumably to keep the notebook output clean — note that this
# also hides deprecation warnings from pandas / scikit-learn).
warnings.filterwarnings("ignore")

### Functions

In [None]:
def create_fasttext_format(df, file_path):
    """
    Write a labelled DataFrame to a text file in the FastText training format.

    Every row produces one line of the form ``__label__<label> <tweet>``.
    Newline characters inside a tweet are replaced by spaces so that each
    example occupies exactly one line of the output file.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the tweets and labels. It must have a
        'tweet' column holding strings and a 'label' column; labels are
        converted with ``str``.
    file_path : str
        The path where the output file will be saved. The file is created or
        overwritten and encoded as UTF-8.

    Returns
    -------
    None
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        # Walk the two needed columns directly instead of DataFrame.iterrows():
        # iterrows() materialises a Series for every row, which is very slow
        # on large datasets, while the written text is exactly the same.
        f.writelines(
            "__label__" + str(label) + " " + tweet.replace('\n', ' ') + "\n"
            for label, tweet in zip(df['label'], df['tweet'])
        )
            
def create_fasttext_test_format(df, file_path):
    """
    Write the tweets of a DataFrame to a text file, one tweet per line, for FastText testing.

    Newline characters inside a tweet are replaced by spaces so that each
    tweet occupies exactly one line. The output always has one line per row
    of ``df``, in row order, so predictions produced line by line can be
    matched back to the rows by position.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the tweets in a 'tweet' column. Missing
        values (NaN / None) are written as an empty line instead of raising;
        other non-string values are converted with ``str``.
    file_path : str
        The path where the output file will be saved. The file is created or
        overwritten and encoded as UTF-8.

    Returns
    -------
    None
    """
    tweets = df['tweet']
    with open(file_path, 'w', encoding='utf-8') as f:
        for tweet, missing in zip(tweets, tweets.isna()):
            # A missing tweet (e.g. an empty CSV field read as NaN) has no
            # .replace(); emit a blank line rather than crashing or dropping
            # the row, which would shift every later line against its row.
            # NOTE(review): confirm the prediction step emits one output line
            # for a blank input line.
            text = '' if missing else str(tweet).replace('\n', ' ')
            f.write(text + "\n")

### Pipeline

#### 1. Pre-processing the data into the required format

In [25]:
# Read the full training set and discard rows that have no tweet text
random_state = 42
df = pd.read_csv('../data/processed/train_full.csv').dropna(subset=['tweet'])

X = df['tweet']
y = df['label']

# Hold out 10% of the rows for validation (seeded, so the split is reproducible)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_state,
)

# Put tweets and labels back side by side for each split
train_df = pd.concat((X_train, y_train), axis='columns')
val_df = pd.concat((X_val, y_val), axis='columns')

# Export both labelled splits in the FastText training format
create_fasttext_format(train_df, 'fasttext_train.txt')
create_fasttext_format(val_df, 'fasttext_val.txt')

# Export the test tweets, one per line, without labels
df_test = pd.read_csv('../data/processed/test.csv')
create_fasttext_test_format(df_test, 'test.txt')

#### 2. Training and making predictions using the commands in README.md

#### 3. Formatting the predictions file

In [4]:
# Load the predictions file, one entry per line, with surrounding whitespace removed
with open('../results/fasttext_predictions.txt', 'r', encoding='utf-8') as f:
    predictions = [line.strip() for line in f]

# Translate labels: '__label__positive' becomes '1', anything else becomes '-1'
predictions = [('1' if p == '__label__positive' else '-1') for p in predictions]

# Attach the predictions to the test ids by position
result_df = df_test[['id']].assign(prediction=predictions)
print(len(result_df))

# Write the id/prediction pairs to CSV without the index column
result_df.to_csv('../results/fasttext_predictions.csv', index=False)

10000
