In [24]:
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split

# Suppress all warnings: with no category/module arguments this filter
# ignores every Warning subclass raised from any module from this point on.
# NOTE(review): this also hides deprecation and data-related warnings emitted
# by the libraries used below; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Step 1: Create a text file from the DataFrame
def create_fasttext_format(df, file_path):
    """Write labelled tweets to a text file, one example per line.

    Every row of ``df`` produces exactly one line of the form
    ``__label__<label> <tweet>``.

    Args:
        df: DataFrame providing a ``label`` column and a ``tweet`` column.
        file_path: Destination path; an existing file is overwritten.

    Notes:
        * Carriage returns and line feeds inside a tweet are replaced by
          spaces so that an example can never span several lines.
        * A missing tweet (NaN/None) is written as empty text instead of
          raising ``AttributeError``; non-string values are converted with
          ``str``.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        # Walk the two columns directly: this avoids building a Series per
        # row, which is what ``iterrows`` does.
        for label, tweet in zip(df['label'], df['tweet']):
            text = '' if pd.isna(tweet) else str(tweet)
            # Handle Windows (CRLF), old-Mac (CR) and Unix (LF) line breaks.
            text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
            f.write("__label__" + str(label) + f" {text}\n")
            
# Function to create the test file in the required FastText format
def create_fasttext_test_format(df, file_path):
    """Write unlabelled tweets to a text file, one tweet per line.

    Exactly one line is written per row of ``df`` and in row order, so line
    ``i`` of the output always corresponds to row ``i`` of ``df``.

    Args:
        df: DataFrame providing a ``tweet`` column.
        file_path: Destination path; an existing file is overwritten.

    Notes:
        * Carriage returns and line feeds inside a tweet are replaced by
          spaces so that a tweet can never span several lines.
        * A missing tweet (NaN/None) is written as an empty line rather than
          raising ``AttributeError`` or being skipped, which keeps the line
          count equal to the row count.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        # Only the text column is needed, so iterate it directly instead of
        # materialising every row with ``iterrows``.
        for tweet in df['tweet']:
            text = '' if pd.isna(tweet) else str(tweet)
            # Handle Windows (CRLF), old-Mac (CR) and Unix (LF) line breaks.
            text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
            f.write(f"{text}\n")



In [25]:
random_state = 42

# Load the full training set and discard rows that have no tweet text.
df = pd.read_csv('../data/processed/train_full.csv').dropna(subset=['tweet'])

# Inputs (tweet text) and targets (labels).
X, y = df['tweet'], df['label']

# Hold out 10% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_state,
)

# Put each split's tweets and labels back side by side (aligned on the index).
train_df = pd.concat([X_train, y_train], axis='columns')
val_df = pd.concat([X_val, y_val], axis='columns')

In [26]:
# Write the training split (the 90% not held out for validation) as
# "__label__<label> <tweet>" lines.
create_fasttext_format(train_df, 'fasttext_train.txt')

In [27]:
# Write the held-out 10% validation split in the same labelled format.
create_fasttext_format(val_df, 'fasttext_val.txt')

In [28]:
# Write every training row (train + validation together) in the same labelled
# format. NOTE(review): presumably used to fit the final model on all data.
create_fasttext_format(df, 'fasttext_full.txt')

In [29]:
# Load the test set. Rows must not be dropped or reordered after this point:
# predictions are later attached to the `id` column purely by position.
df_test = pd.read_csv('../data/processed/test.csv')

In [30]:
# Write one tweet per line, without a label prefix.
# NOTE(review): presumably the input for a fastText predict run performed
# outside this notebook, which produces predictions.txt.
create_fasttext_test_format(df_test, 'test.txt')

In [31]:
# Read the predictions: one line per test row, in the same order as df_test.
# NOTE(review): predictions.txt is not created anywhere in this notebook; it
# is presumably the output of a fastText predict run on test.txt.
with open('predictions.txt', 'r', encoding='utf-8') as f:
    predictions = [line.strip() for line in f]

# Predictions are matched to ids by position only, so a different line count
# means the two files are out of sync. Fail here with an explicit message
# rather than relying on pandas' generic length error further down.
if len(predictions) != len(df_test):
    raise ValueError(
        f"predictions.txt has {len(predictions)} lines but df_test has "
        f"{len(df_test)} rows; predictions cannot be aligned with ids"
    )

# Substitute labels: '__label__positive' -> '1', anything else -> '-1'.
# Only the first whitespace-separated token is compared, so lines that carry
# extra fields after the top label (e.g. a probability or further labels)
# are still recognised; an empty line yields [] and therefore maps to '-1'.
predictions = [
    '1' if p.split()[:1] == ['__label__positive'] else '-1'
    for p in predictions
]

# Combine IDs with predictions
result_df = df_test[['id']].copy()
result_df['prediction'] = predictions

In [32]:
# Sanity check: number of rows in the result frame (one per test id).
len(result_df)

10000

In [33]:
# Save the id/prediction pairs to a new CSV file, omitting the DataFrame index.
# NOTE(review): the ../results directory must already exist; to_csv does not
# create missing parent directories.
result_df.to_csv('../results/fastText_full_5min.csv', index=False)