## Data Engineering

In [1]:
#Importing Packages
import sys
import os
sys.path.append("..")
import pandas as pd
from FasttextClassifier.FasttextClassifier import FasttextClassifier
from sklearn.model_selection import train_test_split
import random
import csv
from pathlib import Path


## Creating FastText training file from Reuters files

In [4]:
# Locate the Reuters-52 files relative to the notebook (like RAW_DATA_PATH, '../Data/...')
# instead of through a user-specific absolute path, so the cell runs on other machines.
# NOTE(review): assumes the notebook is executed from a folder directly under the
# project root that contains `Data/` — confirm.
REUTERS_DATA_DIR = Path('../Data')
REUTERS_COLUMNS = ['label', 'content']

# The files are header-less and tab-separated: first column label, second column text.
train = pd.read_csv(
    REUTERS_DATA_DIR / 'train' / 'reuters_train_52' / 'r52-train-all-terms.txt',
    header=None,
    sep='\t',
    names=REUTERS_COLUMNS,
)
test = pd.read_csv(
    REUTERS_DATA_DIR / 'test' / 'reuters_test_52' / 'r52-test-all-terms.txt',
    header=None,
    sep='\t',
    names=REUTERS_COLUMNS,
)
test.head()

Unnamed: 0,label,content
0,trade,asian exporters fear damage from u s japan rif...
1,grain,china daily says vermin eat pct grain stocks a...
2,ship,australian foreign ship ban ends but nsw ports...
3,gold,western mining to open new gold mine in austra...
4,acq,sumitomo bank aims at quick recovery from merg...


In [5]:
# Put the '__label__' marker in front of every label. Only labels that do not carry
# the marker yet are rewritten, so re-running this cell does not stack the prefix.
LABEL_PREFIX = '__label__'
for frame in (test, train):
    unprefixed = ~frame['label'].str.startswith(LABEL_PREFIX, na=False)
    frame.loc[unprefixed, 'label'] = LABEL_PREFIX + frame.loc[unprefixed, 'label']
train

Unnamed: 0,label,content
0,__label__cocoa,bahia cocoa review showers continued throughou...
1,__label__earn,champion products ch approves stock split cham...
2,__label__acq,computer terminal systems cpml completes sale ...
3,__label__earn,cobanco inc cbco year net shr cts vs dlrs net ...
4,__label__earn,am international inc am nd qtr jan oper shr lo...
...,...,...
6527,__label__jobs,n z unemployment rate pct in december quarter ...
6528,__label__rubber,japan rubber stocks fall in march japan s rubb...
6529,__label__money-fx,south korean won fixed at month high the bank ...
6530,__label__copper,nippon mining lowers copper price nippon minin...


In [6]:
# Save both splits as tab-separated text, without header row or index column.
for frame, target_file in ((train, 'trainFT.txt'), (test, 'testFT.txt')):
    frame.to_csv(target_file, sep='\t', header=None, index=False)

### Creating the model and testing

In [7]:
# Build the classifier from the training file saved in the previous cell
# (presumably training happens inside the constructor — confirm in FasttextClassifier).
ft_model = FasttextClassifier(train_data='trainFT.txt')

In [8]:
# Evaluate on the Reuters test file saved above. The returned triple looks like
# (number of samples, precision, recall) — TODO confirm against FasttextClassifier.fasttext_test.
ft_model.fasttext_test('testFT.txt')

(2568, 0.926791277258567, 0.926791277258567)

In [16]:
# Classify a free-text sentence; the result pairs the predicted label(s) with an array
# of scores (presumably probabilities — confirm in FasttextClassifier.label).
ft_model.label('COVID is bringing an unemployment wave')

(('__label__jobs',), array([0.99069309]))

## Creating FastText model from raw files

In this example, we build the training data from a dataset of raw text files, where each file's label is the name of the folder that contains it.

The dataset must follow this layout:  
<pre>
.  
└── Dataset  
    ├── label1  
    |   └── label_text.txt  
    ├── label2  
    |   ├── firstpost.txt  
    |   ├── happy.txt     
    |   └── secondpost.txt  
    └── label3  
        ├── text0.txt      
        └── second.txt    
</pre>
  

In [17]:

# Folder holding the raw texts, one sub-folder per label (see the layout above).
RAW_DATA_PATH = '../Data/texts'
# Output folder for the files generated from each label folder.
GENERATED_DATA_PATH = 'generated_files'
# Output folder for the single merged file.
MERGED_DATA_PATH ='merged'
# Output folder handed to shuffler. NOTE: unlike the two above, this one ends with '/'.
SHUFFLED_MERGED_DATA_PATH = 'shuffled_merged/'
# Output folder for the final train/test files. The trailing '/' matters: file names
# are appended to this value by plain string concatenation.
DATA_FINAL_PATH = 'data_final/'


In [18]:
def read_all_texts(path_to_folders, outfile_path):
    """Generate one training file per label folder found under ``path_to_folders``.

    Each sub-folder name is used as the label, and the sub-folder is handed to
    ``FasttextClassifier.gen_training`` together with the target
    ``{outfile_path}/{label}``.

    Args:
        path_to_folders: Directory containing one sub-folder per label.
        outfile_path: Directory the generated files are placed in.
    """
    # NOTE(review): the "[ERROR] No training data." message printed when this runs
    # presumably comes from this argument-less constructor — confirm.
    init_model = FasttextClassifier()
    # Sorted so the processing order is reproducible across platforms.
    for folder in sorted(os.listdir(path_to_folders)):
        folder_path = f'{path_to_folders}/{folder}'
        # Only directories are label folders; stray files (e.g. OS metadata) are skipped.
        if not os.path.isdir(folder_path):
            continue
        init_model.gen_training(path=folder_path, label=folder, outfile_path=f'{outfile_path}/{folder}')

In [19]:
# Make sure the output folder is there before generating the per-label files.
generated_dir = Path(GENERATED_DATA_PATH)
if not generated_dir.exists():
    generated_dir.mkdir()
read_all_texts(RAW_DATA_PATH, GENERATED_DATA_PATH)

[ERROR] No training data.


### Merging files

In [20]:
def merge_files(path_to_folder, outfile_path):
    """Concatenate every tab-separated file of a folder into one file named ``merged``.

    Each file in ``path_to_folder`` is read once as a header-less, tab-separated,
    UTF-8 table. The tables are stacked in sorted file-name order and written to
    ``{outfile_path}/merged`` without header or index.

    Args:
        path_to_folder: Directory containing the files to merge.
        outfile_path: Existing directory that receives the ``merged`` file.

    Returns:
        None. Nothing is written when ``path_to_folder`` contains no files.
    """
    frames = [
        pd.read_csv(f'{path_to_folder}/{filename}', sep='\t', header=None, encoding='utf-8')
        for filename in sorted(os.listdir(path_to_folder))
        # Sub-directories cannot be read as tables, so only regular files are merged.
        if os.path.isfile(f'{path_to_folder}/{filename}')
    ]
    # pd.concat rejects an empty list; an empty folder simply produces no output.
    if not frames:
        return
    merged = pd.concat(frames)
    merged.to_csv(f'{outfile_path}/merged', header=False, index=False, sep='\t')

In [21]:
# Create the output folder on first use, then always rebuild the merged file so it
# does not go stale when the generated files change between runs.
if not os.path.exists(MERGED_DATA_PATH):
    os.mkdir(MERGED_DATA_PATH)
merge_files(GENERATED_DATA_PATH, MERGED_DATA_PATH)

### Shuffle the data

In [22]:
def shuffler(path_to_folder, outfile_path, random_state=None):
    """Shuffle the rows of the tab-separated file(s) in a folder and save the result.

    Every file in ``path_to_folder`` is read as a header-less, tab-separated, UTF-8
    table. The rows of all files are combined, shuffled, re-indexed from 0 and
    written to ``{outfile_path}/shuffled_merged.txt`` without header or index.

    Args:
        path_to_folder: Directory containing the file(s) to shuffle.
        outfile_path: Directory for the shuffled file; created if it is missing.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the shuffle
            can be reproduced. ``None`` (the default) gives a different order each run.

    Returns:
        The shuffled ``DataFrame``.

    Raises:
        ValueError: If ``path_to_folder`` contains no files.
    """
    frames = [
        pd.read_csv(f'{path_to_folder}/{filename}', sep='\t', header=None, encoding='utf-8')
        for filename in sorted(os.listdir(path_to_folder))
        if os.path.isfile(f'{path_to_folder}/{filename}')
    ]
    if not frames:
        raise ValueError(f'No files to shuffle in {path_to_folder!r}')

    shuffled = (
        pd.concat(frames, ignore_index=True)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # Write before returning, and make sure the target folder exists first.
    os.makedirs(outfile_path, exist_ok=True)
    shuffled.to_csv(
        os.path.join(outfile_path, 'shuffled_merged.txt'),
        header=False,
        index=False,
        sep='\t',
    )
    return shuffled

In [23]:
# shuffler returns the shuffled DataFrame; keep it for the train/test split below.
shuffled_merged=shuffler(MERGED_DATA_PATH, SHUFFLED_MERGED_DATA_PATH)

### Split train and test data

In [24]:
# Column 1 is used as X and column 0 as y (presumably text and label — confirm against
# the generated files). 33% of the rows are held out; the seed is fixed at 42.
features, targets = shuffled_merged[1], shuffled_merged[0]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.33,
    random_state=42,
)

In [25]:
# Rebuild a two-column frame from the training split: y values first, X values second.
train = pd.DataFrame(X_train)
train.insert(loc=0, column='', value=y_train, allow_duplicates=True)

# Save the training file, creating the output folder on first use.
final_dir = Path(DATA_FINAL_PATH)
if not final_dir.exists():
    final_dir.mkdir()
train.to_csv(f'{DATA_FINAL_PATH}train_data.txt', sep='\t', header=None, index=False)

In [26]:
# Rebuild a two-column frame from the test split: y values first, X values second.
test = pd.DataFrame(X_test)
test.insert(loc=0, column='', value=y_test, allow_duplicates=True)

# Save the test file next to the training file.
test.to_csv(f'{DATA_FINAL_PATH}test_data.txt', sep='\t', header=None, index=False)

### Load files and create/test model

In [30]:
train_file = DATA_FINAL_PATH + 'train_data.txt'
test_file = DATA_FINAL_PATH + 'test_data.txt'

# Load the raw text of both files (UTF-8) into memory.
final_train_data = Path(train_file).read_text(encoding='utf-8')
final_test_data = Path(test_file).read_text(encoding='utf-8')

# Build a classifier from the training file and evaluate it on the test file.
ft_model = FasttextClassifier(train_data=train_file)
ft_model.fasttext_test(test_file)

(4, 0.5, 0.5)