## Data Engineering

In [1]:
import pandas as pd
import fasttext_class
from sklearn.model_selection import train_test_split
import random
import csv
from pathlib import Path
import fasttext

# File-system locations used throughout this notebook.
# NOTE: GEN_PATH is an absolute, machine-specific directory. Point it at your own copy
# of the train_data folder before running; every other path is derived from it, so this
# is the only line that has to change.
GEN_PATH = 'C:/Users/franc/OneDrive/Documentos/Projects/text_classification/src/fasttext_class/train_data/'

# Labelled input corpora read by this notebook
RTT_ACC_PATH = GEN_PATH + 'training100accounting.txt'
RTT_FIN_PATH = GEN_PATH + 'train_finances.txt'

# Datasets written by this notebook
MERGED_PATH = GEN_PATH + 'merged.txt'
SHUFFLED_PATH = GEN_PATH + 'merged_and_shuffled.txt'
TRAIN_PATH = GEN_PATH + 'to_train.txt'
TEST_PATH = GEN_PATH + 'to_test.txt'

In [2]:
# Both corpora are tab-separated files without a header row; load them the same way
# (column 0 = __label__ tag, column 1 = text) and stack them into a single frame.
rtts_accounting, rtts_finances = (
    pd.read_csv(corpus_path, sep='\t', header=None)
    for corpus_path in (RTT_ACC_PATH, RTT_FIN_PATH)
)
merged = pd.concat([rtts_accounting, rtts_finances])


In [3]:
# Peek at the first rows of the merged frame (start of the accounting corpus)
merged.head(n=5)

Unnamed: 0,0,1
0,__label__accounting,240 Accountancy : Company Accounts and Analysi...
1,__label__accounting,Accounting for Share Capital 4 3 (II) Amount ...
2,__label__accounting,Recording of Transactions — II 127 Pawan Elec...
3,__label__accounting,Accounting for Share Capital 5 5 Working Note...
4,__label__accounting,Financial Statements of a Company 1 4 1 amoun...


In [4]:
# Peek at the last rows of the merged frame (end of the finances corpus)
merged.tail(n=5)

Unnamed: 0,0,1
95,__label__finances,'Golden economic period' to end Ten years of ...
96,__label__finances,Industrial output falls in Japan Japanese ind...
97,__label__finances,Jobs go at Oracle after takeover Oracle has a...
98,__label__finances,Beijingers fume over parking fees Choking tra...
99,__label__finances,GM issues 2005 profits warning General Motors...


In [5]:
# Persist the merged corpus as a headerless, tab-separated file.
# index=False keeps the row labels (duplicated across the two source files, and not part
# of the data) out of the output, so each line starts with its __label__ tag, the same
# layout the train/test exports use.
merged.to_csv(MERGED_PATH, sep='\t', header=None, index=False)

## Shuffling the occurrences

In [6]:
# Shuffle every row (frac=1) and renumber them 0..n-1.
# The fixed seed makes the shuffle, and every file and split derived from it,
# reproducible when the notebook is re-run from a fresh kernel.
shuffled_merged = merged.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
# Confirm that the two labels are now interleaved
shuffled_merged.head(n=5)

Unnamed: 0,0,1
0,__label__accounting,Accounting for Share Capital 5 5 Working Note...
1,__label__finances,US adds more jobs than expected The US econom...
2,__label__finances,"German economy rebounds Germany's economy, th..."
3,__label__finances,US economy still growing says Fed Most areas ...
4,__label__finances,Asian quake hits European shares Shares in Eu...


In [8]:
# Persist the shuffled corpus as a headerless, tab-separated file.
# index=False leaves the synthetic 0..n-1 row numbers out of the output, so each line
# starts with its __label__ tag, the same layout the train/test exports use.
shuffled_merged.to_csv(SHUFFLED_PATH, sep='\t', header=None, index=False)

## Creating train and test columns

In [9]:
# Hold out 33% of the rows for evaluation. Column 1 (text) is the feature and
# column 0 (__label__ tag) the target; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    shuffled_merged[1],
    shuffled_merged[0],
    test_size=0.33,
    random_state=42,
)

## Creating train file

In [10]:
# Promote the training texts from a Series to a one-column DataFrame
train = X_train.to_frame()

In [11]:
# Put the label column (named '') in front of the text column.
# The frame is rebuilt from the split outputs instead of mutating `train` in place, which
# keeps this cell idempotent: re-running it no longer adds another duplicate label column.
train = pd.concat([y_train.rename(''), X_train], axis=1)

In [12]:
# Write the training split as tab-separated label/text rows, with no header and no index column
train.to_csv(
    TRAIN_PATH,
    sep='\t',
    header=None,
    index=False,
)

## Creating test file

In [13]:
# Promote the held-out texts from a Series to a one-column DataFrame
test = X_test.to_frame()

In [14]:
# Put the label column (named '') in front of the text column.
# The frame is rebuilt from the split outputs instead of mutating `test` in place, which
# keeps this cell idempotent: re-running it no longer adds another duplicate label column.
test = pd.concat([y_test.rename(''), X_test], axis=1)

In [15]:
# Write the held-out split as tab-separated label/text rows, with no header and no index column
test.to_csv(
    TEST_PATH,
    sep='\t',
    header=None,
    index=False,
)

## Loading the files

In [16]:
# Locate the training split inside the data directory and load its full contents as UTF-8 text
data_folder = Path(GEN_PATH)
file_to_open = data_folder.joinpath("to_train.txt")
train_rtt = file_to_open.read_text(encoding='utf-8')


In [17]:
# Locate the held-out split inside the data directory and load its full contents as UTF-8 text
data_folder = Path(GEN_PATH)
file_to_open2 = data_folder.joinpath("to_test.txt")
test_rtt = file_to_open2.read_text(encoding='utf-8')

## Creating the model

In [18]:
# Build the classifier through the project-local fasttext_class wrapper, handing it the
# path of the training split as a plain string.
# NOTE(review): presumably the constructor trains a fastText model on that file; confirm in fasttext_class.
model_1=fasttext_class.fasttext_c(train_data=str(file_to_open))

In [22]:
# Evaluate the model on the held-out split (to_test.txt).
# NOTE(review): the returned tuple looks like (sample count, precision, recall); confirm against the wrapper.
model_1.fasttext_test(str(file_to_open2))

(66, 1.0, 1.0)

In [32]:
# Spot check on a hand-written sentence; the result pairs the predicted label with
# what appears to be its score (TODO confirm it is a probability).
model_1.fasttext_labeler('Fellipe is borrowing money with a low interest rate')

(('__label__finances',), array([0.99961621]))

In [48]:
# Spot check on a second hand-written sentence.
# NOTE(review): the recorded score for this prediction is about 0.75.
model_1.fasttext_labeler("Pay your bill right now and avoid fees!")

(('__label__accounting',), array([0.74799633]))