In [15]:
# all necessary imports

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords

import os

In [16]:
# additional functions

# Flat list of stop words to strip from documents: the English list first,
# followed by the Russian list.
garbage_words = [
    *stopwords.words('english'),
    *stopwords.words('russian'),
]


def ClearText(text, garbage_words):
    """Lower-case ``text`` and remove every token listed in ``garbage_words``.

    Parameters
    ----------
    text : str
        Raw document text. It is lower-cased and split on whitespace.
    garbage_words : iterable of str
        Tokens to drop. They are compared against the lower-cased tokens, so
        entries containing upper-case characters never match.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' when nothing is left).
    """
    # Membership tests against a list are a linear scan per token; build a set
    # once per call so each lookup is O(1). Sets passed by the caller are reused.
    if not isinstance(garbage_words, (set, frozenset)):
        garbage_words = set(garbage_words)
    return ' '.join(word for word in text.lower().split() if word not in garbage_words)


def TrainRandomForestClassifier(dataframe, ngram_range=(1, 3), max_features=50000,
                                n_estimators=100, random_state=None):
    """Fit a word n-gram count vectorizer and a random forest on labelled text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide an ``'X'`` column with the document texts and a
        ``'target'`` column with the class labels.
    ngram_range : tuple of (int, int)
        Smallest and largest n-gram size passed to ``CountVectorizer``.
    max_features : int
        Upper bound on the vocabulary size kept by the vectorizer.
    n_estimators : int
        Number of trees in the forest.
    random_state : int or None
        Seed forwarded to ``RandomForestClassifier``. ``None`` (the default)
        keeps the previous unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    tuple
        ``(vectorizer, model)`` -- the fitted ``CountVectorizer`` first, then the
        fitted ``RandomForestClassifier``.

    Raises
    ------
    ValueError
        Propagated from ``CountVectorizer.fit_transform`` when no vocabulary
        can be built, e.g. when every document is empty.
    """
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        ngram_range=ngram_range,
        max_features=max_features,
    )

    # fit_transform returns a scipy sparse matrix. The forest accepts sparse
    # input directly, so it is deliberately NOT densified with .toarray():
    # a dense (n_documents x max_features) array can exhaust memory.
    features = vectorizer.fit_transform(dataframe['X'])

    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(features, dataframe['target'])
    return vectorizer, model


def DepthFirstSearch(cur_folder, doc_types, file_names, doc_texts, name='', path='.'):
    """Recursively collect the files stored in the leaf folders of a directory tree.

    The type label of a file is the '-'-joined chain of folder names leading to
    it. Only folders that contain no sub-folders contribute files; entries that
    sit next to sub-folders are skipped.

    Parameters
    ----------
    cur_folder : str
        Name of the folder being visited. Pass ``''`` for the root so it adds
        nothing to the type label.
    doc_types, file_names, doc_texts : list
        Output lists, extended in place with one item per collected file
        (type label, bare file name, and an empty text placeholder).
    name : str
        Type label accumulated from the ancestor folders.
    path : str
        Directory to scan. The default ``'.'`` scans the current working
        directory, which is what callers that ``os.chdir`` into the dataset
        folder before the call expect.
    """
    cur_name = f'{name}-{cur_folder}' if name else cur_folder

    # Sorted so the collected order does not depend on the filesystem.
    entries = sorted(os.listdir(path))
    # Ask the filesystem what is a directory instead of guessing from a '.' in
    # the name: extension-less files are not folders, and folders may contain dots.
    folders = [entry for entry in entries if os.path.isdir(os.path.join(path, entry))]

    # Recurse through explicit paths rather than os.chdir, so the process
    # working directory is never changed (and cannot be left wrong on error).
    for folder in folders:
        DepthFirstSearch(folder, doc_types, file_names, doc_texts, cur_name,
                         os.path.join(path, folder))

    if not folders:
        for entry in entries:
            doc_types.append(cur_name)
            file_names.append(entry)
            doc_texts.append('')  # TODO: extract the text layer of the document
        

In [17]:
# read input

doc_types, file_names, doc_texts = [], [], []

dataset_path = 'for training'

# DepthFirstSearch scans the current working directory, so step into the
# dataset folder for the scan. Always return to the starting directory
# afterwards, even if the scan raises; otherwise re-running this cell fails
# because the relative dataset path no longer resolves.
start_dir = os.getcwd()
os.chdir(dataset_path)
try:
    DepthFirstSearch('', doc_types, file_names, doc_texts)
finally:
    os.chdir(start_dir)

df = pd.DataFrame({'file_name': file_names, 'doc_type': doc_types, 'doc_text' : doc_texts})
print(df)

               file_name            doc_type doc_text
0              act-1.pdf      act-first page         
1              act-2.pdf      act-first page         
2              act-3.pdf      act-first page         
3          invoice-1.pdf  invoice-first page         
4          invoice-2.pdf  invoice-first page         
5            other-1.pdf               other         
6             upd1-1.pdf      upd-first page         
7  waybill-invoice-2.pdf             waybill         


In [18]:
# modify input

# Strip stop words from every document and rename the columns to the names the
# training function expects ('X' for the text, 'target' for the label).
# The cleaned column is built with assign() rather than a per-row
# df['doc_text'][i] = ... loop: that chained-indexing assignment is not
# guaranteed to write back into df.
df = df.assign(
    doc_text=df['doc_text'].map(lambda text: ClearText(text, garbage_words))
).rename(columns={'doc_type' : 'target', 'doc_text' : 'X'})
print(df)

               file_name              target X
0              act-1.pdf      act-first page  
1              act-2.pdf      act-first page  
2              act-3.pdf      act-first page  
3          invoice-1.pdf  invoice-first page  
4          invoice-2.pdf  invoice-first page  
5            other-1.pdf               other  
6             upd1-1.pdf      upd-first page  
7  waybill-invoice-2.pdf             waybill  


In [19]:
# split into training (0.6 of dataframe) validation (0.2 of dataframe) and test (0.2 of dataframe)

# Fixed seed so that a kernel restart reproduces the same split.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)

# Slice the shuffled frame positionally instead of calling np.split on a
# DataFrame, which relies on the deprecated DataFrame.swapaxes.
train_end = int(0.6 * len(shuffled))
validate_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]


In [21]:
# train the model

# TrainRandomForestClassifier returns (vectorizer, model) in that order;
# unpack in the same order so each name holds the object it claims to.
vectorizer, model = TrainRandomForestClassifier(train)

In [None]:
# validate the model

In [None]:
# write model to file


In [None]:
# assess accuracy of the model
