In [4]:
# all necessary includes

import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords

import os
import nltk
# nltk.download('stopwords')

In [None]:
# additional functions

garbage_words = stopwords.words('english') + stopwords.words('russian')


def ClearText(text, garbage_words):
    result = [word for word in text.lower().split() if word not in garbage_words]
    return ' '.join(result)


def TrainRandomForestClassifier(dataframe, n_estimators=300):
    ngram_range = (1, 3)
    max_features = 50000

    vectorizer = CountVectorizer(analyzer = "word",
                                tokenizer = None,
                                preprocessor = None,
                                stop_words = None, 
                                ngram_range = ngram_range,
                                max_features = max_features
                                )
    X_train, Y_train = dataframe['X'], dataframe['target']
    
    
    #print(X_train.shape)
    X_train = vectorizer.fit_transform(X_train)
    #print(X_train.shape)
    #print(type(X_train))
    # X_train = X_train.toarray()

    model = RandomForestClassifier(n_estimators=n_estimators)
    model = model.fit(X_train, Y_train)
    return vectorizer, model


def DepthFirstSearch(cur_folder, doc_types, file_names, doc_texts, name=''):
    files = os.listdir()
    cur_name = name[:]
    if len(cur_name) != 0:
        cur_name += '-'
    cur_name += cur_folder
    if 'text_layer' in files:
        files.remove('text_layer')
        
    folders = [file for file in files if '.' not in file]
    
    for folder in folders:
        os.chdir(folder)
        DepthFirstSearch(folder, doc_types, file_names, doc_texts, cur_name)
        os.chdir('..')
    if len(folders) == 0:
        for file in files:
            text_layer = ''
            if 'text_layer' in os.listdir():
                try:
                    with open('./text_layer/' + file.split('.')[0] + '.txt') as f:
                        text_layer = f.read()
                except:
                    NOT_FOUND.append((file, cur_name))
            if len(text_layer) > 0:
                doc_texts.append(text_layer)
                doc_types.append(cur_name)
                file_names.append(file)
        

In [3]:
#os.chdir('..')
print(os.listdir())

['.git', '.gitignore', '.ipynb_checkpoints', 'DOCUMENT_CLASSIFIER.ipynb', 'for training', 'models', 'outlook message extractor.ipynb', 'README.md']


In [6]:
# read input
NOT_FOUND = []
doc_types, file_names, doc_texts = [], [], []

dataset_path = 'for training'
os.chdir(dataset_path)
DepthFirstSearch('', doc_types, file_names, doc_texts)
os.chdir('..')
df = pd.DataFrame({'file_name': file_names, 'doc_type': doc_types, 'doc_text' : doc_texts})
# print(df)

In [72]:
# part for checking files that weren't found
ext = [x[0].split('.')[-1] for x in NOT_FOUND]
print(set(ext))
formated_not_found = ['file {} at {} not found'.format(x[0], x[1]) for x in NOT_FOUND if '' in x[0].split('.')[-1].lower()]
print(len(formated_not_found))
#print('\n'.join(formated_not_found))

{'JPEG', 'pdf', 'PDF', 'jpg', 'msg'}
49


In [61]:
# modify input

for i in range(len(df['doc_text'])):
    df['doc_text'][i] = ClearText(df['doc_text'][i], garbage_words)
df = df.rename(columns={'doc_type' : 'target', 'doc_text' : 'X'})
# print(df)

In [32]:
# will split dataframe into samples with same proportion of each document class
def split_train_sample(dataframe, fraction=0.7):
    '''
    1) to form arrays for each document class
    2) to split each class by np.split
    3) to shuffle rows in resulting dataframe1 and dataframe2
    '''
    types = set(dataframe['doc_type'])
    print(types)
    dataframes = [dataframe[dataframe['doc_type'] == doc_type] for doc_type in types]
    splited = [np.split(df.sample(frac=1), [int(fraction * len(df))]) for df in dataframes]
    test = [df[0] for df in splited]
    train = [df[1] for df in splited]
    return pd.concat(train).sample(frac=1), pd.concat(test).sample(frac=1)


In [73]:
# split into training (0.6 of dataframe) validation (0.2 of dataframe) and test (0.2 of dataframe)

# train, test = np.split(df.sample(frac=1), [int(0.7 * len(df))])
train, test = split_train_sample(df, fraction=0.7)

In [None]:
# here will be cross validation by n_estimators, mb cross validation for vectorizer is necessary too

n_estimators = [100, 300, 500, 700]
best_accuracy = 0
best_arg = 300

for n in n_estimators:
    vectorizer, model = TrainRandomForestClassifier(train, n_estimators=n)
    

In [74]:
# train the model

vectorizer, model = TrainRandomForestClassifier(train, n_estimators=best_arg)

In [75]:
# write model to file
if 'models' not in os.listdir():
    os.mkdir('models')
with open('./models/doc_classifyer-model', 'wb') as f:
    pickle.dump(model, f)
with open('./models/doc_classifyer-vectorizer', 'wb') as f:
    pickle.dump(vectorizer, f)

In [80]:
with open('./models/doc_classifyer-model', 'rb') as f:
    model = pickle.load(f)
with open('./models/doc_classifyer-vectorizer', 'rb') as f:
    vectorizer = pickle.load(f)

In [81]:
prediction = model.predict(vectorizer.transform(test['X']))
probability = model.predict_proba(vectorizer.transform(test['X']))
accuracy = np.mean(prediction == test['target'])
print(accuracy)
print('mean probability:', np.mean(probability))

0.9012446997674737
mean probability: 0.03225806451612903


In [77]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# print(confusion_matrix(test['target'], prediction))
print(classification_report(test['target'], prediction))
print(accuracy_score(test['target'], prediction))


                                                              precision    recall  f1-score   support

                                            ad-invoice-first       0.93      0.92      0.93      1299
                                            ad-invoice-multi       0.92      0.75      0.82       147
                                          ad-statement-first       0.98      0.99      0.99      1671
                                          ad-statement-multi       0.98      0.91      0.94       216
                                                ad-upd-first       0.89      0.95      0.92      1081
                                                ad-upd-multi       0.92      0.82      0.86       223
                                            ad-waybill-first       0.93      0.95      0.94       408
                                            ad-waybill-multi       1.00      0.70      0.82        53
                                                     rfq-AOG       0.88      0.60

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
# assess accuracy of the model
def calculate_accuracy(ds, confidence_level):
    ds['predict'] = model.predict(vectorizer.transform(ds['X']))
    ds['confidence'] = np.transpose(np.amax(model.predict_proba(vectorizer.transform(ds['X'])), axis=1))

    precision = round(100 * len(ds[(ds['confidence'] >= confidence_level) & (ds['target'] == ds['predict'])]) / len(ds), 2)
    to_validate = round(100 * len(ds[(ds['confidence'] < confidence_level)]) / len(ds), 2)
    return precision, to_validate, confidence_level


precision, to_validate, confidence_level = calculate_accuracy(test, 0.8)

print('Rows in test set: {}'.format(len(ds)))
print('Statistics:')
print('Precision = {}%; Ratio of wrong answers = {}%; Confidence = {}'.format(precision, to_validate, confidence_level))

Rows in test set: 7311
Statistics:
Precision = 77.09%; Ratio of wrong answers = 20.43%; Confidence = 0.8


In [5]:
# test