# Random Forest (with Feature Engineering)

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Custom models
from prepro import readfile, readstring, get_sentence, is_number, extract_words


#Model
from sklearn.ensemble import RandomForestClassifier

#Evalulation
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [24]:
#import data from my github repo
train = readfile("train.txt")
corpus = train.copy()
test = readfile("test.txt")

#create corpus
corpus.extend(test)

In [25]:
words = []
tags = []
for sentence in corpus:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])   

## Tokenization

In [26]:
words=list(words)
n_words = len(set(words))
n_tags = len(set(tags))

print("Number of words in the dataset: ", n_words)
print("Number of tags in the dataset: ", n_tags)

Number of words in the dataset:  27316
Number of tags in the dataset:  9


In [27]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding
# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}
print("The word 'rejects' is identified by the index: {}".format(word2idx["rejects"]))

The word 'rejects' is identified by the index: 218128


In [28]:
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}
print("The labels B-LOC (location) is identified by the index: {}".format(tag2idx["B-LOC"]))

The labels B-LOC (location) is identified by the index: 250026


In [29]:
def tokenize(word_list):
    new_list= []
    for word in word_list:
        if is_number(word2idx[word]):
            new_list.append(word2idx[word])
    else:
        None
    return(new_list)

In [30]:
#training set
train_words, train_tags = extract_words(train)

#tokenize words into tokens
tr_words = tokenize(train_words)

## Feature Engineering 

In [9]:
def feature_map(word):
    '''Simple feature map.'''
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(),  word.isalpha()])

In [None]:
feature_words = [feature_map(word) for word in train_words]

## Random Forest Classifier (with Features)

We use feature engineering here as Random Forest classifiers are not typical used for sequencs classification and thus need an input dataframe with a least two columns. Therefore we need train the classifier using feature engineered data with 6 features results in a dataframe of 6 columns. 

In [19]:
#Train model
pred = cross_val_predict(RandomForestClassifier(n_estimators=20), X=feature_words, y=train_tags, cv=5)

In [20]:
#generate report on entire model
report = classification_report(y_pred=pred, y_true=train_tags)
print(report)

             precision    recall  f1-score   support

      B-LOC       0.21      0.39      0.27      7140
     B-MISC       0.38      0.02      0.03      3438
      B-ORG       0.34      0.18      0.24      6321
      B-PER       0.27      0.47      0.34      6600
      I-LOC       0.00      0.00      0.00      1157
     I-MISC       0.00      0.00      0.00      1155
      I-ORG       0.25      0.00      0.01      3704
      I-PER       0.32      0.01      0.01      4528
          O       0.94      0.97      0.96    169578

avg / total       0.83      0.85      0.83    203621



  'precision', 'predicted', average, warn_for)
