# Random Forest (with Feature Engineering)

In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Custom models
from prepro import readfile, get_sentence, is_number, extract_words,partial_tags


#Model
from sklearn.ensemble import RandomForestClassifier

#Evalulation
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report,f1_score

In [116]:
#import data from my github repo
train = readfile("train.txt")
corpus = train.copy()
test = readfile("test.txt")

#create corpus
corpus.extend(test)

In [117]:
words = []
tags = []
for sentence in corpus:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])   

## Tokenization

In [118]:
tags = partial_tags(tags)
words=list(words)
n_words = len(set(words))
n_tags = len(set(tags))

labels=list(set(tags))
labels.remove('O')

print("Number of words in the dataset: ", n_words)
print("Number of tags in the dataset: ", n_tags)

Number of words in the dataset:  27316
Number of tags in the dataset:  5


In [119]:
#training set
train_words, train_tags = extract_words(train)
train_tags=partial_tags(train_tags)

## Orthographic  Feature Engineering 

In [120]:
def count_vowel(word):
    return sum(list(map(word.lower().count, "aeiou")))

def dash(word):
    return 1 if "-" in word else 0

def count_consonants(word):
    vowels="aeiou"
    return sum(i not in vowels for i in word)
    
def feature_map(word):
    '''Simple feature map with 10 features'''
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(),  word.isalpha(),word.isalnum(),count_vowel(word),
                    dash(word), count_consonants(word)]).tolist()

In [121]:
train_feature_words = [feature_map(word) for word in train_words]

In [122]:
train_feature_words[0]

[0, 0, 1, 2, 0, 1, 1, 2, 0, 2]

## Random Forest Classifier (with Features)

We use feature engineering here as Random Forest classifiers are not typical used for sequencs classification and thus need an input dataframe with a least two columns. Therefore we need train the classifier using feature engineered data with 10 features results in a dataframe of 10 columns. 

In [44]:
#Train 
pred = cross_val_predict(RandomForestClassifier(n_estimators=20), X=train_feature_words, y=train_tags, cv=5)

In [45]:
#generate report on entire model
report = classification_report(y_pred=pred, y_true=train_tags,labels=labels)
f1= f1_score(y_pred=pred, y_true=train_tags,average='weighted',labels=labels)
print(report)
print("F1 Score:",f1)

              precision    recall  f1-score   support

         PER       0.36      0.76      0.49     11128
         LOC       0.48      0.20      0.29      8297
         ORG       0.42      0.28      0.34     10025
        MISC       0.44      0.12      0.19      4593

   micro avg       0.39      0.40      0.39     34043
   macro avg       0.42      0.34      0.32     34043
weighted avg       0.42      0.40      0.35     34043

F1 Score: 0.3538755235011891


## Predict on Test Dataset

In [12]:
## Test set
test_words, test_tags = extract_words(test)
test_tags=partial_tags(test_tags)

#apply feature engineering
test_feature_words = [feature_map(word) for word in test_words]

In [13]:
#Train and fit
clf = RandomForestClassifier(n_estimators=20, random_state=0)
clf.fit(train_feature_words,train_tags)

pred = clf.predict(test_feature_words)

In [14]:
#generate report on entire model
report = classification_report(y_pred=pred, y_true=test_tags,labels=labels)
f1= f1_score(y_pred=pred, y_true=test_tags,average='weighted',labels=labels)
print(report)
print("F1 Score:",f1)

              precision    recall  f1-score   support

         PER       0.37      0.78      0.50      2773
         LOC       0.48      0.20      0.28      1925
         ORG       0.43      0.26      0.32      2496
        MISC       0.37      0.14      0.20       918

   micro avg       0.39      0.41      0.40      8112
   macro avg       0.41      0.34      0.33      8112
weighted avg       0.41      0.41      0.36      8112

F1 Score: 0.3601999395995972


## Adding Context to Words

In [182]:
#Get all the sentences in the form of a list
def get_all_sentences(dataset):
    sentences=[]
    for i in range(len(dataset)):
        sentences.append(get_sentence(dataset,i+1))
    return sentences

def sent2features(sent):
    return [feature_map2(sent, i) for i in range(len(sent))]

def feature_map2(sentence,i):
    word = sentence[i]
    
    '''Simple feature map with 10 features'''
    primary_word = np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(),  word.isalpha(),word.isalnum(),count_vowel(word),
                    dash(word), count_consonants(word)]).tolist()
    if i > 0:
        word1 = sentence[i-1]
        primary_word.extend(np.array([word1.istitle(), word1.islower(), word1.isupper(), len(word1),
                     word1.isdigit(),  word1.isalpha(),word1.isalnum(),count_vowel(word1),
                    dash(word1), count_consonants(word1)]).tolist())
    else: 
        word1 = sentence[-i]
        primary_word.extend(np.array([word1.istitle(), word1.islower(), word1.isupper(), len(word1),
                     word1.isdigit(),  word1.isalpha(),word1.isalnum(),count_vowel(word1),
                    dash(word1), count_consonants(word1)]).tolist())
    
    if i < len(sentence)-1:
        word1 = sentence[i+1]
        primary_word.extend(np.array([word1.istitle(), word1.islower(), word1.isupper(), len(word1),
                     word1.isdigit(),  word1.isalpha(),word1.isalnum(),count_vowel(word1),
                    dash(word1), count_consonants(word1)]).tolist())
    else:
        word1 = sentence[0]
        primary_word.extend(np.array([word1.istitle(), word1.islower(), word1.isupper(), len(word1),
                     word1.isdigit(),  word1.isalpha(),word1.isalnum(),count_vowel(word1),
                    dash(word1), count_consonants(word1)]).tolist())
        
    return(primary_word)

## Predict on Training

In [198]:
sentences = get_all_sentences(train)

#Applt feature engineering
X = [sent2features(s) for s in sentences]

train_feature_words = []
for sublist in X:
    for item in sublist:
        train_feature_words.append(item)

In [199]:
#Train 
pred = cross_val_predict(RandomForestClassifier(n_estimators=20), X=train_feature_words, y=train_tags, cv=5)

In [200]:
report = classification_report(y_pred=pred, y_true=train_tags,labels=labels)
f1= f1_score(y_pred=pred, y_true=train_tags,average='weighted',labels=labels)
print(report)
print("F1 Score:",f1)

              precision    recall  f1-score   support

         PER       0.69      0.72      0.71     11128
         LOC       0.68      0.66      0.67      8297
         ORG       0.67      0.62      0.64     10025
        MISC       0.57      0.52      0.54      4593

   micro avg       0.67      0.65      0.66     34043
   macro avg       0.65      0.63      0.64     34043
weighted avg       0.67      0.65      0.66     34043

F1 Score: 0.656093027063109


## Predict on Test

In [201]:
sentences = get_all_sentences(test)

#Applt feature engineering
X = [sent2features(s) for s in sentences]

test_feature_words = []
for sublist in X:
    for item in sublist:
        test_feature_words.append(item)

In [202]:
#Train and fit
clf = RandomForestClassifier(n_estimators=20, random_state=0)
clf.fit(train_feature_words,train_tags)

pred = clf.predict(test_feature_words)

In [203]:
#generate report on entire model
report = classification_report(y_pred=pred, y_true=test_tags,labels=labels)
f1= f1_score(y_pred=pred, y_true=test_tags,average='weighted',labels=labels)
print(report)
print("F1 Score:",f1)

              precision    recall  f1-score   support

         PER       0.65      0.67      0.66      2773
         LOC       0.63      0.60      0.61      1925
         ORG       0.63      0.61      0.62      2496
        MISC       0.46      0.46      0.46       918

   micro avg       0.62      0.61      0.61      8112
   macro avg       0.59      0.59      0.59      8112
weighted avg       0.62      0.61      0.61      8112

F1 Score: 0.6140340754306168
