In [164]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

In [165]:
#data = pd.read_csv('./output.csv')
data = pd.read_csv('./output_sliding_window_100d.csv')

column_names = ['label','dep','pos','head_pos','head_dep','entity_label','frequency_in_doc','sentence_loc','sentence_num']
for i in range(100):
    column_names.append('text_'+str(i))
for i in range(100):
    column_names.append('head_text_'+str(i))
for i in range(100):
    column_names.append('next_verb_'+str(i))
#data.columns = column_names

for i in range(100):
    column_names.append('two_prior_'+str(i))
for i in range(100):
    column_names.append('one_prior_'+str(i))
for i in range(100):
    column_names.append('one_post_'+str(i))
for i in range(100):
    column_names.append('two_post_'+str(i))

data.columns = column_names
#data = data.drop(data.iloc[:,9:159], axis=1)

In [166]:
percent_no_loc = data[data['label'] != 'none'].shape[0] 
print('Percentage of sentences with no location:', percent_no_loc)

Percentage of sentences with no location: 1488


In [167]:
data_labeled = data[data['label'] != 'none']
data_unlabeled = data[data['label'] == 'none']
print(data_unlabeled.shape)
print(data_labeled.shape)
balanced_data = data_labeled.append(data_unlabeled.sample(n=data_labeled.shape[0]))
print(balanced_data.shape)
data = balanced_data

(3345, 359)
(1488, 359)
(2976, 359)


In [168]:
data_labeled = data[data['label'] != 'none']
data_unlabeled = data[data['label'] == 'none']
print(data_unlabeled.shape)
print(data_labeled.shape)
balanced_data = data_labeled.append(data_unlabeled.sample(n=data_labeled.shape[0]))
print(balanced_data.shape)
data = balanced_data

(1488, 359)
(1488, 359)
(2976, 359)


In [169]:
data['label'] = data['label'].astype('category')
categories = dict(enumerate(data['label'].cat.categories))
data['label'] = data['label'].cat.codes

In [170]:
#dim reduce word embedding vectors - Doesn't work well
# text_cols = data.iloc[:,9:309]
# head_text_cols = data.iloc[:,309:609]
# next_verb_cols = data.iloc[:,609:909]
# prior_one_cols = data.iloc[:,909:1209]
# prior_two_cols = data.iloc[:,1209:1509]
# post_one_cols = data.iloc[:,1509:1809]
# post_two_cols = data.iloc[:,1809:2109]

# pca = PCA(n_components=100)
# text_cols_reduced = pca.fit_transform(text_cols)
# text_cols_reduced_df = pd.DataFrame(data=text_cols_reduced, columns=column_names[9:109])
# head_text_cols_reduced = pca.fit_transform(head_text_cols)
# head_text_cols_reduced_df = pd.DataFrame(data=head_text_cols_reduced, columns=column_names[309:409])
# next_verb_cols_reduced = pca.fit_transform(next_verb_cols)
# next_verb_cols_reduced_df = pd.DataFrame(data=next_verb_cols_reduced, columns=column_names[609:709])

# prior_one_cols_reduced = pca.fit_transform(prior_one_cols)
# prior_one_cols_reduced_df = pd.DataFrame(data=prior_one_cols_reduced, columns=column_names[909:1009])
# prior_two_cols_reduced = pca.fit_transform(prior_two_cols)
# prior_two_cols_reduced_df = pd.DataFrame(data=prior_two_cols_reduced, columns=column_names[1209:1309])
# post_one_cols_reduced = pca.fit_transform(post_one_cols)
# post_one_cols_reduced_df = pd.DataFrame(data=post_one_cols_reduced, columns=column_names[1509:1609])
# post_two_cols_reduced = pca.fit_transform(post_two_cols)
# post_two_cols_reduced_df = pd.DataFrame(data=post_two_cols_reduced, columns=column_names[1809:1909])

# data = data.iloc[:,:9]
# data = data.merge(text_cols_reduced_df, left_index=True, right_index=True)
# data = data.merge(head_text_cols_reduced_df, left_index=True, right_index=True)
# data = data.merge(next_verb_cols_reduced_df, left_index=True, right_index=True)
# data = data.merge(prior_one_cols_reduced_df, left_index=True, right_index=True)
# data = data.merge(prior_two_cols_reduced_df, left_index=True, right_index=True)
# data = data.merge(post_one_cols_reduced_df, left_index=True, right_index=True)
# data = data.merge(post_two_cols_reduced_df, left_index=True, right_index=True)

# print(data.shape)

# print(np.any(np.isnan(data.iloc[:,9:309].to_numpy())))

In [171]:
#One hot encode categorical labels (dependency and pos features)
dep_one_hot = pd.get_dummies(data['dep'], prefix='dep')
pos_one_hot = pd.get_dummies(data['pos'], prefix='pos')
head_pos_one_hot = pd.get_dummies(data['head_pos'], prefix='head_pos')
head_dep_one_hot = pd.get_dummies(data['head_dep'], prefix='head_dep')
entity_label_one_hot = pd.get_dummies(data['entity_label'], prefix='entity_label')

data = data.drop(['dep', 'pos','head_pos', 'head_dep',  'entity_label'], axis=1)
data = data.join(dep_one_hot)
data = data.join(pos_one_hot)
data = data.join(head_pos_one_hot)
data = data.join(head_dep_one_hot)
data = data.join(entity_label_one_hot)

print(data.shape)

(2976, 446)


In [172]:
X = data.drop('label', axis=1)
y = data['label']

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [174]:
learning_rates = [0.05, 0.1, 0.15, 0.2]
n_estimators = [50, 75, 100, 125]
max_features = ['10', 'sqrt', '50', '100']
max_depth = [2, 4, 6, 8]
min_samples_split= [150, 200, 250]
min_samples_leaf = [1]

classifier = GradientBoostingClassifier(learning_rate=0.06, n_estimators=120, max_depth=4,  max_features='sqrt', random_state=0, min_samples_split=50, min_samples_leaf=1)
#classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [175]:
print(categories) 
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

{0: 'acqbus', 1: 'acqloc', 2: 'acquired', 3: 'drlamt', 4: 'none', 5: 'purchaser', 6: 'seller', 7: 'status'}
Accuracy: 0.8051075268817204
              precision    recall  f1-score   support

           0       0.73      0.48      0.58        23
           1       0.68      0.45      0.54        29
           2       0.63      0.63      0.63        91
           3       1.00      0.65      0.79        40
           4       0.88      0.98      0.92       367
           5       0.62      0.78      0.69        96
           6       0.50      0.15      0.23        33
           7       0.98      0.83      0.90        65

    accuracy                           0.81       744
   macro avg       0.75      0.62      0.66       744
weighted avg       0.80      0.81      0.79       744

