In [None]:
import glob
import string
import fasttext
import fileinput
import pandas as pd
import numpy as np
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Whitespace-separated entries of the source lexicon, loaded once for the
# (optional) 'word.issource()' membership feature.
lexicon_sources = []

with open('../lexicon/set_2/lexicon_sources_2.txt', "r") as lexicon_file:
    lexicon_sources.extend(lexicon_file.read().split())

# 1. Define features

In [None]:
# Positional layout of one token row as read back from the *_features_bin.csv
# files written by this notebook (index column first, then the frame columns):
#   [0] row index, [1] filename, [2] token, [3] label,
#   [4:104] the 100 raw embedding values, [104:204] the 100 binarised values.
_TOKEN_COL = 2
_VECTOR_COLS = slice(4, 104)
_BINVECTOR_COLS = slice(104, 204)


def _token_features(row, offset, use_lexicon, use_vector, use_binvector):
    """Build the features of a single token row, with keys prefixed by ``offset``.

    ``offset`` is '' for the current token and '-1' / '+1' for its neighbours.
    """
    word = row[_TOKEN_COL]
    # Historical key scheme, kept byte-for-byte so feature names stay comparable
    # with earlier runs: the first four keys carry a ':' after a non-empty
    # offset, the remaining ones do not.
    colon_prefix = offset + ':' if offset else ''
    features = {
        colon_prefix + 'word.lower()': word.lower(),
        # NOTE: the key reads "[5:]" but the value is the 5-character prefix.
        colon_prefix + 'word[5:]': word[:5],
        colon_prefix + 'word[-5:]': word[-5:],
        colon_prefix + 'word.isdigit()': word.isdigit(),
        offset + 'word.islower()': word.islower(),
        offset + 'word.istitle()': word.istitle(),
    }
    if use_lexicon:
        features[offset + 'word.issource()'] = word.lower() in lexicon_sources
    if use_vector:
        features[offset + 'vector'] = row[_VECTOR_COLS]
    if use_binvector:
        features[offset + 'binvector'] = row[_BINVECTOR_COLS]
    return features


def word2features(sent, i, use_lexicon=False, use_vector=False, use_binvector=False):
    """Return the CRF feature dict for token ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence of rows
        Each row is indexable with the token string at position 2; when the
        vector features are requested, positions 4..103 and 104..203 must hold
        the raw and binarised embedding values.
    i : int
        Index of the token to describe.
    use_lexicon : bool, optional
        Add '<offset>word.issource()' (lower-cased token found in the
        module-level ``lexicon_sources``).
    use_vector, use_binvector : bool, optional
        Add '<offset>vector' / '<offset>binvector' list features holding all
        100 embedding values (the previous slices stopped one column short).

    Returns
    -------
    dict
        Features of the token itself, of its left/right neighbour where one
        exists, plus 'BOS' / 'EOS' markers at the sequence boundaries. With the
        default flags the result is identical to the earlier implementation.
    """
    flags = (use_lexicon, use_vector, use_binvector)

    features = {'bias': 1.0}
    features.update(_token_features(sent[i], '', *flags))

    if i > 0:
        features.update(_token_features(sent[i-1], '-1', *flags))
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        features.update(_token_features(sent[i+1], '+1', *flags))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label of every row in ``sent``.

    Expects the vector-free form of the sentence: each row must unpack into
    exactly three items, the last one being the label.
    """
    labels = []
    for _nr, _token, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token of every row in ``sent``.

    Expects the vector-free form of the sentence: each row must unpack into
    exactly three items, the middle one being the token.
    """
    tokens = []
    for _nr, token, _label in sent:
        tokens.append(token)
    return tokens

# 2. Prepare train set

In [None]:
# Load the training tokens; the 'attribute' column is discarded right away.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
features_train = pd.read_csv('all_train_vectors.csv', sep=',', index_col=0).drop(columns='attribute')

In [None]:
# Turn the stringified array ("[ 0.1 0.2\n 0.3 ]") into single-space separated
# numbers. Literal replacements state regex=False explicitly (the default differs
# between pandas versions) and the whitespace pattern is a raw string so that
# '\s' is not an invalid string escape.
features_train['vector'] = (
    features_train['vector']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.strip()
    .str.replace('\n', ' ', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
)

In [None]:
# Replace the single 'vector' string column by one column per embedding value
# (integer column labels 0, 1, ...), next to the filename/token/label columns.
train_vector_columns = features_train['vector'].str.split(' ', expand=True)
features_train = pd.concat(
    [features_train[['filename', 'token', 'label']], train_vector_columns],
    axis=1,
)

## 2.1 Convert RawEmb to BinEmb

In [None]:
# The split produced string columns; convert each of the 100 embedding columns
# to a (down-cast) float dtype so they can be compared numerically.
for column in range(100):
    features_train[column] = pd.to_numeric(features_train[column], downcast="float")

In [None]:
# Per-dimension thresholds for the binarisation below: for every column of the
# fastText matrix, the mean of its positive entries and the mean of its
# negative entries.
# NOTE(review): this uses the model's *output* matrix — confirm that this is the
# matrix the token vectors in the CSV files were taken from.
model = fasttext.load_model("PATH_TO_FASTTEXT_MODEL/.bin")
numpynumeric = model.get_output_matrix()
numeric = pd.DataFrame(data=numpynumeric)

# Entries failing the condition are masked to NaN and skipped by the mean.
positive_means = numeric.where(numeric > 0).mean(axis=0).values
negative_means = numeric.where(numeric < 0).mean(axis=0).values

# Map column label (0, 1, ...) -> threshold.
positive_dict = dict(zip(numeric.columns, positive_means))
negative_dict = dict(zip(numeric.columns, negative_means))

In [None]:
# Add a three-valued 'bin<k>' column for each embedding dimension k:
# '+' above the positive mean, '-' below the negative mean, 0 otherwise.
all_rows = []

for _, row in features_train.iterrows():
    row_dict = dict(row)
    for dim in range(100):
        value = row_dict[dim]
        if value > positive_dict[dim]:
            symbol = '+'
        elif value < negative_dict[dim]:
            symbol = '-'
        else:
            symbol = 0
        row_dict['bin' + str(dim)] = symbol
    all_rows.append(row_dict)

merged = pd.DataFrame(all_rows)

In [None]:
# Persist the training frame (raw + binarised embedding columns). The index is
# written as the first CSV column, which is why the rows read back later hold
# filename/token/label at positions 1..3.
merged.to_csv('all_train_vectors_features_bin.csv')

In [None]:
from csv import reader

# Read the file back as plain lists of strings (header row included).
# newline='' is what the csv module requires of file objects so that quoted
# fields containing line breaks are parsed correctly; the explicit encoding
# matches the UTF-8 that DataFrame.to_csv writes by default, independent of
# the platform locale.
with open('all_train_vectors_features_bin.csv', 'r', newline='', encoding='utf-8') as read_obj:
    csv_reader = reader(read_obj)
    sent = list(csv_reader)

In [None]:
# Drop the CSV header row. The check makes the cell safe to re-run: without it a
# second execution would silently remove the first real token row.
if sent and sent[0][1:4] == ['filename', 'token', 'label']:
    sent.pop(0)

In [None]:
# The whole training file is handled as one single sequence. `sent2` is the
# vector-free view (filename, token, label) used for label extraction.
sent2 = [row[1:4] for row in sent]
train_sents = [sent]
train_sents2 = [sent2]

In [None]:
# Feature dicts come from the full rows, gold labels from the 3-column rows.
X_train = list(map(sent2features, train_sents))
Y_train = list(map(sent2labels, train_sents2))

### 2.1.1 Split vectors into 100 features

In [None]:
# Expand every list-valued raw-embedding feature into one float feature per
# dimension: 'vector' -> 'vector1', 'vector2', ... (same for '-1vector' and
# '+1vector'); the list-valued entry itself is removed.
for item in X_train:
    for word in item:
        # Each key is handled on its own. The first token has no '-1vector' but
        # does have a '+1vector' (previously skipped by an early `continue`),
        # and none of the keys exist when the vector features are switched off
        # in word2features (previously a KeyError).
        for key in ('vector', '-1vector', '+1vector'):
            if key not in word:
                continue
            for i, element in enumerate(word.pop(key)):
                word['{}{}'.format(key, i+1)] = float(element)

### 2.1.2 Split binvectors into 100 features

In [None]:
# Expand every list-valued binarised-embedding feature into one feature per
# dimension: 'binvector' -> 'binvector1', 'binvector2', ... (same for
# '-1binvector' and '+1binvector'); values are kept as read (no float cast).
for item in X_train:
    for word in item:
        # Each key is handled on its own. The first token has no '-1binvector'
        # but does have a '+1binvector' (previously skipped by an early
        # `continue`), and none of the keys exist when the binvector features
        # are switched off in word2features (previously a KeyError).
        for key in ('binvector', '-1binvector', '+1binvector'):
            if key not in word:
                continue
            for i, element in enumerate(word.pop(key)):
                word['{}{}'.format(key, i+1)] = element

# 3. Prepare test set

In [None]:
# Load the test tokens; the 'attribute' column is discarded right away.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
features_test = pd.read_csv('all_test_vectors.csv', sep=',', index_col=0).drop(columns='attribute')

In [None]:
# Turn the stringified array ("[ 0.1 0.2\n 0.3 ]") into single-space separated
# numbers. Literal replacements state regex=False explicitly (the default differs
# between pandas versions) and the whitespace pattern is a raw string so that
# '\s' is not an invalid string escape.
features_test['vector'] = (
    features_test['vector']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.strip()
    .str.replace('\n', ' ', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
)

In [None]:
# Replace the single 'vector' string column by one column per embedding value
# (integer column labels 0, 1, ...), next to the filename/token/label columns.
test_vector_columns = features_test['vector'].str.split(' ', expand=True)
features_test = pd.concat(
    [features_test[['filename', 'token', 'label']], test_vector_columns],
    axis=1,
)

## 3.1 Convert RawEmb to BinEmb

In [None]:
# The split produced string columns; convert each of the 100 embedding columns
# to a (down-cast) float dtype so they can be compared numerically.
for column in range(100):
    features_test[column] = pd.to_numeric(features_test[column], downcast="float")

In [None]:
# Add a three-valued 'bin<k>' column for each embedding dimension k, using the
# same thresholds as for the training data:
# '+' above the positive mean, '-' below the negative mean, 0 otherwise.
all_rows = []

for _, row in features_test.iterrows():
    row_dict = dict(row)
    for dim in range(100):
        value = row_dict[dim]
        if value > positive_dict[dim]:
            symbol = '+'
        elif value < negative_dict[dim]:
            symbol = '-'
        else:
            symbol = 0
        row_dict['bin' + str(dim)] = symbol
    all_rows.append(row_dict)

merged = pd.DataFrame(all_rows)

In [None]:
# Persist the test frame (raw + binarised embedding columns). The index is
# written as the first CSV column, which is why the rows read back later hold
# filename/token/label at positions 1..3.
merged.to_csv('all_test_vectors_features_bin.csv')

In [None]:
# Read the file back as plain lists of strings (header row included).
# newline='' is what the csv module requires of file objects so that quoted
# fields containing line breaks are parsed correctly; the explicit encoding
# matches the UTF-8 that DataFrame.to_csv writes by default, independent of
# the platform locale.
with open('all_test_vectors_features_bin.csv', 'r', newline='', encoding='utf-8') as read_obj:
    csv_reader = reader(read_obj)
    sent3 = list(csv_reader)

In [None]:
# Drop the CSV header row. The check makes the cell safe to re-run: without it a
# second execution would silently remove the first real token row.
if sent3 and sent3[0][1:4] == ['filename', 'token', 'label']:
    sent3.pop(0)

In [None]:
# The whole test file is handled as one single sequence. `sent4` is the
# vector-free view (filename, token, label) used for label extraction and for
# the gold/prediction table built at the end.
sent4 = [row[1:4] for row in sent3]
test_sents = [sent3]
test_sents2 = [sent4]

In [None]:
# Feature dicts come from the full rows, gold labels from the 3-column rows.
X_test = list(map(sent2features, test_sents))
Y_test = list(map(sent2labels, test_sents2))

### 3.1.1 Split vectors into 100 features

In [None]:
# Expand every list-valued raw-embedding feature into one float feature per
# dimension: 'vector' -> 'vector1', 'vector2', ... (same for '-1vector' and
# '+1vector'); the list-valued entry itself is removed.
for item in X_test:
    for word in item:
        # Each key is handled on its own. The first token has no '-1vector' but
        # does have a '+1vector' (previously skipped by an early `continue`),
        # and none of the keys exist when the vector features are switched off
        # in word2features (previously a KeyError).
        for key in ('vector', '-1vector', '+1vector'):
            if key not in word:
                continue
            for i, element in enumerate(word.pop(key)):
                word['{}{}'.format(key, i+1)] = float(element)

### 3.1.2 Split binvectors into 100 features

In [None]:
# Expand every list-valued binarised-embedding feature into one feature per
# dimension: 'binvector' -> 'binvector1', 'binvector2', ... (same for
# '-1binvector' and '+1binvector'); values are kept as read (no float cast).
for item in X_test:
    for word in item:
        # Each key is handled on its own. The first token has no '-1binvector'
        # but does have a '+1binvector' (previously skipped by an early
        # `continue`), and none of the keys exist when the binvector features
        # are switched off in word2features (previously a KeyError).
        for key in ('binvector', '-1binvector', '+1binvector'):
            if key not in word:
                continue
            for i, element in enumerate(word.pop(key)):
                word['{}{}'.format(key, i+1)] = element

# 4. Train CRF-model

In [None]:
# Configure and fit the CRF on the training sequences with fixed
# hyper-parameters (no search is run in this notebook section).
# NOTE(review): per the sklearn-crfsuite documentation c1 / c2 are the L1 / L2
# regularisation coefficients — confirm the chosen values are intended.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, Y_train)

# 5. Evaluate the model

In [None]:
# Labels to score: every class the model knows except 'O'. Filtering (rather
# than list.remove) also works when the training data contained no 'O' label,
# where remove() would abort the evaluation with a ValueError.
labels = [label for label in crf.classes_ if label != 'O']

In [None]:
# Predict the label sequence(s) of the test set, then display the weighted F1
# computed over `labels` only (the model's classes without 'O').
Y_pred = crf.predict(X_test)
metrics.flat_f1_score(Y_test, Y_pred,
                      average='weighted', labels=labels)

In [None]:
# Per-label precision / recall / F1 on the test set, restricted to `labels`
# and printed with three decimals.
print(metrics.flat_classification_report(
    Y_test, Y_pred, labels=labels, digits=3
))

In [None]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned line per ((label_from, label_to), weight) entry."""
    for transition, weight in trans_features:
        label_from, label_to = transition
        line = "%-6s -> %-7s %0.6f" % (label_from, label_to, weight)
        print(line)

# Rank all learned label-to-label transition weights once (highest first),
# then show both ends of the ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:20])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

In [None]:
def print_state_features(state_features):
    """Print one "weight label attribute" line per ((attr, label), weight) entry."""
    for feature, weight in state_features:
        attr, label = feature
        line = "%0.6f %-8s %s" % (weight, label, attr)
        print(line)

# Rank all learned (attribute, label) state weights once (highest first),
# then show both ends of the ranking.
ranked_state_features = Counter(crf.state_features_).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:30])

print("\nTop negative:")
print_state_features(ranked_state_features[-30:])

# 6. Merge label and predicted label

In [None]:
from itertools import chain

# Flatten the per-sequence predictions into one flat list of labels.
new_Y_pred = [label for sequence in Y_pred for label in sequence]

In [None]:
# Append each predicted label to its (filename, token, gold label) row.
result = []
for gold_row, predicted_label in zip(sent4, new_Y_pred):
    result.append(gold_row + [predicted_label])

In [None]:
# One row per test token: source file, token, gold label and predicted label.
df = pd.DataFrame(result, columns=['filename', 'token', 'gold', 'prediction'])

In [None]:
# Preview the first rows of the gold/prediction table.
df.head()

In [None]:
# Number of test tokens per gold label.
df.groupby('gold')['gold'].count()

In [None]:
# Gold-only view (filename, token, gold). drop() already returns a new frame,
# and `columns=` replaces the positional axis argument that pandas >= 2.0 rejects.
df_gold = df.drop(columns=['prediction'])

In [None]:
# Preview the gold-only table.
df_gold.head()

In [None]:
# Prediction-only view (filename, token, prediction). `columns=` replaces the
# positional axis argument that pandas >= 2.0 rejects.
df_prediction = df.drop(columns=['gold'])

In [None]:
# Preview the prediction-only table.
df_prediction.head()

In [None]:
# Write the gold labels as a tab-separated file (row index included).
df_gold.to_csv('lexicon_gold.txt', sep='\t')

In [None]:
# Write the predicted labels as a tab-separated file (row index included).
df_prediction.to_csv('lexicon_prediction.txt', sep='\t')

In [None]:
# Write gold and predicted labels side by side as a comma-separated file.
df.to_csv('lexicon_gold_prediction.csv')