# Feature Engineering and Classification for Metaphor

In [27]:
from collections import Counter
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from random import shuffle
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from nltk.corpus import wordnet as wn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.externals import joblib
import nltk

In [28]:
def generate_features(wordphrase):
    features = {}
    POS_bucket = []
    wordphrase = wordphrase.lower()
#     for line in wordphrase:
#         for word in line:
#             POS = nltk.pos_tag(word)
#             POS_bucket.append(POS[0][1])
    features = featurize_pos_list(get_pos_list_from_ngram(wordphrase), features)
    return features
generate_features('veni vidi vici') 

{'NN': 3}

In [22]:
def get_letter_combinations(candidate, features, number):
    candidate = candidate.replace(" ", "")
    if len(candidate) < number:
        return features
    else:
        for index in range(0, len(candidate), number):
            features[candidate[index:index + number]] += 1
        return features

In [21]:
def featurize_pos_list(pos_list, features):
    for pos in pos_list:
        if pos in features.keys():
            features[pos] += 1
        else:
            features[pos] = 1
    return features

def get_pos_list_from_ngram(ngram):
    ngram_tagged = nltk.pos_tag(ngram.split())
    pos_list = [tagged_word[1] for tagged_word in ngram_tagged]
    return pos_list
        
    
    

In [39]:
# This function allows experimentation with different feature definitions
# items is a list of (key, value) pairs from which features are extracted and training sets are made
# Feature sets returned are dictionaries of features

# This function also optionally returns the names of the training, development, 
# and test data for the purposes of error checking

def create_training_sets (feature_function, items, return_items=False):
    # Create the features sets.  Call the function that was passed in.
    # For names data, key is the name, and value is the gender
    shuffle(items)
    featuresets = [(feature_function(key), value, key) for (key, value) in items]
    
    # Divided training and testing in thirds.  Could divide in other proportions instead.
    fifth = int(float(len(featuresets)) / 5.0)
    
    train_set, dev_set, test_set = featuresets[0:fifth*4], featuresets[fifth*4:fifth*5], featuresets[fifth*4:]
    train_items, dev_items, test_items = items[0:fifth*4], items[fifth*4:fifth*5], items[fifth*4:]
    if return_items == True:
        return train_set, dev_set, test_set, train_items, dev_items, test_items
    else:
        return train_set, dev_set, test_set

In [56]:
len(test_set)

3

In [31]:
dataset_df = pd.read_csv("all_an_data.csv")
dataset_df.head()

Unnamed: 0,Phrase,Label
0,stormy applause,metaphor
1,clean government,metaphor
2,woman suffer pain,NOT metaphor
3,filthy garment,NOT metaphor
4,silky voice,metaphor


In [32]:
print(dataset_df.head())
items = []
print(dataset_df["Phrase"][0])
for index in range(len(dataset_df)):
    items.append((dataset_df["Phrase"][index], dataset_df["Label"][index]))
print(items)
    

               Phrase         Label
0     stormy applause      metaphor
1    clean government      metaphor
2   woman suffer pain  NOT metaphor
3      filthy garment  NOT metaphor
4         silky voice      metaphor
stormy applause
[('stormy applause', 'metaphor'), ('  clean government', 'metaphor'), ('woman suffer pain', 'NOT metaphor'), ('filthy garment', 'NOT metaphor'), ('silky voice', 'metaphor'), ('temperature break number', 'metaphor'), ('tragedy strike community', 'metaphor'), ('woolly liberal', 'metaphor'), ('thirsty camel', 'NOT metaphor'), ('army bury soldier', 'NOT metaphor'), ('empty promise', 'metaphor'), ('stony ridge', 'NOT metaphor'), ('framework bury initiative', 'metaphor'), ('strong bone', 'NOT metaphor'), ('cloudy future', 'metaphor'), ('pronoun grab towel', 'NOT metaphor'), ('firm mattress', 'NOT metaphor'), ('person smile none', 'NOT metaphor'), ('change shake confidence', 'metaphor'), ('bride dream none', 'NOT metaphor'), ('foot slip none', 'NOT metaphor'), ('te

In [42]:
train_set, dev_set, test_set, train_items, dev_items, test_items = create_training_sets(generate_features, items, True)
# cl4 = nltk.NaiveBayesClassifier.train(train_set4)
# This is code from the NLTK chapter
errors = []
# print ("%.3f" % nltk.classify.accuracy(cl4, dev_set4))

In [43]:
# print ("%.3f" % nltk.classify.accuracy(cl4, test_set4))
# print(train_set4[0][1])
# print(test_set4[:2])
test_set_features = np.asarray([item[0] for item in test_set])
train_set_features = np.asarray([item[0] for item in train_set])
test_set_names = np.asarray([item[2] for item in test_set])
train_set_names = np.asarray([item[2] for item in train_set])
test_set_labels = np.asarray([item[1] for item in test_set])
train_set_labels = np.asarray([item[1] for item in train_set])

train_set = {}
train_set["features"] = train_set_features
train_set["names"] = train_set_names
train_set["labels"] = train_set_labels

test_set = {}
test_set["features"] = test_set_features
test_set["names"] = test_set_names
test_set["labels"] = test_set_labels

print(test_set["names"][0])



ship carry passenger


In [44]:
def create_manual_test_set(manual_list, generate_features):
    manual_set = [(generate_features(key), value, key) for (key, value) in manual_list]
    test_set_features = np.asarray([item[0] for item in manual_set])
    test_set_labels = np.asarray([item[1] for item in manual_set])
    test_set_names = np.asarray([item[2] for item in manual_set])
    manual_set_dict = {}
    manual_set_dict["features"] = test_set_features
    manual_set_dict["names"] = test_set_names
    manual_set_dict["labels"] = test_set_labels
    return manual_set_dict

In [45]:

class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
#         print(data_dict)
        return data_dict[self.key]

In [60]:
kaggle_classifier = Pipeline([('union', FeatureUnion(
                                    transformer_list=[

                                        # Pipeline for pulling features from the post's subject line
                                        ('names', Pipeline([
                                            ('selector', ItemSelector(key='names')),
                                            ('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(2,3), sublinear_tf=True)),
                                        ])),

                                        # Pipeline for standard bag-of-words model for body
                                        ('features', Pipeline([
                                            ('selector', ItemSelector(key='features')),
                                            ('dict', DictVectorizer(sparse='False'))
                                        ])),

                                    ],

                                    # weight components in FeatureUnion
                                    transformer_weights={
                                        'names': 0.2,
                                        'features': 0.8,
                                    },
                                )),

                                # Use a SVC classifier on the combined features
                                ('svc', LinearSVC()),
                            ])
kaggle_classifier = kaggle_classifier.fit(train_set,train_set_labels)
    
kaggle_predictions = kaggle_classifier.predict(test_set)

accuracy_score(test_set_labels, kaggle_predictions)

0.52325581395348841

In [47]:
# kaggle_classifier.predict(["viz a viz", "tete a tete", "locker", "scrum", "scalar", "table"])
manual_list = [("dividend", True)]
manual_test_dict = create_manual_test_set(manual_list, generate_features)
print(manual_test_dict)
manual_predictions = kaggle_classifier.predict(manual_test_dict)
print(manual_predictions)

{'features': array([{'NN': 1}], dtype=object), 'names': array(['dividend'], 
      dtype='<U8'), 'labels': array([ True], dtype=bool)}
['metaphor']


In [48]:
kaggle_classifier = Pipeline([('tfidfvect', TfidfVectorizer(analyzer='char', ngram_range=(2,4), sublinear_tf=True)),
#                                     ('feat',SelectKBest(chi2, 5)),
                                    ('classifier', LinearSVC())
                                   ])
kaggle_classifier = kaggle_classifier.fit(train_set_names,train_set_labels)
    
kaggle_predictions = kaggle_classifier.predict(test_set_names)

accuracy_score(test_set_labels, kaggle_predictions)

0.51162790697674421

In [16]:
def test_manual_predictions(manual_list):
    manual_test_dict = create_manual_test_set(manual_list, generate_features)
    manual_predictions = kaggle_classifier.predict(manual_test_dict)
    print(manual_predictions)
    
    

In [17]:
# The True/False bit of the Tuple only needs to be accurate if you plan to test the accuracy using accuracy_score, 
# else it isn't considered.
manual_list = [("viz a viz", True), ("tete a tete",False), ("bottomline", True), ("ibuprofin", True), ("uninterested", False)]
test_manual_predictions(manual_list)


[ True  True False False False]


In [18]:
joblib.dump(kaggle_classifier, 'linear_jargon_classifier.pkl') 

['linear_jargon_classifier.pkl']