This notebook explores feature engineering for text classification, training a regularized logistic regression model for the binary classification task of predicting a movie's genre.

In [None]:
import json
import nltk
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy import sparse
from collections import Counter
import operator

Read the movie metadata and return a dictionary mapping each genre to the set of movies tagged with that genre; print out the top 30 genres by frequency.

In [None]:
def read_metadata(filename):
    """Index the movies listed in a tab-separated metadata file by genre.

    The first line of the file is skipped.  On every remaining line, column 0
    is read as the movie ID and column 8 is parsed as a JSON object whose
    values are genre names.  The 30 most frequent genres are printed, one per
    line, as "<count padded to 10 chars><TAB><genre>".

    Returns a dict mapping each genre name to a dict whose keys are the IDs of
    the movies tagged with that genre (every value is 1).
    """
    genre_to_movies = {}
    genre_counts = Counter()

    with open(filename, encoding="utf-8") as handle:
        # discard the first line before reading records
        next(handle, None)

        for row in handle:
            fields = row.rstrip().split("\t")
            movie_id = fields[0]
            genre_map = json.loads(fields[8])

            for genre_name in genre_map.values():
                genre_counts[genre_name] += 1
                genre_to_movies.setdefault(genre_name, {})[movie_id] = 1

    # report the 30 most frequent genres in the data
    for genre_name, n_movies in genre_counts.most_common(30):
        print("%s\t%s" % (str(n_movies).ljust(10), genre_name))

    return genre_to_movies

In [None]:
# Build the genre -> movies index (this also prints the 30 most frequent genres).
metadata = read_metadata(filename="../data/movie.metadata.tsv")

Subset the metadata to just those movies that are tagged with exactly one of the two genres selected for binary classification (movies tagged with both genres are dropped).

In [None]:
def filter_for_genres(metadata, genre1, genre2):
    """Label the movies that are tagged with exactly one of two genres.

    metadata maps a genre name to a collection of movie IDs (anything that
    supports iteration and ``in``, e.g. a dict keyed by movie ID).

    Movies tagged with genre1 but not genre2 get label 1; movies tagged with
    genre2 but not genre1 get label 0.  Movies tagged with both are left out.

    Returns a dict mapping movie ID to its 0/1 label.
    Raises KeyError (naming the missing genre) if either genre is not a key
    of metadata.
    """

    # Fail up front with a message that names the problem, rather than with a
    # bare KeyError raised from the middle of a loop.
    missing = [genre for genre in (genre1, genre2) if genre not in metadata]
    if missing:
        raise KeyError("genre(s) not found in metadata: %s" % ", ".join(missing))

    movies1 = metadata[genre1]
    movies2 = metadata[genre2]

    # genre1-only movies are the positive class ...
    labels = {movie: 1 for movie in movies1 if movie not in movies2}
    # ... and genre2-only movies are the negative class.
    labels.update((movie, 0) for movie in movies2 if movie not in movies1)

    return labels

In [None]:
# genre1 is the positive class (label 1); genre2 is the negative class (label 0).
genre1, genre2 = "Romantic comedy", "Science Fiction"

labels = filter_for_genres(metadata, genre1=genre1, genre2=genre2)

Read movie summary data and tokenize the descriptions of those movies that match for the two genres selected.  Return a list of tokenized summaries and a corresponding list of their binary labels.

In [None]:
def read_data(filename, labels):
    """Read and tokenize the plot summaries of the labeled movies.

    Each line of the file is split on tabs; column 0 is the movie ID and
    column 1 is the summary text.  Lines whose movie ID is not a key of
    labels are ignored.  Each remaining summary is lowercased and tokenized
    with nltk.word_tokenize.

    Returns (X, Y): X is a list of token lists and Y is the parallel list of
    labels looked up from labels.
    """

    X = []
    Y = []

    with open(filename, encoding="utf-8") as file:
        for line in file:
            cols = line.rstrip().split("\t")
            movieID = cols[0]
            if movieID not in labels:
                continue

            # rstrip() also removes a trailing tab, so a labeled movie with an
            # empty summary leaves only one column; skip it instead of
            # failing with IndexError on cols[1].
            if len(cols) < 2:
                continue

            # lowercase the description, then tokenize it
            tokens = nltk.word_tokenize(cols[1].lower())
            X.append(tokens)
            Y.append(labels[movieID])

    return X, Y

In [None]:
# Tokenized summaries (X) and 0/1 genre labels (Y) for the selected movies.
X, Y = read_data(filename="../data/plot_summaries.txt", labels=labels)

Split the data into training and validation sets (hold out 20% of the data for evaluation).

In [None]:
# A fixed random_state keeps the train/dev split the same on every run.
trainX, devX, trainY, devY = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
def build_features(dataX, feature_functions):
    """Featurize every document in dataX with each function in feature_functions.

    Each feature function takes a document's token list and returns a dict of
    feature name -> value.  The dicts for one document are merged in the order
    the functions are listed, so a later function overwrites an earlier one on
    a shared feature name.

    Returns a list with one merged feature dict per document.
    """

    def featurize(tokens):
        merged = {}
        for extract in feature_functions:
            merged.update(extract(tokens))
        return merged

    return [featurize(tokens) for tokens in dataX]

In [None]:
def features_to_ids(data, feature_vocab):
    """Convert a list of feature dicts into a sparse document-by-feature matrix.

    data is a list of dicts (feature name -> value), one per document, and
    feature_vocab maps feature names to column indices.  Features that are
    missing from feature_vocab are dropped.  A sparse matrix is used because
    most feature values are 0 for most documents, and we do not want to store
    all of those zeros in memory.

    Returns a scipy.sparse.lil_matrix of shape (len(data), len(feature_vocab)).
    """
    n_docs, n_feats = len(data), len(feature_vocab)
    matrix = sparse.lil_matrix((n_docs, n_feats))

    for row, doc in enumerate(data):
        for name, value in doc.items():
            if name not in feature_vocab:
                continue
            matrix[row, feature_vocab[name]] = value

    return matrix

In [None]:
def create_vocab(data, top_n=None):
    """Assign a unique integer id to each feature name observed in data.

    data is a list of feature dicts, one per document.  Features are ranked by
    the number of documents that contain them, and ids are handed out in that
    order starting from 0.  top_n keeps only the n most frequent features
    (None keeps all of them).

    Returns a dict mapping feature name to id.
    """
    doc_freq = Counter(feat for doc in data for feat in doc)

    return {
        feat: idx
        for idx, (feat, _) in enumerate(doc_freq.most_common(top_n))
    }

In [None]:
def pipeline(trainX, devX, trainY, devY, feature_functions, top_n=100000, C=100):
    """Train and evaluate a logistic regression model on a set of feature functions.

    trainX / devX are lists of tokenized documents and trainY / devY are the
    matching labels.  feature_functions is the list of feature functions
    handed to build_features.

    top_n caps the vocabulary at the n most frequent training features, and C
    is the value passed as LogisticRegression's C argument; the defaults are
    the values this function previously hard-coded.

    Prints the accuracy on the dev data and returns (clf, feature_vocab): the
    fitted classifier and the feature name -> column id mapping.

    Raises ValueError if the feature functions yield no features at all on
    the training data.
    """

    trainX_feat = build_features(trainX, feature_functions)
    devX_feat = build_features(devX, feature_functions)

    # just create vocabulary from features in *training* data.
    feature_vocab = create_vocab(trainX_feat, top_n=top_n)

    # With an empty vocabulary the feature matrices would have zero columns,
    # leaving nothing to fit; stop here with an actionable message.
    if not feature_vocab:
        raise ValueError(
            "The feature functions produced no features on the training data; "
            "at least one feature is needed to train a classifier."
        )

    trainX_ids = features_to_ids(trainX_feat, feature_vocab)
    devX_ids = features_to_ids(devX_feat, feature_vocab)

    clf = linear_model.LogisticRegression(C=C, solver='lbfgs', penalty='l2', max_iter=10000)
    clf.fit(trainX_ids, trainY)
    print("Accuracy: %.3f" % clf.score(devX_ids, devY))

    return clf, feature_vocab

Let's create a simple dictionary-based feature: its value is set to 1 for a document whenever any word present in that dictionary appears in the document (and 0 otherwise).

In [None]:
comedy_dictionary = {"comedy", "love", "date"}
scifi_dictionary = {"science", "ship", "alien"}

def dictionary_feature(tokens):
    """Flag whether a document contains any word from each genre dictionary.

    Returns a dict holding "word_in_comedy_dictionary": 1 if any token is in
    comedy_dictionary and "word_in_scifi_dictionary": 1 if any token is in
    scifi_dictionary.  A feature whose dictionary has no match is left out.
    """
    lexicons = (
        ("word_in_comedy_dictionary", comedy_dictionary),
        ("word_in_scifi_dictionary", scifi_dictionary),
    )

    feats = {}
    for token in tokens:
        for feature_name, lexicon in lexicons:
            if token in lexicon:
                feats[feature_name] = 1
    return feats

In [None]:
# Evaluate the dictionary feature on its own.
features = [dictionary_feature]
clf, vocab = pipeline(trainX, devX, trainY, devY, feature_functions=features)

Is this accuracy good or bad?  We need to contextualize this performance against some baseline. A simple one to use is a *majority class* classifier: for every document in the test data, let's just predict whatever class appears the most frequently in the training data.

In [None]:
def majority_class(trainY, devY):
    """Print the accuracy of always predicting the most frequent training label.

    The most common label in trainY is predicted for every item of devY.  The
    printed line is "<label><TAB><accuracy>", with accuracy shown to three
    decimal places.  Nothing is returned.
    """
    label_counts = Counter(label for label in trainY)
    most_frequent = label_counts.most_common(1)[0][0]

    n_correct = sum(1 for label in devY if label == most_frequent)

    print("%s\t%.3f" % (most_frequent, float(n_correct) / len(devY)))

In [None]:
# Baseline: accuracy of always predicting the most frequent training label.
majority_class(trainY=trainY, devY=devY)

In [None]:
def unigram_feature(tokens):
    """Binary bag-of-words features: one "UNIGRAM_<word>" feature per distinct token.

    Returns a dict mapping "UNIGRAM_<word>" to 1 for every word that occurs in
    tokens; repeated words collapse into a single feature.
    """
    return {"UNIGRAM_%s" % word: 1 for word in tokens}

In [None]:
# Evaluate the unigram (bag-of-words) features on their own.
features = [unigram_feature]
clf, vocab = pipeline(trainX, devX, trainY, devY, feature_functions=features)

Let's print out the top 10 features with the strongest weights for each class

In [None]:
def print_weights(clf, vocab, n=10):
    """Print the n lowest-weighted and the n highest-weighted features.

    clf is a fitted model whose first row of coef_ holds one weight per
    feature column, and vocab maps feature names to column ids.  The n
    smallest weights are printed in ascending order, then a blank line, then
    the n largest weights in descending order, each as "<weight><TAB><feature>".
    """
    weights = clf.coef_[0]

    # invert the vocabulary: column id -> feature name
    feature_names = [None] * len(weights)
    for name, column in vocab.items():
        feature_names[column] = name

    # rank once, ascending by weight, and read both ends of the ranking
    ranked = sorted(zip(feature_names, weights), key=operator.itemgetter(1))

    for name, weight in ranked[:n]:
        print("%.3f\t%s" % (weight, name))

    print()

    for name, weight in ranked[::-1][:n]:
        print("%.3f\t%s" % (weight, name))

In [None]:
# Inspect the 10 most negative and 10 most positive feature weights.
print_weights(clf=clf, vocab=vocab, n=10)

In [None]:
def your_awesome_feature(tokens):
    """Placeholder for a custom feature function.

    Takes a document's token list and should return a dict mapping feature
    names to values; as written it returns an empty dict (no features).
    """
    # TODO: populate the returned dict with features computed from tokens
    return {}

In [None]:
# Evaluate the custom feature function on the dev data.
features = [your_awesome_feature]
pipeline(trainX, devX, trainY, devY, feature_functions=features)