# Part 1: The Classifier

This part builds the classifier we'll use to filter/flag our villain data

In [1]:
from textblob.classifiers import NaiveBayesClassifier
import numpy
import json
import nltk

def load_file(filename,t):
    f = open(filename,'rb')
    lines = f.read().splitlines()
    target = []
    if t == "int":
        for x in lines:
            target.append(int(x))
    else:
        if t == "json":
            for x in lines:
                y = json.loads(x)
                target.append(y)
        else:
            if t == "arr":
                for arr in lines:
                    #print (arr)
                    row = arr.split(", ")
                    target.append(row)
            else:
                for x in lines:
                    target.append(x)
    return target

i_black = load_file("i_black",'int')
i_white = load_file("i_white",'int')
v_black = load_file("v_black",'int')
v_white = load_file("v_white",'int')
collections = load_file("collections",'json')
originals = load_file("originals",'json')

In [2]:
def test_train_data(source,label):
    train=[]
    t1=[]
    t2=[]
    i=0
    data_size = len(source)
    numpy.random.seed(3)
    random_arr = numpy.random.choice(range(0,data_size),data_size)
    train_size = round(data_size*.64,1)
    devt_size = round(data_size*.16,1)
    target = []
    j = 0
    for i in random_arr:
        if j<train_size:
            target=train
        else:
            if j<train_size+devt_size:
                target=t1
            else:
                target=t2
        pair = (source[i],label)
        target.append(pair)
        j=j+1
    return (train,t1,t2)

c_tt = test_train_data(collections,"col")
o_tt = test_train_data(originals,"orig")

train = c_tt[0]+o_tt[0]

test1 = c_tt[1]+o_tt[1]
test2 = c_tt[2]+o_tt[2]

In [3]:
from HTMLParser import HTMLParser
import string
from textblob.utils import strip_punc

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def corpus_from_text(data,ls,key):
    #print(data)
    if data[key] is not None:        
        desc = strip_tags(data[key]).lower()
        desc = desc.split()
        for word in desc:
            word = strip_non_ascii(word)
            word = strip_punc(word,all=True)
            #print(word)
            if word not in ls:
                ls.append(word)
    return ls


In [4]:
def strip_non_ascii(string):
    stripped = (c for c in string if (0 < ord(c) < 127))
    return ''.join(stripped)

def first_words(text,num):
    ls = ""
    j = 0
    if text is not None:
        desc = strip_tags(text).lower()
        desc = desc.split()
        for word in desc:
            word = strip_non_ascii(word)
            word = strip_punc(word,all=True)
            #print(word)
            ls = ls + word
            j = j+1
            if j == num:
                return ls
        return ls
    return ls

In [5]:
manual_desc_corpus = ['reprints','collecting','hardcover','paperback','after','trade','as'
                     'into','vol']
manual_title_corpus = ['death','without','knight','four','line','by','collection','omnibus']
desc_corpus = manual_desc_corpus
title_corpus = manual_title_corpus
                
def basic_word_extractor(data,key):
    corpus = desc_corpus
    if key == 'name':
        corpus = title_corpus
    data_words = []
    data_words = corpus_from_text(data,data_words,key)
    features = dict((((key+'_contains({0})').format(word), (word in data_words))
                                            for word in corpus))
    return features

def volume_features(vol_data):
    #words_in_title = words(vol_data['name'])
    #words_in_desc = words(vol_data['name'])
    features= {'issue_count': vol_data['count_of_issues'],
            'start_year': int(vol_data['start_year']),
            'early_start': int(vol_data['start_year'])<=1990,
            'first_word': first_words(vol_data['description'],1),
            'first_two': first_words(vol_data['description'],2)
            #'name_contains(by)': 'by' in vol_data['name'].lower(),
            #'desc_contains(collected)': 'collected' in unicode(vol_data['description']).lower()
           }
    features.update(basic_word_extractor(vol_data,"name"))
    features.update(basic_word_extractor(vol_data,"description"))
    return features

In [6]:
train_feats = [(volume_features(n), category) for (n, category) in train]
devt_feats = [(volume_features(n), category) for (n, category) in test1]
test_feats = [(volume_features(n), category) for (n, category) in test2]
train2_feats = [(volume_features(n), category) for (n, category) in train+test1]
train_set = train2_feats
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
accuracy1 = nltk.classify.accuracy(classifier, test_feats)
print(accuracy1)
classifier.show_most_informative_features(10)

0.947368421053
Most Informative Features
description_contains(reprints) = True              col : orig   =     50.6 : 1.0
description_contains(paperback) = True              col : orig   =     26.3 : 1.0
description_contains(trade) = True              col : orig   =     16.6 : 1.0
description_contains(collecting) = True              col : orig   =     13.1 : 1.0
             early_start = True             orig : col    =     11.1 : 1.0
description_contains(hardcover) = True              col : orig   =     10.1 : 1.0
               first_two = u'seriesof'       col : orig   =      8.6 : 1.0
              start_year = 2003              col : orig   =      8.3 : 1.0
description_contains(vol) = True             orig : col    =      7.4 : 1.0
              first_word = u'hardcover'      col : orig   =      7.0 : 1.0


In [8]:
def prob_of_label(feats,cl):
    arr = {}
    dist = cl.prob_classify(feats)
    for label in dist.samples():
        row = "%s: %f" % (label, dist.prob(label))
        #print(row)
        arr[label]=dist.prob(label)
    return arr

# EX: prob_of_label(volume_features(collections[100]),classifier)['col']

In [9]:
def trunc_desc(vol):
    if vol["description"] is not None:
        return vol["description"][0:30]
    else:
        return ""

errors = []
for (vol, tag) in test2:
    #print(vol['description'])
    if vol["name"] is not None:
        guess = classifier.classify(volume_features(vol))
        dist = prob_of_label(volume_features(vol),classifier)
        if guess != tag:
            errors.append( (tag, guess, vol["name"], json.dumps(vol['site_detail_url']), dist) )
        
for item in errors:
    print(item)

('col', 'orig', u'Iron Man Magazine', '"http://comicvine.gamespot.com/iron-man-magazine/4050-32600/"', {'col': 0.3324122319953169, 'orig': 0.6675877680046839})
('col', 'orig', u'Marvel Masterworks: Inhumans', '"http://comicvine.gamespot.com/marvel-masterworks-inhumans/4050-34836/"', {'col': 0.015653535850907296, 'orig': 0.9843464641490931})
('col', 'orig', u'Civil War: Amazing Spider-Man Decisions', '"http://comicvine.gamespot.com/civil-war-amazing-spider-man-decisions/4050-24551/"', {'col': 0.020124507434789444, 'orig': 0.9798754925652108})
('orig', 'col', u'New Thunderbolts', '"http://comicvine.gamespot.com/new-thunderbolts/4050-11298/"', {'col': 0.586006236492409, 'orig': 0.41399376350759226})


# Part 2: The Reckoning

This is where we'll put the code to:

* Fetch issues from each villain
* Check whether valid: 
    * Check i_white/i_black, v_white/v_black
    * Check if in spiderman_ids and vol[id]==31
    * Flag the probability it's a Trade Paperback -- make a hash table
* Create list of the json objects for each villain's issue, periodically export to file
* Create table from json array, limiting to the fields we want

In [2]:
v_candidates = load_file("villains.txt","arr")
print v_candidates[0:5]

[['name', 'id', 'count_of_issue_appearances', 'site_url'], ['Mysterio', '4333', '466', 'http://comicvine.gamespot.com/mysterio/4005-4333/'], ['Mysterio (Berkhart)', '84871', '43', 'http://comicvine.gamespot.com/mysterio-berkhart/4005-84871/'], ['Francis Klum', '84872', '21', 'http://comicvine.gamespot.com/francis-klum/4005-84872/'], ['Hobgoblin (Kingsley)', '7605', '317', 'http://comicvine.gamespot.com/hobgoblin-kingsley/4005-7605/']]


In [15]:
spider_ids = load_file("spider_issue_ids.txt","int")
spiderman = 1443
random_spiders = numpy.random.choice(spider_ids,2000)

i_black = load_file("i_black",'int')
i_white = load_file("i_white",'int')
v_black = load_file("v_black",'int')
v_white = load_file("v_white",'int')

resource_hash = {"volume":"4050","issue":"4000","character":"4005"}
url_base = "http://comicvine.gamespot.com/api/"
key = "aff8790cd32512f45b429bb78cc21a7a87cf4d48"

actual_v_list = []
v_issues_list = []

def construct_url(i,resource):
    new_base = url_base+resource+"/"+resource_hash[resource]+"-"+str(i)+"/"
    target = new_base + "?api_key="+key+"&format=json"
    return target

def url_to_data( url ):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    res = urllib2.urlopen(req)
    data = json.load(res)['results']
    return data

def look_up(i,resource):
    return url_to_data(construct_url(i,resource))

def is_enemy(char_data):
    enemies = char_data['character_enemies']
    if enemies is not None:
        for e in enemies:
            if e['id'] == spiderman:
                return True
        return False
    return False            

def marvel(vol_id):
    data = look_up(vol_id,"volume") #API Call
    if data['publisher']['id'] == 31:
        return True

def process_issues(ids):
    data = []
    for i in ids:
        if i not in i_black:
            issue_data = look_up(i,"issue") #API Call
            if i in i_white:
                data.append((issue_data,2))
            else:
                v = issue_data['volume']
                if v['id'] not in v_black:
                    if v['id'] in i_white:
                        data.append((issue_data,2))
                    else:
                        if marvel(v['id']):
                            p_dist = prob_of_label(volume_features(v),classifier)
                            p_col = p_dist['col']
                            data.append((issue_data,p_col))
    return data                

def construct_villain_issue_list():
    j = 0
    for row in v_candidates:
        if j != 0:
            vi = row[1]
            data = look_up(vi,"character")
            if is_enemy(data):
                actual_v_list.append(vi)
                issue_ids = []
                for issue in data['issue_credits']:
                    issue_ids.append(issue['id'])
                v_issues_list = v_issues_list+process_issues(issue_ids)
                #archive("vill_json", v_issues_list)
        j = j+1

In [16]:
villain_issue_data = construct_villain_issue_list()

http://comicvine.gamespot.com/api/volume/4050-77941/?api_key=aff8790cd32512f45b429bb78cc21a7a87cf4d48&format=json
