# Part 1: The Classifier

This part builds the classifier we'll use to filter/flag our villain data

In [1]:
from textblob.classifiers import NaiveBayesClassifier
import numpy
import json
import nltk

def load_file(filename,t):
    f = open(filename,'rb')
    lines = f.read().splitlines()
    target = []
    if t == "int":
        for x in lines:
            target.append(int(x))
    else:
        if t == "json":
            for x in lines:
                y = json.loads(x)
                target.append(y)
        else:
            if t == "arr" or t == "arr2":
                for arr in lines:
                    #print (arr) 
                    row = arr.split(", ")
                    if t == "arr2":
                        newr = []
                        for c in row[0:3]:
                            c = strip_punc(c,all=True)
                            newr.append(c)
                        newr.append(row[3])
                        row = newr
                    target.append(row)
            else:
                for x in lines:
                    target.append(x)
    return target
collections = load_file("collections",'json')
originals = load_file("originals",'json')

In [2]:
def test_train_data(source,label):
    train=[]
    t1=[]
    t2=[]
    i=0
    data_size = len(source)
    numpy.random.seed(3)
    random_arr = numpy.random.choice(range(0,data_size),data_size)
    train_size = round(data_size*.64,1)
    devt_size = round(data_size*.16,1)
    target = []
    j = 0
    for i in random_arr:
        if j<train_size:
            target=train
        else:
            if j<train_size+devt_size:
                target=t1
            else:
                target=t2
        pair = (source[i],label)
        target.append(pair)
        j=j+1
    return (train,t1,t2)

c_tt = test_train_data(collections,"col")
o_tt = test_train_data(originals,"orig")

train = c_tt[0]+o_tt[0]

test1 = c_tt[1]+o_tt[1]
test2 = c_tt[2]+o_tt[2]

In [3]:
from HTMLParser import HTMLParser
import string
from textblob.utils import strip_punc

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def corpus_from_text(data,ls,key):
    #print(data)
    if data[key] is not None:        
        desc = strip_tags(data[key]).lower()
        desc = desc.split()
        for word in desc:
            word = strip_non_ascii(word)
            word = strip_punc(word,all=True)
            #print(word)
            if word not in ls:
                ls.append(word)
    return ls


In [4]:
def strip_non_ascii(string):
    stripped = (c for c in string if (0 < ord(c) < 127))
    return ''.join(stripped)

def first_words(text,num):
    ls = ""
    j = 0
    if text is not None:
        desc = strip_tags(text).lower()
        desc = desc.split()
        for word in desc:
            word = strip_non_ascii(word)
            word = strip_punc(word,all=True)
            #print(word)
            ls = ls + word
            j = j+1
            if j == num:
                return ls
        return ls
    return ls

In [5]:
manual_desc_corpus = ['reprints','collecting','hardcover','paperback','after','trade','as'
                     'into','vol']
manual_title_corpus = ['death','without','knight','four','line','by','collection','omnibus']
desc_corpus = manual_desc_corpus
title_corpus = manual_title_corpus
                
def basic_word_extractor(data,key):
    corpus = desc_corpus
    if key == 'name':
        corpus = title_corpus
    data_words = []
    data_words = corpus_from_text(data,data_words,key)
    features = dict((((key+'_contains({0})').format(word), (word in data_words))
                                            for word in corpus))
    return features

def volume_features(vol_data):
    #words_in_title = words(vol_data['name'])
    #words_in_desc = words(vol_data['name'])
    features= {'issue_count': vol_data['count_of_issues'],
            'start_year': int(vol_data['start_year']),
            'early_start': int(vol_data['start_year'])<=1990,
            'first_word': first_words(vol_data['description'],1),
            'first_two': first_words(vol_data['description'],2)
            #'name_contains(by)': 'by' in vol_data['name'].lower(),
            #'desc_contains(collected)': 'collected' in unicode(vol_data['description']).lower()
           }
    features.update(basic_word_extractor(vol_data,"name"))
    features.update(basic_word_extractor(vol_data,"description"))
    return features

In [6]:
train_feats = [(volume_features(n), category) for (n, category) in train]
devt_feats = [(volume_features(n), category) for (n, category) in test1]
test_feats = [(volume_features(n), category) for (n, category) in test2]
train2_feats = [(volume_features(n), category) for (n, category) in train+test1]
train_set = train2_feats
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
accuracy1 = nltk.classify.accuracy(classifier, test_feats)
print(accuracy1)
classifier.show_most_informative_features(10)

0.947368421053
Most Informative Features
description_contains(reprints) = True              col : orig   =     50.6 : 1.0
description_contains(paperback) = True              col : orig   =     26.3 : 1.0
description_contains(trade) = True              col : orig   =     16.6 : 1.0
description_contains(collecting) = True              col : orig   =     13.1 : 1.0
             early_start = True             orig : col    =     11.1 : 1.0
description_contains(hardcover) = True              col : orig   =     10.1 : 1.0
               first_two = u'seriesof'       col : orig   =      8.6 : 1.0
              start_year = 2003              col : orig   =      8.3 : 1.0
description_contains(vol) = True             orig : col    =      7.4 : 1.0
              first_word = u'hardcover'      col : orig   =      7.0 : 1.0


In [8]:
def prob_of_label(feats,cl):
    arr = {}
    dist = cl.prob_classify(feats)
    for label in dist.samples():
        row = "%s: %f" % (label, dist.prob(label))
        #print(row)
        arr[label]=dist.prob(label)
    return arr

# EX: prob_of_label(volume_features(collections[100]),classifier)['col']

In [9]:
def trunc_desc(vol):
    if vol["description"] is not None:
        return vol["description"][0:30]
    else:
        return ""

errors = []
for (vol, tag) in test2:
    #print(vol['description'])
    if vol["name"] is not None:
        guess = classifier.classify(volume_features(vol))
        dist = prob_of_label(volume_features(vol),classifier)
        if guess != tag:
            errors.append( (tag, guess, vol["name"], json.dumps(vol['site_detail_url']), dist) )
        
for item in errors:
    print(item)

('col', 'orig', u'Iron Man Magazine', '"http://comicvine.gamespot.com/iron-man-magazine/4050-32600/"', {'col': 0.3324122319953169, 'orig': 0.6675877680046839})
('col', 'orig', u'Marvel Masterworks: Inhumans', '"http://comicvine.gamespot.com/marvel-masterworks-inhumans/4050-34836/"', {'col': 0.015653535850907296, 'orig': 0.9843464641490931})
('col', 'orig', u'Civil War: Amazing Spider-Man Decisions', '"http://comicvine.gamespot.com/civil-war-amazing-spider-man-decisions/4050-24551/"', {'col': 0.020124507434789444, 'orig': 0.9798754925652108})
('orig', 'col', u'New Thunderbolts', '"http://comicvine.gamespot.com/new-thunderbolts/4050-11298/"', {'col': 0.586006236492409, 'orig': 0.41399376350759226})


# Part 2: The Reckoning

This is where we'll put the code to:

* Fetch issues from each villain
* Check whether valid: 
    * Check i_white/i_black, v_white/v_black
    * Check if in spiderman_ids and vol[id]==31
    * Flag the probability it's a Trade Paperback -- make a hash table
* Create list of the json objects for each villain's issues, periodically export to file
* Create table from json array, limiting to the fields we want

In [10]:
import urllib2
true_v = load_file("major_villains2.txt","arr2")
print true_v[0]

['Mysterio', '4333', '466', "'http://comicvine.gamespot.com/mysterio/4005-4333/'"]


In [20]:
import time
spider_ids = load_file("spider_issue_ids.txt","int")
spiderman = 1443
random_spiders = numpy.random.choice(spider_ids,2000)
vi_col = 1
vn_col = 0

i_black = load_file("i_black790",'int')
i_white = load_file("i_white790",'int')
v_black = load_file("v_black790",'int')
v_white = load_file("v_white790",'int')

resource_hash = {"volume":"4050","issue":"4000","character":"4005"}
url_base = "http://comicvine.gamespot.com/api/"
key = "aff8790cd32512f45b429bb78cc21a7a87cf4d48"

actual_v_list = []
v_issues_list = []

def construct_url(i,resource):
    new_base = url_base+resource+"/"+resource_hash[resource]+"-"+str(i)+"/"
    target = new_base + "?api_key="+key+"&format=json"
    return target

def url_to_data( url ):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    res = urllib2.urlopen(req)
    data = json.load(res)['results']
    return data

def look_up(i,resource):
    return url_to_data(construct_url(i,resource))

def marvel(volume):
    vi = volume['id']
    if volume['publisher'] is not None:
        if volume['publisher']['id'] == 31:
            marvel_v.append(vi)
            return True
        else:
            not_marvel.append(vi)
            return False
    else:
        not_marvel.append(vi)
        return False    

def issue_list_from_vi(vi):
    issues = look_up(vi, "character")["issue_credits"]
    return issues

##
def construct_issue_hash(vls):
    v_issues_hash = {}
    i = 0
    for v in vls:
        vi = v[vi_col]
        vname = v[vn_col]
        #print("adding "+vname+" as the "+str(i)+" villain")
        vissues = issue_list_from_vi(vi) # Why am I using name instead of id?
        v_issues_hash[vname] = vissues
        i = i+1
    return v_issues_hash

def p_original(ii,vol,table):
    #Checking the white/blacklists should return over %100/0 probability -- a probability of 2 or -1 or something
    if (ii in i_black):
        print "Black i"
        return -1
    if (ii in i_white):
        print "White i"
        return 2
    vi = vol["id"]
    if (vi in i_black):
        print "Black v"
        return -1
    if (vi in i_white):
        print "White v"
        return 2
    #Return the probability 0<p<1 this is an original
    return p_vol(vol)
    
def p_vol(vol): 
    feats = volume_features(vol)
    p = prob_of_label(feats,classifier)
    return p["orig"]

def write_file(ls,filename):
    f = open(filename, 'w')
    for item in ls:
        f.write("%s\n" % item)

def write_json_file(ls,filename):
    f = open(filename,'w')
    if type(ls) == dict:
        print "dumping dict!"+str(len(ls))
        d = json.dumps(ls)
        print len(d)
        f.write(d)
    else:
        json_string_arr = convert_to_star(ls)
        for item in json_string_arr:
            f.write("%s\n" % item)        
        
def convert_to_star(ls):
    arr = []
    for item in ls:
        json_string = json.dumps(item)
        arr.append(json_string)
    return arr
        
def archive(ls,filen,num):
    new_file = filen+str(num)
    write_json_file(ls,new_file)

In [12]:
marvel_v = v_white
not_marvel = []

In [22]:
## I think I should to make a list of issue_data for each villain in this, since I'm pulling the data here     
def process_issues(villain_iss_id_dict):
    p_table = dict(load_file("p_table1500","json")[0]) #replace with whatever the latest p_table archive is
    len(p_table)
    name_to_issue_data = {}
    for vill in villain_iss_id_dict:
        i = 0
        #vill = "Hobgoblin Kingsley" # Trying to see if my blacklist/whitelist does anything
        print "Processing "+vill+" "+str(i)
        #if i>0: #Limits to the first villain, TAKE OUT when you know it works
            #return {"prob": p_table, "name_to_issue_data": name_to_issue_data}
        issues = villain_iss_id_dict[vill]
        issue_data_ls = []
        for issue in issues: ##TAKE OUT the 0:10
            ## ARCHIVING p_table for ever 50 additions to p_table, issue_data for every 100 issues            
            issue_data = look_up(issue["id"],"issue")
            vi = issue_data["volume"]["id"]
            if vi in marvel_v:
                issue_data_ls.append(issue_data) ##I am now also worried I am doing this wronng...oh I totally am, move it outsifde the if stat
                if str(vi) not in p_table:
                    time.sleep(1)
                    vol_data = look_up(vi,"volume")
                    prob = p_original(issue,vol_data,p_table)
#                    print "Adding to p_table"
                    p_table[vi] = prob ## add to p_table
            else:
                if vi not in not_marvel:
                    time.sleep(1)
                    vol_data = look_up(vi, "volume")
                    if marvel(vol_data):
                        issue_data_ls.append(issue_data)
                        if str(vi) not in p_table:
                            prob = p_original(issue,vol_data,p_table)
#                            print "Adding to p_table"
                            p_table[vi] = prob
#                    else:
#                        print str(vi)+" evaluated, and not marvel!"
#                else:
#                    print str(vi)+" in the not-marvel list!"
                
            i = i+1
            time.sleep(1)
            if (i%100 == 0) or (i == len(issues)-1):
                print vill+str(i)
                archive(issue_data_ls,vill,i)
            if ((len(p_table)%50 == 0) & (len(p_table)!=1202)) or (i == len(issues)-1):
                print "ptable"+str(len(p_table))
                archive(p_table,"p_table",len(p_table))
        name_to_issue_data[vill] = issue_data_ls
        holder_dict = dict(name_to_issue_data)
        archive(holder_dict,"holder_dict",0)
        archive(marvel_v,"marvel_v",0)
        archive(not_marvel,"not_marvel",0)
        
    return {"prob": p_table, "name_to_issue_data": name_to_issue_data}

In [14]:
name_to_issue_ids = construct_issue_hash(true_v)

In [16]:
test_issue_dict = dict(name_to_issue_ids)

a = load_file("holder_dict0","json")
b = load_file("holder_dict1","json")
c = load_file("holder_dict2","json")

a[0].update(b[0])
a[0].update(c[0])

to_pop = []

for i in a[0].keys():
    to_pop.append(str(i))

done = []
    
for villain in to_pop:
    if villain in test_issue_dict.keys():
        test_issue_dict.pop(villain)

print len(to_pop)


24


In [18]:
already_done = ["Kraven the Hunter", "Scorpion", "Mysterio", "Vulture", "Vulture Drago", "Hobgoblin Kingsley",
             "Hobgoblin Macendale", "Hobgoblin 2211", "Francine Frye", "Shocker"]

for v in already_done:
    test_issue_dict.pop(v)
print len(test_issue_dict)

1


In [23]:
#test_issue_dict = dict(name_to_issue_ids)
#test_issue_dict.pop("Kraven the Hunter")
#test_issue_dict.pop("Scorpion")
#test_issue_dict.pop("Mysterio")
#test_issue_dict.pop("Vulture")
#test_issue_dict.pop("Vulture Drago")
#test_issue_dict.pop("Hobgoblin Kingsley")
#test_issue_dict.pop("Hobgoblin Macendale")
#test_issue_dict.pop("Hobgoblin 2211")
#test_issue_dict.pop("Francine Frye")
#test_issue_dict.pop("Shocker")

processed = process_issues(test_issue_dict)


Processing Kingpin 0
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
ptable1500
dumping dict!1500
45117
Kingpin100
Kingpin200
Kingpin300
Kingpin400
ptable1550
dumping dict!1550
46656
ptable1550
dumping dict!1550
46656
ptable1550
dumping dict!1550
46656
ptable1550
dumping dict!1550
46656
ptable1550
dumping dict!1550
46656
ptable1550
dumping dict!1550
46656
ptable1550
dumping dict!1550
46656
ptable1550
dumping dict!1550
46656
ptable1550
dumping dict!1550
46656
Kingpin500
Kingpin600
Kingpin700
Kingpin800
ptable1600
dumping dict!1600
48163
ptable1600
dumping dict!1600
48163
ptable1600
dumping dict!1600
48163
Kingpin900
Kingpin1000
Kingpin1100
Kingpin1200
Kingpin1300
Kingpin1323
ptable1630
dump

In [41]:
pre_processed_villains = process_pre_processed(already_done)
print pre_processed_villains.keys()


def process_pre_processed(ls):
    pre_dict = {}
    for villain in ls:
        issue_data = process_pre_str(villain)
        issue_data.append(process_last_issue(villain))
        pre_dict[villain] = issue_data
        

def process_last_issue(v):
    last_row = get_issue_count(v)-1
    issue = name_to_issue_ids[v][last_row]
    ii = issue["id"]
    i_data = look_up(ii,"issue")
    vi = issue["volume"]["id"]
    if vi in marvel_v:
        if str(vi) not in p_table:
            time.sleep(1)
            vol_data = look_up(vi,"volume")
            prob = p_original(issue,vol_data,p_table)
            p_table[vi] = prob
        return i_data
    else:
        if vi not in not_marvel:
            time.sleep(1)
            vol_data = look_up(vi, "volume")
            if marvel(vol_data):
                if str(vi) not in p_table:
                    prob = p_original(issue,vol_data,p_table)
                    p_table[vi] = prob
                return i_data
    
    
def process_pre_str(v):
    issues = load_pre_villain(v)
    issue_data = []
    for issue in issues:
        issue_data.append(issue)
    return issue_data

def get_issue_count(v):
    return len(name_to_issue_ids[v])
    
def load_pre_villain(v):
    i = get_issue_count(v)
    filename = v+str(i-1)
    return load_file(filename,"json")

Processing test 0
22636 in the not-marvel list!
22636 in the not-marvel list!
43265 in the not-marvel list!
43265 in the not-marvel list!
56166 in the not-marvel list!
test11
ptable750
dumping dict!750
22612
dumping dict!1
162839
6
12
7
Original data...
Processed data...
Loaded file...
set([295633, 295634, 381597, 420974, 420975])
35
25


In [None]:
a[0].update(pre_processed_villains)
aggregate_iv = a[0]
vi_to_prob = dict(load_file("ptable#","json")[0])

nrows=0
for villain in aggregate_iv:
    nrows = nrows + len(aggreate_iv[villain])
    
print nrows

probs = []
for vol in vi_to_prob:
    probs = vi_to_prob[vol]
    
print len(probs)
print numpy.mean(probs)
numpy.histogram(probs)

In [None]:
csv_arr = build_csv(aggregate_issuesandvillains)



def build_csv(iv_dict):
    csv_arr = [["villain", "prob_orig", "issue_date", "issue_id", "volume_title", "volume_id", "volume_url"]]
    for vill in iv_dict:
        issues = iv_dict[vill]
        for issue in issues:
            row = construct_row(vill,issue)
            csv_arr.append(row)
    return csv_arr
            
def construct_row(v_name,i_data):
    villain = v_name
    issue_id = i_data["id"]
    volume_id = i_data["volume"]["id"]    
    prob_orig = get_prob(vol_id)
    issue_date = i_data["cover_date"]
    volume_title = i_data["volume"]["name"]
    volume_url = i_data["volume"]["site_detail_url"]
    row = [villain, prob_orig, issue_date, issue_id, volume_title, volume_id, volume_url]    
    
def get_prob(vi):
    if vi in vi_to_prob:
        return vi_to_prob[str(volume_id)]
    else:
        return -3
    

In [None]:
f = load_file("test2","json")
g = name_to_issue_ids["Hobgoblin 2211"]

hoohaw = process_issues({"test":g})
print len(f)
print len(g)
print len(hoohaw["name_to_issue_data"]["test"])

original = set()
print "Original data..."
for i in g:
    original.add(i["id"])

processed = set()
print "Processed data..."
for i in hoohaw["name_to_issue_data"]["test"]:
    processed.add(i["id"])

loaded = set()
print "Loaded file..."
for i in f:
    loaded.add(i["id"])
    
diff2 = original-processed    

print diff2
#print name_to_issue_ids["Hobgoblin 2211"]
print len(name_to_issue_ids)
print len(test_issue_dict)
        
#ptable_and_vissues = process_issues(test_issue_dict)

In [65]:
p = load_file("p_table622","json")[0]
print(len(p))

622


In [130]:
test_issue = name_to_issue_ids["Carnage"][10]
test_issue_data = look_up(test_issue["id"],"issue")
test_vi = test_issue_data["volume"]["id"]
test_volume = look_up(test_vi,"volume")

p = p_original(test_issue["id"],test_volume,table)
if test_vi not in table:
    table[test_vi] = p

print test_volume["id"]
print test_volume["description"]
print(p)

92833
<p> Spanish trade collection of:</p><ul><li>Deadpool #1-4</li><li>Deadpool Vol. 2</li><li><a href="http://comicvine.gamespot.com/deadpool-kills-the-marvel-universe/4050-50940/" data-ref-id="4050-50940" rel="nofollow">Deadpool Kills the Marvel Universe</a></li><li><a href="http://comicvine.gamespot.com/deadpool-killustrated/4050-56120/" data-ref-id="4050-56120" rel="nofollow">Deadpool Killustrated</a></li><li><a href="http://comicvine.gamespot.com/deadpool-kills-deadpool/4050-64684/" data-ref-id="4050-64684" rel="nofollow">Deadpool Kills Deadpool</a></li><li><a href="http://comicvine.gamespot.com/deadpool-vs-carnage/4050-72790/" data-ref-id="4050-72790" rel="nofollow">Deadpool Vs. Carnage</a></li></ul><p>Published by the Spanish wing of Panini Comics.</p>
0.44538006973


In [240]:
#God damn it I think the blacklists aren't working
tt=[]

for i in i_black[0:5]:
    vi = look_up(i, "issue")["volume"]["id"]
    vol_data = look_up(vi, "volume")
    print p_original(i,vol_data,tt)



Black i
-1
Black i
-1
Black i
-1
Black i
-1
Black i
-1


In [127]:
print table

{92864: 0.013352561303844828, 92833: 0.4453800697303239, 85938: 0.9995698434914074, 95207: 5.032671507280794e-09}
