In [1]:

import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log

In [2]:
import json

train_file = 'train_5500.label.txt'
train_list = []

with open(train_file, encoding='utf8', errors='ignore') as f:
    for line in f:
        tokens = line.strip().split()
        separator=" "
        text=separator.join(tokens[1:])
        train_list.append( {'category':tokens[0].split(':')[0], 'text':text} )

# test_file = 'TREC_10.label.txt'
# test_list = []
# with open(test_file, encoding='utf8', errors='ignore') as f:
#     for line in f:
#         tokens = line.strip().split()
#         separator=" "
#         text=separator.join(tokens[1:])
#         test_list.append( {'category':tokens[0].split(':')[0], 'text':text} )        

# print(test_list)

In [3]:
from nltk import ngrams, word_tokenize, pos_tag

def ngrams_extract(json_list, n, number):
    ngrams_dict = {}
    for i in json_list:
        temp = word_tokenize(i['text'])
        temp = list(ngrams(temp, n))
        print(temp)
        for j in temp:
            if j in ngrams_dict:
                ngrams_dict[j] += 1
            else:
                ngrams_dict[j] = 1
    ngrams_dict = sorted(ngrams_dict.items(), key = lambda x: x[1])
    ngrams_list = []
    length = len(ngrams_dict)
    for i in range(number):
        ngrams_list.append(ngrams_dict[length - i - 1][0])
    return ngrams_list



In [4]:
def tags_set(json_list):
    tag_list = set()
    for i in json_list:
        sentence = i['text']
        temp = word_tokenize(sentence)
        tags = pos_tag(temp)
        for j in tags:
            tag_list.add(j[1])
    tag_list = list(tag_list)
    return tag_list



In [5]:
def features_extract(json_list, unigram, bigram, trigram, tag_list):
    features = []
    for i in json_list:
        headline = i['text']
        headline_unigrams = list(ngrams(word_tokenize(headline), 1))
        headline_bigrams = list(ngrams(word_tokenize(headline), 2))
        headline_trigrams = list(ngrams(word_tokenize(headline), 3))
        headline_postags = pos_tag(word_tokenize(headline))
        headline_postags_list = []
        for tag in headline_postags:
            headline_postags_list.append(tag[1])
        feature = []
        for uni in unigram:
            if uni in headline_unigrams:
                feature.append(1)
            else:
                feature.append(0)
        for bi in bigram:
            if bi in headline_bigrams:
                feature.append(1)
            else:
                feature.append(0)
        for tri in trigram:
            if tri in headline_trigrams:
                feature.append(1)
            else:
                feature.append(0)
        for tag in tag_list:
            if tag in headline_postags_list:
                feature.append(1)
            else:
                feature.append(0)
        features.append(feature)
    return features


In [6]:
def feature_lab(json_list):
    feature_labels = []
    labels = {'NUM':0, 'HUM':1, 'LOC':2, 'ENTY':3, 'DESC':4, 'ABBR':5}
    for i in json_list:
        label = i['category']
        if label not in labels:
            feature_labels.append(0)
        else:
            feature_labels.append(labels[label])
    return feature_labels

In [7]:
unigrams = ngrams_extract(train_list, 1, 500)
type(unigrams)

[('How',), ('did',), ('serfdom',), ('develop',), ('in',), ('and',), ('then',), ('leave',), ('Russia',), ('?',)]
[('What',), ('films',), ('featured',), ('the',), ('character',), ('Popeye',), ('Doyle',), ('?',)]
[('How',), ('can',), ('I',), ('find',), ('a',), ('list',), ('of',), ('celebrities',), ("'",), ('real',), ('names',), ('?',)]
[('What',), ('fowl',), ('grabs',), ('the',), ('spotlight',), ('after',), ('the',), ('Chinese',), ('Year',), ('of',), ('the',), ('Monkey',), ('?',)]
[('What',), ('is',), ('the',), ('full',), ('form',), ('of',), ('.com',), ('?',)]
[('What',), ('contemptible',), ('scoundrel',), ('stole',), ('the',), ('cork',), ('from',), ('my',), ('lunch',), ('?',)]
[('What',), ('team',), ('did',), ('baseball',), ("'s",), ('St.',), ('Louis',), ('Browns',), ('become',), ('?',)]
[('What',), ('is',), ('the',), ('oldest',), ('profession',), ('?',)]
[('What',), ('are',), ('liver',), ('enzymes',), ('?',)]
[('Name',), ('the',), ('scar-faced',), ('bounty',), ('hunter',), ('of',), ('Th

[('What',), ('book',), ('does',), ('Holden',), ('Caulfield',), ('appear',), ('in',), ('?',)]
[('What',), ('happens',), ('when',), ('lightning',), ('strikes',), ('a',), ('body',), ('of',), ('water',), ('?',)]
[('What',), ('was',), ('the',), ('nickname',), ('of',), ('German',), ('flying',), ('ace',), ('Manfred',), ('von',), ('Richthofen',), ('?',)]
[('What',), ('does',), ('e=mc2',), ('mean',), ('?',)]
[('What',), ('country',), ('other',), ('than',), ('Germany',), ('invaded',), ('Poland',), ('in',), ('September',), ('1939',), ('?',)]
[('How',), ('do',), ('I',), ('contact',), ('answers.com',), ('?',)]
[('What',), ('are',), ('the',), ('names',), ('of',), ('the',), ('different',), ('toes',), ('?',)]
[('What',), ('are',), ('different',), ('products',), ('of',), ('petroleum',), ('?',)]
[('What',), ('causes',), ('asthma',), ('?',)]
[('When',), ('was',), ('the',), ('Bill',), ('of',), ('Rights',), ('ratified',), ('?',)]
[('What',), ('do',), ('the',), ('number',), ('1',), (',',), ('2',), (',',), (

[('What',), ('is',), ('a',), ('mathematical',), ('factor',), ('?',)]
[('How',), ('many',), ('four',), ('star',), ('generals',), ('were',), ('there',), ('and',), ('who',), ('are',), ('they',), ('?',)]
[('When',), ('did',), ('Nixon',), ('die',), ('?',)]
[('What',), ('was',), ('originally',), ('defined',), ('as',), ('one',), ('1-millionth',), ('of',), ('the',), ('distance',), ('from',), ('the',), ('equator',), ('to',), ('the',), ('Pole',), ('?',)]
[('What',), ('comedian',), ('hit',), ('the',), ('TV',), ('screen',), ('in',), ('1951',), ('with',), ('the',), ('NBC',), ('afternoon',), ('show',), ('Time',), ('for',), ('Ernie',), ('?',)]
[('How',), ('many',), ('members',), ('are',), ('in',), ('the',), ('California',), ('congressional',), ('delegation',), ('?',)]
[('What',), ('erupts',), ('every',), ('hour',), ('at',), ('Yellowstone',), ('National',), ('Park',), ('?',)]
[('What',), ('country',), ('contains',), ('the',), ('highest',), ('point',), ('in',), ('South',), ('America',), ('?',)]
[('What

[('How',), ('many',), ('elephants',), ('are',), ('left',), ('on',), ('earth',), ('?',)]
[('What',), ('is',), ('a',), ('softball',), ('made',), ('of',), ('?',)]
[('What',), ('``',), ('little',), ('red',), ('car',), ('``',), ('is',), ('mentioned',), ('in',), ('pop',), ('singer',), ('Prince',), ("'s",), ('hit',), ('song',), ('?',)]
[('Who',), ('did',), ('Doris',), ('Day',), ('mean',), ('when',), ('she',), ('said',), (':',), ('``',), ('I',), ('call',), ('him',), ('Ernie',), ('because',), ('he',), ("'s",), ('certainly',), ('no',), ('Rock',), ('``',), ('?',)]
[('What',), ('kind',), ('of',), ('species',), ('is',), ('the',), ('monster',), ('in',), ('the',), ('film',), ('``',), ('Jaws',), ('``',), ('?',)]
[('When',), ('was',), ('the',), ('first',), ('practical',), ('commercial',), ('typewriter',), ('marketed',), ('?',)]
[('What',), ('is',), ('a',), ('drought',), ('?',)]
[('What',), ('are',), ('the',), ('benefits',), ('of',), ('home',), ('school',), ('?',)]
[('What',), ('Scandinavian',), ('count

[('Name',), ('the',), ('cartoon',), ('genie',), ('conjured',), ('by',), ('the',), ('magic',), ('ring',), ('shared',), ('by',), ('Nancy',), ('and',), ('Chuck',), ('.',)]
[('What',), ('foods',), ('contain',), ('vitamin',), ('B12',), ('?',)]
[('What',), ('is',), ('the',), ('amount',), ('of',), ('money',), ('owed',), ('for',), ('illegally',), ('having',), ('a',), ('dog',), ('on',), ('a',), ('beach',), ('?',)]
[('What',), ('are',), ('the',), ('first',), ('and',), ('last',), ('letters',), ('of',), ('the',), ('Greek',), ('alphabet',), ('?',)]
[('What',), ('is',), ('the',), ('web',), ('address',), ('at',), ('which',), ('I',), ('can',), ('find',), ('the',), ('e-mail',), ('address',), ('of',), ('a',), ('member',), ('of',), ('the',), ('US',), ('House',), ('of',), ('Representatives',), ('?',)]
[('How',), ('big',), ('is',), ('the',), ('largest',), ('diamond',), ('?',)]
[('What',), ('are',), ('some',), ('ways',), ('to',), ('help',), ('someone',), ('with',), ('Chicken',), ('Pox',), ('?',)]
[('How',),

list

In [8]:
bigrams = ngrams_extract(train_list, 2, 300)

[('How', 'did'), ('did', 'serfdom'), ('serfdom', 'develop'), ('develop', 'in'), ('in', 'and'), ('and', 'then'), ('then', 'leave'), ('leave', 'Russia'), ('Russia', '?')]
[('What', 'films'), ('films', 'featured'), ('featured', 'the'), ('the', 'character'), ('character', 'Popeye'), ('Popeye', 'Doyle'), ('Doyle', '?')]
[('How', 'can'), ('can', 'I'), ('I', 'find'), ('find', 'a'), ('a', 'list'), ('list', 'of'), ('of', 'celebrities'), ('celebrities', "'"), ("'", 'real'), ('real', 'names'), ('names', '?')]
[('What', 'fowl'), ('fowl', 'grabs'), ('grabs', 'the'), ('the', 'spotlight'), ('spotlight', 'after'), ('after', 'the'), ('the', 'Chinese'), ('Chinese', 'Year'), ('Year', 'of'), ('of', 'the'), ('the', 'Monkey'), ('Monkey', '?')]
[('What', 'is'), ('is', 'the'), ('the', 'full'), ('full', 'form'), ('form', 'of'), ('of', '.com'), ('.com', '?')]
[('What', 'contemptible'), ('contemptible', 'scoundrel'), ('scoundrel', 'stole'), ('stole', 'the'), ('the', 'cork'), ('cork', 'from'), ('from', 'my'), ('m

[('What', 'is'), ('is', 'the'), ('the', 'size'), ('size', 'of'), ('of', 'the'), ('the', 'largest'), ('largest', 'akita'), ('akita', '?')]
[('What', 'animal'), ('animal', 'has'), ('has', 'the'), ('the', 'biggest'), ('biggest', 'eyes'), ('eyes', '?')]
[('What', 'is'), ('is', 'California'), ('California', "'s"), ("'s", 'capital'), ('capital', '?')]
[('Who', 'was'), ('was', 'the'), ('the', 'author'), ('author', 'of'), ('of', 'the'), ('the', 'book'), ('book', 'about'), ('about', 'computer'), ('computer', 'hackers'), ('hackers', 'called'), ('called', '``'), ('``', 'The'), ('The', 'Cuckoo'), ('Cuckoo', "'s"), ("'s", 'Egg'), ('Egg', ':'), (':', 'Tracking'), ('Tracking', 'a'), ('a', 'Spy'), ('Spy', 'Through'), ('Through', 'the'), ('the', 'Maze'), ('Maze', 'of'), ('of', 'Computer'), ('Computer', 'Espionage'), ('Espionage', '``'), ('``', '?')]
[('Who', 'was'), ('was', 'the'), ('the', 'founding'), ('founding', 'member'), ('member', 'of'), ('of', 'the'), ('the', 'Pink'), ('Pink', 'Floyd'), ('Floyd'

[('What', 'letter'), ('letter', 'adorns'), ('adorns', 'the'), ('the', 'flag'), ('flag', 'of'), ('of', 'Rwanda'), ('Rwanda', '?')]
[('What', 'did'), ('did', '8'), ('8', ','), (',', 'CD'), ('CD', 'NNS'), ('NNS', 'VBP'), ('VBP', 'TO'), ('TO', 'VB'), ('VB', 'NNP'), ('NNP', 'POS'), ('POS', 'NN'), ('NN', '.')]
[('What', 'is'), ('is', 'God'), ('God', '?')]
[('What', 'country'), ('country', "'s"), ("'s", 'northernmost'), ('northernmost', 'city'), ('city', 'is'), ('is', 'Darwin'), ('Darwin', '?')]
[('What', 'sound'), ('sound', 'does'), ('does', 'Olympia'), ('Olympia', ','), (',', 'Washington'), ('Washington', ','), (',', 'overlook'), ('overlook', '?')]
[('What', 'TV'), ('TV', 'show'), ('show', 'chronicled'), ('chronicled', 'the'), ('the', 'lives'), ('lives', 'of'), ('of', 'Katy'), ('Katy', 'Holstrum'), ('Holstrum', 'and'), ('and', 'Congressman'), ('Congressman', 'Glen'), ('Glen', 'Morley'), ('Morley', '?')]
[('What', 'Kentucky'), ('Kentucky', 'city'), ('city', 'calls'), ('calls', 'itself'), ('i

[('Which', 'gender'), ('gender', 'has'), ('has', 'bigger'), ('bigger', 'thighs'), ('thighs', '?')]
[('What', 'U.S.'), ('U.S.', 'state'), ('state', 'has'), ('has', 'the'), ('the', 'second-longest'), ('second-longest', 'coastline'), ('coastline', '?')]
[('What', 'state'), ('state', "'s"), ("'s", 'home'), ('home', 'to'), ('to', 'the'), ('the', 'Buffalo'), ('Buffalo', 'Bill'), ('Bill', 'Historical'), ('Historical', 'Center'), ('Center', '?')]
[('How', 'fast'), ('fast', 'do'), ('do', 'cheetahs'), ('cheetahs', 'run'), ('run', '?')]
[('What', 'is'), ('is', 'Ray'), ('Ray', 'Bradbury'), ('Bradbury', "'s"), ("'s", 'illustrated'), ('illustrated', 'man'), ('man', 'illustrated'), ('illustrated', 'with'), ('with', '?')]
[('How', 'fast'), ('fast', 'is'), ('is', 'a'), ('a', '45Mhz'), ('45Mhz', 'processor'), ('processor', '?')]
[('What', 'kind'), ('kind', 'of'), ('of', 'puzzle'), ('puzzle', 'first'), ('first', 'appeared'), ('appeared', 'in'), ('in', 'the'), ('the', 'U.S.'), ('U.S.', 'in'), ('in', 'the'

[('What', 'are'), ('are', 'the'), ('the', 'names'), ('names', 'of'), ('of', 'Jack'), ('Jack', "'s"), ("'s", 'original'), ('original', 'roommates'), ('roommates', 'on'), ('on', 'Three'), ('Three', "'s"), ("'s", 'Company'), ('Company', '?')]
[('What', 'is'), ('is', 'the'), ('the', 'definition'), ('definition', 'of'), ('of', 'hazmat'), ('hazmat', '?')]
[('How', 'many'), ('many', 'URL'), ('URL', 'extensions'), ('extensions', 'are'), ('are', 'there'), ('there', '?'), ('?', 'and'), ('and', 'what'), ('what', 'are'), ('are', 'they'), ('they', '?')]
[('Where', 'did'), ('did', 'the'), ('the', 'Wright'), ('Wright', 'brothers'), ('brothers', 'make'), ('make', 'their'), ('their', 'first'), ('first', 'flight'), ('flight', '?')]
[('Who', 'won'), ('won', 'the'), ('the', 'Battle'), ('Battle', 'of'), ('of', 'Gettysburg'), ('Gettysburg', '?')]
[('What', 'do'), ('do', 'you'), ('you', 'get'), ('get', 'by'), ('by', 'mixing'), ('mixing', 'gin'), ('gin', 'and'), ('and', 'vermouth'), ('vermouth', '?')]
[('Name

[('Where', 'did'), ('did', 'Ty'), ('Ty', 'Cobb'), ('Cobb', 'grow'), ('grow', 'up'), ('up', '?')]
[('What', 'is'), ('is', 'a'), ('a', 'philanthropist'), ('philanthropist', '?')]
[('What', 'is'), ('is', 'a'), ('a', 'portal'), ('portal', '?')]
[('What', 'Tom'), ('Tom', 'Wolfe'), ('Wolfe', 'book'), ('book', 'is'), ('is', 'about'), ('about', 'the'), ('the', 'Mercury'), ('Mercury', 'astronauts'), ('astronauts', '?')]
[('What', 'cooking'), ('cooking', 'oil'), ('oil', 'has'), ('has', '``'), ('``', 'corn'), ('corn', 'goodness'), ('goodness', '``'), ('``', '?')]
[('What', 'is'), ('is', 'the'), ('the', 'per-capita'), ('per-capita', 'income'), ('income', 'of'), ('of', 'Colombia'), ('Colombia', ','), (',', 'South'), ('South', 'America'), ('America', '?')]
[('How', 'many'), ('many', 'types'), ('types', 'of'), ('of', 'cheese'), ('cheese', 'are'), ('are', 'there'), ('there', 'in'), ('in', 'France'), ('France', '?')]
[('How', 'many'), ('many', '``'), ('``', 'eyes'), ('eyes', '``'), ('``', 'does'), ('do

[('What', 'delicacy'), ('delicacy', 'is'), ('is', 'known'), ('known', 'indelicately'), ('indelicately', 'as'), ('as', 'pickled'), ('pickled', 'roe'), ('roe', '?')]
[('What', 'after-dinner'), ('after-dinner', 'treat'), ('treat', 'was'), ('was', 'invented'), ('invented', 'in'), ('in', '1916'), ('1916', 'at'), ('at', 'the'), ('the', 'George'), ('George', 'Jung'), ('Jung', 'noodle'), ('noodle', 'factory'), ('factory', 'in'), ('in', 'Los'), ('Los', 'Angeles'), ('Angeles', '?')]
[('What', 'U.S.'), ('U.S.', 'vice-president'), ('vice-president', 'killed'), ('killed', 'Alexander'), ('Alexander', 'Hamilton'), ('Hamilton', 'in'), ('in', 'a'), ('a', 'duel'), ('duel', '?')]
[('What', "'s"), ("'s", 'the'), ('the', 'Fahrenheit'), ('Fahrenheit', 'equivalent'), ('equivalent', 'of'), ('of', 'zero'), ('zero', 'degrees'), ('degrees', 'centigrade'), ('centigrade', '?')]
[('How', 'many'), ('many', 'muscles'), ('muscles', 'does'), ('does', 'an'), ('an', 'oyster'), ('oyster', 'have'), ('have', '?')]
[('What',

In [9]:
trigrams = ngrams_extract(train_list, 3, 200)

[('How', 'did', 'serfdom'), ('did', 'serfdom', 'develop'), ('serfdom', 'develop', 'in'), ('develop', 'in', 'and'), ('in', 'and', 'then'), ('and', 'then', 'leave'), ('then', 'leave', 'Russia'), ('leave', 'Russia', '?')]
[('What', 'films', 'featured'), ('films', 'featured', 'the'), ('featured', 'the', 'character'), ('the', 'character', 'Popeye'), ('character', 'Popeye', 'Doyle'), ('Popeye', 'Doyle', '?')]
[('How', 'can', 'I'), ('can', 'I', 'find'), ('I', 'find', 'a'), ('find', 'a', 'list'), ('a', 'list', 'of'), ('list', 'of', 'celebrities'), ('of', 'celebrities', "'"), ('celebrities', "'", 'real'), ("'", 'real', 'names'), ('real', 'names', '?')]
[('What', 'fowl', 'grabs'), ('fowl', 'grabs', 'the'), ('grabs', 'the', 'spotlight'), ('the', 'spotlight', 'after'), ('spotlight', 'after', 'the'), ('after', 'the', 'Chinese'), ('the', 'Chinese', 'Year'), ('Chinese', 'Year', 'of'), ('Year', 'of', 'the'), ('of', 'the', 'Monkey'), ('the', 'Monkey', '?')]
[('What', 'is', 'the'), ('is', 'the', 'full')

[('How', 'many', 'Beatles'), ('many', 'Beatles', "'"), ('Beatles', "'", 'records'), ("'", 'records', 'went'), ('records', 'went', '#'), ('went', '#', '1'), ('#', '1', '?')]
[('Who', 'is', 'the'), ('is', 'the', 'only'), ('the', 'only', 'prime'), ('only', 'prime', 'minister'), ('prime', 'minister', 'of'), ('minister', 'of', 'Canada'), ('of', 'Canada', 'to'), ('Canada', 'to', 'serve'), ('to', 'serve', '22'), ('serve', '22', 'years'), ('22', 'years', 'but'), ('years', 'but', 'not'), ('but', 'not', 'necessarily'), ('not', 'necessarily', 'consecutively'), ('necessarily', 'consecutively', '?')]
[('What', 'is', 'the'), ('is', 'the', 'abbreviation'), ('the', 'abbreviation', 'of'), ('abbreviation', 'of', 'the'), ('of', 'the', 'company'), ('the', 'company', 'name'), ('company', 'name', '`'), ('name', '`', 'General'), ('`', 'General', 'Motors'), ('General', 'Motors', "'"), ('Motors', "'", '?')]
[('What', 'mountains', 'lie'), ('mountains', 'lie', 'between'), ('lie', 'between', 'the'), ('between', '

[('What', 'Marx', 'Brothers'), ('Marx', 'Brothers', 'movie'), ('Brothers', 'movie', 'centers'), ('movie', 'centers', 'on'), ('centers', 'on', 'a'), ('on', 'a', 'stolen'), ('a', 'stolen', 'painting'), ('stolen', 'painting', '?')]
[('What', 'was', 'unusual'), ('was', 'unusual', 'about'), ('unusual', 'about', 'Alexandra'), ('about', 'Alexandra', "'s"), ('Alexandra', "'s", 'appearance'), ("'s", 'appearance', 'in'), ('appearance', 'in', 'Josie'), ('in', 'Josie', 'and'), ('Josie', 'and', 'the'), ('and', 'the', 'Pussycats'), ('the', 'Pussycats', '?')]
[('What', 'is', 'the'), ('is', 'the', 'origin'), ('the', 'origin', 'of'), ('origin', 'of', 'the'), ('of', 'the', 'word'), ('the', 'word', '``'), ('word', '``', 'mind'), ('``', 'mind', '``'), ('mind', '``', '?')]
[('When', 'was', 'Dick'), ('was', 'Dick', 'Clark'), ('Dick', 'Clark', 'born'), ('Clark', 'born', '?')]
[('What', 'game', 'is'), ('game', 'is', 'fatal'), ('is', 'fatal', 'to'), ('fatal', 'to', 'anybody'), ('to', 'anybody', 'over'), ('anyb

[('What', 'is', 'the'), ('is', 'the', 'fear'), ('the', 'fear', 'of'), ('fear', 'of', 'frogs'), ('of', 'frogs', '?')]
[('Which', 'city', 'in'), ('city', 'in', 'Canada'), ('in', 'Canada', 'is'), ('Canada', 'is', 'the'), ('is', 'the', 'least-populated'), ('the', 'least-populated', '?')]
[('How', 'old', 'is'), ('old', 'is', 'Stevie'), ('is', 'Stevie', 'Wonder'), ('Stevie', 'Wonder', '?')]
[('Who', 'was', 'the'), ('was', 'the', 'first'), ('the', 'first', 'Taiwanese'), ('first', 'Taiwanese', 'President'), ('Taiwanese', 'President', '?')]
[('Who', 'is', 'Luke'), ('is', 'Luke', 'Skywalker'), ('Luke', 'Skywalker', "'s"), ('Skywalker', "'s", 'father'), ("'s", 'father', '?')]
[('What', 'does', 'a'), ('does', 'a', 'dipsomaniac'), ('a', 'dipsomaniac', 'crave'), ('dipsomaniac', 'crave', '?')]
[('What', 'is', 'the'), ('is', 'the', 'definition'), ('the', 'definition', 'of'), ('definition', 'of', 'the'), ('of', 'the', 'term'), ('the', 'term', '``'), ('term', '``', 'weapons'), ('``', 'weapons', 'system'

[('How', 'many', 'children'), ('many', 'children', 'does'), ('children', 'does', 'Ray'), ('does', 'Ray', 'Davies'), ('Ray', 'Davies', 'of'), ('Davies', 'of', 'the'), ('of', 'the', 'Kinks'), ('the', 'Kinks', 'have'), ('Kinks', 'have', '?')]
[('How', 'many', 'years'), ('many', 'years', 'did'), ('years', 'did', 'Shea'), ('did', 'Shea', '&'), ('Shea', '&', 'Gould'), ('&', 'Gould', 'practice'), ('Gould', 'practice', 'law'), ('practice', 'law', 'in'), ('law', 'in', 'Los'), ('in', 'Los', 'Angeles'), ('Los', 'Angeles', '?')]
[('What', 'did', 'Walter'), ('did', 'Walter', 'Huston'), ('Walter', 'Huston', 'remove'), ('Huston', 'remove', 'to'), ('remove', 'to', 'perform'), ('to', 'perform', 'in'), ('perform', 'in', 'the'), ('in', 'the', 'movie'), ('the', 'movie', 'The'), ('movie', 'The', 'Treasure'), ('The', 'Treasure', 'of'), ('Treasure', 'of', 'the'), ('of', 'the', 'Sierra'), ('the', 'Sierra', 'Madre'), ('Sierra', 'Madre', '?')]
[('How', 'much', 'salt'), ('much', 'salt', 'is'), ('salt', 'is', 'in

[('What', "'s", 'the'), ("'s", 'the', 'abbreviation'), ('the', 'abbreviation', 'for'), ('abbreviation', 'for', 'limited'), ('for', 'limited', 'partnership'), ('limited', 'partnership', '?')]
[('Which', 'continent', 'has'), ('continent', 'has', 'the'), ('has', 'the', 'most'), ('the', 'most', 'roses'), ('most', 'roses', '?')]
[('What', 'Russian', 'novel'), ('Russian', 'novel', 'embracing'), ('novel', 'embracing', 'more'), ('embracing', 'more', 'the'), ('more', 'the', '5'), ('the', '5', 'characters'), ('5', 'characters', 'is'), ('characters', 'is', 'set'), ('is', 'set', 'in'), ('set', 'in', 'the'), ('in', 'the', 'Napoleonic'), ('the', 'Napoleonic', 'Wars'), ('Napoleonic', 'Wars', '?')]
[('How', 'do', 'you'), ('do', 'you', 'write'), ('you', 'write', 'a'), ('write', 'a', 'correct'), ('a', 'correct', 'critical'), ('correct', 'critical', 'analysis'), ('critical', 'analysis', 'of'), ('analysis', 'of', 'a'), ('of', 'a', 'poem'), ('a', 'poem', '?')]
[('How', 'do', 'I'), ('do', 'I', 'check'), ('I

[('What', 'famous', 'meat'), ('famous', 'meat', 'company'), ('meat', 'company', 'went'), ('company', 'went', 'out'), ('went', 'out', 'of'), ('out', 'of', 'business'), ('of', 'business', 'because'), ('business', 'because', 'it'), ('because', 'it', 'became'), ('it', 'became', 'known'), ('became', 'known', 'that'), ('known', 'that', 'the'), ('that', 'the', 'underworld'), ('the', 'underworld', 'had'), ('underworld', 'had', 'been'), ('had', 'been', 'selling'), ('been', 'selling', 'them'), ('selling', 'them', 'kangaroo'), ('them', 'kangaroo', 'meat'), ('kangaroo', 'meat', '?')]
[('How', 'many', 'visitors'), ('many', 'visitors', 'go'), ('visitors', 'go', 'to'), ('go', 'to', 'the'), ('to', 'the', 'Vatican'), ('the', 'Vatican', 'each'), ('Vatican', 'each', 'year'), ('each', 'year', '?')]
[('What', 'is', 'the'), ('is', 'the', 'population'), ('the', 'population', 'of'), ('population', 'of', 'Mexico'), ('of', 'Mexico', '?')]
[('How', 'do', 'you'), ('do', 'you', 'get'), ('you', 'get', 'silly'), ('g

[('How', 'do', 'they'), ('do', 'they', 'produce'), ('they', 'produce', 'vitamins'), ('produce', 'vitamins', '?')]
[('The', 'lawyer', 'who'), ('lawyer', 'who', 'represented'), ('who', 'represented', 'Randy'), ('represented', 'Randy', 'Craft'), ('Randy', 'Craft', ','), ('Craft', ',', 'what'), (',', 'what', 'was'), ('what', 'was', 'his'), ('was', 'his', 'name'), ('his', 'name', '?')]
[('What', 'do', 'penguins'), ('do', 'penguins', 'eat'), ('penguins', 'eat', '?')]
[('What', 'organization', 'is'), ('organization', 'is', 'the'), ('is', 'the', 'Security'), ('the', 'Security', 'Council'), ('Security', 'Council', 'a'), ('Council', 'a', 'part'), ('a', 'part', 'of'), ('part', 'of', '?')]
[('What', 'is', 'New'), ('is', 'New', 'England'), ('New', 'England', "'s"), ('England', "'s", 'highest'), ("'s", 'highest', 'mountain'), ('highest', 'mountain', '?')]
[('What', 'TV', 'station'), ('TV', 'station', 'did'), ('station', 'did', 'Mary'), ('did', 'Mary', 'Richards'), ('Mary', 'Richards', 'work'), ('Ric

[('What', 'are', 'dinosaur'), ('are', 'dinosaur', 'droppings'), ('dinosaur', 'droppings', 'called'), ('droppings', 'called', '?')]
[('What', 'was', 'the'), ('was', 'the', 'infamous'), ('the', 'infamous', 'feat'), ('infamous', 'feat', 'of'), ('feat', 'of', 'Germany'), ('of', 'Germany', "'s"), ('Germany', "'s", 'U-2'), ("'s", 'U-2', 'submarine'), ('U-2', 'submarine', '?')]
[('What', 'Homerian', 'epic'), ('Homerian', 'epic', 'chronicles'), ('epic', 'chronicles', 'events'), ('chronicles', 'events', 'toward'), ('events', 'toward', 'the'), ('toward', 'the', 'end'), ('the', 'end', 'of'), ('end', 'of', 'the'), ('of', 'the', 'Trojan'), ('the', 'Trojan', 'Wars'), ('Trojan', 'Wars', '?')]
[('When', 'did', 'the'), ('did', 'the', 'vesuvius'), ('the', 'vesuvius', 'last'), ('vesuvius', 'last', 'erupt'), ('last', 'erupt', '?')]
[('What', 'is', 'widely'), ('is', 'widely', 'used'), ('widely', 'used', 'to'), ('used', 'to', 'detect'), ('to', 'detect', 'prenatal'), ('detect', 'prenatal', 'birth'), ('prenat

[('Why', 'did', 'Europeans'), ('did', 'Europeans', 'first'), ('Europeans', 'first', 'come'), ('first', 'come', 'to'), ('come', 'to', 'Australia'), ('to', 'Australia', 'and'), ('Australia', 'and', 'Oceania'), ('and', 'Oceania', '?')]
[('What', 'did', 'the'), ('did', 'the', 'Confederate'), ('the', 'Confederate', 'Constitution'), ('Confederate', 'Constitution', 'say'), ('Constitution', 'say', 'about'), ('say', 'about', 'slavery'), ('about', 'slavery', '?')]
[('How', 'long', 'does'), ('long', 'does', 'it'), ('does', 'it', 'take'), ('it', 'take', 'the'), ('take', 'the', 'Milky'), ('the', 'Milky', 'Way'), ('Milky', 'Way', 'Galaxy'), ('Way', 'Galaxy', 'to'), ('Galaxy', 'to', 'make'), ('to', 'make', 'one'), ('make', 'one', 'revolution'), ('one', 'revolution', '?')]
[('How', 'many', 'people'), ('many', 'people', 'own'), ('people', 'own', 'pets'), ('own', 'pets', '?')]
[('What', 'web', 'sites'), ('web', 'sites', 'are'), ('sites', 'are', 'linked'), ('are', 'linked', 'to'), ('linked', 'to', 'the')

[('What', 'is', 'the'), ('is', 'the', 'country'), ('the', 'country', 'of'), ('country', 'of', 'origin'), ('of', 'origin', 'for'), ('origin', 'for', 'the'), ('for', 'the', 'name'), ('the', 'name', 'Thomas'), ('name', 'Thomas', '?')]
[('Why', 'do', 'you'), ('do', 'you', 'say'), ('you', 'say', '``'), ('say', '``', 'God'), ('``', 'God', 'bless'), ('God', 'bless', 'you'), ('bless', 'you', '``'), ('you', '``', 'when'), ('``', 'when', 'people'), ('when', 'people', 'sneeze'), ('people', 'sneeze', '?')]
[('What', 'is', 'the'), ('is', 'the', 'population'), ('the', 'population', 'of'), ('population', 'of', 'the'), ('of', 'the', 'largest'), ('the', 'largest', 'Chilean'), ('largest', 'Chilean', 'city')]
[('What', 'country', 'has'), ('country', 'has', 'the'), ('has', 'the', 'highest'), ('the', 'highest', 'per'), ('highest', 'per', 'capita'), ('per', 'capita', 'consumption'), ('capita', 'consumption', 'of'), ('consumption', 'of', 'cheese'), ('of', 'cheese', '?')]
[('What', 'is', 'Larry'), ('is', 'Lar

In [10]:
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_val_score

tag_list = tags_set(train_list)
train_feature_set = features_extract(train_list, unigrams, bigrams, trigrams, tag_list)
train_feature_vec = np.asarray(train_feature_set)

train_feature_labels = feature_lab(train_list)
train_feature_label_vec = np.asarray(train_feature_labels)

classifier = tree.DecisionTreeClassifier()

score = cross_val_score(classifier, train_feature_vec, train_feature_label_vec, scoring='accuracy', cv=10)
print("10-Fold accuracy list: ",score)
print("10-Fold accuracy: ",sum(score)/10)


10-Fold accuracy list:  [0.75364964 0.75912409 0.73857404 0.73626374 0.75274725 0.72293578
 0.75322284 0.7587477  0.74401473 0.74401473]
10-Fold accuracy:  0.7463294531674619
