In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from collections import Counter
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# XGBOOST
import xgboost as xgb
# from xgboost import XGBClassifier

# SPACY IMPORT
import spacy
nlp = spacy.load("en_core_web_md")

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
from xgboost import XGBClassifier

In [3]:
# same readData from STS.py
def readData(fileName):

    first_sentence = []
    second_sentence = []
    score = []
    file = open(fileName, encoding="utf8")
    text = file.readline()
    text = file.read()
    # loop to extract a set of two sentences
    for sentence in text.split('\n'):
        # creating two separate lists of the sentences
        # '.rstrip('.') only removes the last period in the sentence
        first_sentence.insert(len(first_sentence),
                              (sentence.split('\t')[1].lower()).rstrip('.'))
        second_sentence.insert(len(first_sentence),
                               (sentence.split('\t')[2].lower()).rstrip('.'))
        # inserting the score as a separate lists
        score.insert(len(first_sentence), (sentence.split('\t')[3]))

    # print(first_sentence)
    return first_sentence, second_sentence, score


def preprocess(fileName):

    first_sentence, second_sentence, score = readData(fileName)
    first_sentence_tokens = []
    second_sentence_tokens = []

    # tokenizing and tagging
    first_sentence_tags = []
    second_sentence_tags = []

    for sentence in first_sentence:
        tokens = nltk.word_tokenize(sentence)
        first_sentence_tokens.insert(len(first_sentence_tokens), tokens)
        first_sentence_tags.insert(
            len(first_sentence_tags), nltk.pos_tag(tokens))
        # print(first_sentence_tokens)

    for sentence in second_sentence:
        tokens = nltk.word_tokenize(sentence)
        second_sentence_tokens.insert(len(second_sentence_tokens), tokens)
        second_sentence_tags.insert(
            len(second_sentence_tags), nltk.pos_tag(tokens))

        # print(second_sentence_tokens)

    # lemmatizing
    first_sentence_lemmas = []
    second_sentence_lemmas = []
    lemmatizer = WordNetLemmatizer()
    for sentence in first_sentence_tokens:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        first_sentence_lemmas.insert(
            len(first_sentence_lemmas), sentence_components)

    for sentence in second_sentence_tokens:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        second_sentence_lemmas.insert(
            len(second_sentence_lemmas), sentence_components)

    return first_sentence, second_sentence, score, first_sentence_tokens, second_sentence_tokens, first_sentence_lemmas, second_sentence_lemmas


In [4]:
s1_arr_train, s2_arr_train, scores_train, s1_tokens_train, s2_tokens_train, s1_lemmas_train,s2_lemmas_train = preprocess("./data/train-set.txt")

## Data Inspection

In [5]:
score_list = [0,0,0,0,0,0]
for s in scores_train:
    score_list[int(s)] += 1
for i in range(0, len(score_list)):
#     print(i, ': ', score_list[i], round(score_list[i]/len(score_list), 1), '%')
    print("% 1d: % 4d % 6.2f per" %(i, score_list[i], 100*score_list[i]/len(scores_train))) 

 0:    8   0.54 per
 1:   37   2.49 per
 2:   95   6.40 per
 3:  310  20.89 per
 4:  616  41.51 per
 5:  418  28.17 per


## Feature Engineering

This section includes all the code/functions to create features.

### Cosine Similarity (TF-IDF)

In [6]:
def calc_cosine_similarity(sentence1, sentence2):

    # remove the stopwords, transform into TF-IDF matrix, then
    tfidf_matrix = TfidfVectorizer(
        stop_words="english").fit_transform([sentence1, sentence2])
    cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
#     print(tfidf_matrix.toarray())

    cos_sim = cos_sim_matrix[0][1]

    return cos_sim

### Smooth Inverse Frequency (SIF)

In [7]:
def frequency_distribution(s1_tokens_array, s2_tokens_array):
    freq_dist = FreqDist()
    for i in range(len(s1_tokens_array)):
        for token in (s1_tokens_array[i] + s2_tokens_array[i]):
            freq_dist[token.lower()] += 1
    return freq_dist

In [8]:
freq_dist = frequency_distribution(s1_tokens_train, s2_tokens_train)
print(freq_dist.most_common(40))

[('the', 5169), (',', 3690), ('of', 2497), ('to', 2133), ('and', 1716), ('a', 1615), ('in', 1573), ('is', 891), ('that', 831), ('on', 820), ('for', 756), ('it', 587), ('this', 579), ('we', 531), ('with', 464), ('be', 459), ('by', 443), ('i', 425), ('which', 403), ('have', 384), ('not', 366), ('at', 343), ('as', 334), ('are', 333), ('has', 319), ('said', 316), ('was', 304), ('european', 287), ("'s", 280), ('from', 261), ('``', 252), ("''", 242), ('will', 233), ('.', 229), ('also', 223), ('its', 194), ('but', 193), ('would', 191), ('all', 188), ('percent', 187)]


In [9]:
def calc_sif_similarity(s1, s2, a = .001):
    vectorizer = CountVectorizer(stop_words="english")
    X = vectorizer.fit_transform([s1, s2])
    X_arr = X.toarray()
    sif_matrix = []
    for i in range(0, len(X_arr)):
        sif_arr = []
        for j in range(0, len(X_arr[i])):
            word = vectorizer.get_feature_names()[j]
            w = a / (a + freq_dist[word])
            v = X_arr[i][j]
            sif_arr.append(v*w)
        sif_matrix.append(sif_arr)
    sif_cos_sim_matrix = cosine_similarity(sif_matrix, sif_matrix)
    sif_cos_sim = sif_cos_sim_matrix[0][1]
    return sif_cos_sim

In [10]:
calc_sif_similarity('I like some apples', 'I like the pears')

1.4515545128534995e-10

### Simple Overlap

Unique words that are in both sentences divided by the total number of words in both sentences. Does not include stop words.

In [11]:
stop_words = set(stopwords.words('english'))
tokenized_sentence_list = s1_tokens_train+s2_tokens_train
words_filtered = []

# print(words)

# looking through I've noticed there are a number of stop-words that can be added to the set
stop_words.add(',')
stop_words.add('``')
stop_words.add("n't")

for tsl in tokenized_sentence_list:
    for w in tsl:
        if w not in stop_words and w not in words_filtered:
            words_filtered.append(w)

In [12]:
def remove_stopwords(token_list):
    blank_list = []
    for w in token_list:
        if w not in stop_words:
            blank_list.append(w)
    return blank_list

In [13]:
def remove_duplicate_tokens(token_list):
    blank_list = []
    for w in token_list:
        if w not in blank_list:
            blank_list.append(w)
    return blank_list

In [14]:
def calc_basic_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
    overlap = 0
    encountered_indexes = []
    for word in (s1_tokens+s2_tokens):
        try:
            word_index = words_filtered.index(word)
            if word_index in encountered_indexes: # we know we have found an overlap
                overlap += 1
            encountered_indexes.append(word_index)
        except ValueError:
            # print(word + ' not found in lexicon. Skipping...')
            continue

    avg_sentence_len = len(s1_tokens+s2_tokens) / 2
    
    overlap_normlalized = overlap / avg_sentence_len
    return overlap, overlap_normlalized

### Synset Overlap

In [15]:
def calc_synset_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
    print(s2_tokens)
    print(s1_tokens)

    s1_spread = []
    s2_spread = []
    
    for word in s1_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s1_spread:
                    s1_spread.append(syn_word)

    for word in s2_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s2_spread:
                    s2_spread.append(syn_word)         
    
    return calc_basic_overlap(s1_spread, s2_spread)
    
calc_synset_overlap(s1_tokens_train[0], s2_tokens_train[0])

['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'next', 'day', 'two']
['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'hoped', 'see', 'bidders', 'interested', 'individual', 'assets', 'team']


(46, 0.23173803526448364)

### Spacy Similarity

In [16]:
def calc_spacy_sim(s1, s2):
    s2 = nlp(s2)
    s1 = nlp(s1)
    return s1.similarity(s2)


## Pipeline

In this section we run the data through the pipeline to get it into the form necessary to create our models.

In [17]:
def pipeline(s1_array, s2_array, s1_tokens, s2_tokens, s1_lemmas, s2_lemmas):
    # TODO add a check to ensure the lengths of these arrays are the same
    # or add the basic processing to pipeline
    data = []
    for i in range(0, len(s1_array)):
        cos_sim = calc_cosine_similarity(s1_array[i], s2_array[i])
        sif_sim = calc_sif_similarity(s1_array[i], s2_array[i])
        w_overlap, w_norm_overlap = calc_basic_overlap(s1_tokens[i], s2_tokens[i])
        l_overlap, l_norm_overlap = calc_basic_overlap(s1_lemmas[i], s2_lemmas[i])
        spacy_sim = calc_spacy_sim(s1_array[i], s2_array[i])
        syn_overlap, normalized_syn_overlap = calc_synset_overlap(s1_tokens[i], s2_tokens[i])
        data.append([i, w_norm_overlap, l_norm_overlap, sif_sim, cos_sim, spacy_sim, syn_overlap, normalized_syn_overlap])
    return data

In [18]:
data = pipeline(s1_arr_train, s2_arr_train, s1_tokens_train, s2_tokens_train, s1_lemmas_train, s2_lemmas_train)
print(data[0:5])

['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'next', 'day', 'two']
['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'hoped', 'see', 'bidders', 'interested', 'individual', 'assets', 'team']
['micron', "'s", 'numbers', 'also', 'marked', 'first', 'quarterly', 'profit', 'three', 'years', 'dram', 'manufacturer']
['micron', 'declared', 'first', 'quarterly', 'profit', 'three', 'years']
['perry', 'said', 'backs', 'senate', "'s", 'efforts', 'including', 'fines', 'force', 'democrats', 'return']
['fines', 'part', 'failed', 'republican', 'efforts', 'force', 'entice', 'democrats', 'return']
['american', 'anglican', 'council', 'represents', 'episcopalian', 'conservatives', 'said', 'seek', 'authorization', 'create', 'separate', 'province', 'north', 'america', 'last', 'week', "'s", 'actions']
['american', 'anglican', 'council', 'represents', 'episcopalian', 'conservatives', 'said', 'seek', 'authorization', 'create', 'separate', 'g

['neither', 'peace', 'rules', 'could', 'keep', 'funny', 'cide', 'drawing', 'away']
['peace', 'rules', 'defeated', 'funny', 'cide', 'louisiana', 'derby']
['waksal', 'pleaded', 'guilty', 'insider', 'trading', 'charges', 'last', 'year', 'scheduled', 'sentenced', 'june', '10']
['waksal', 'pleaded', 'guilty', 'securities', 'fraud', 'sentenced', 'next', 'week']
['zambrano', 'pitched', 'seven', 'innings', 'allowed', 'two', 'runs', 'five', 'hits', 'four', 'walks']
['allowed', 'two', 'runs', 'seven', 'innings', 'struck', 'six']
["'m", 'absolutely', 'confident', "'re", 'going', 'bill', "''", 'frist', 'r-tenn.', 'said', 'thursday']
['still', 'said', "'m", 'absolutely', 'confident', "'re", 'going', 'bill', '.', "''"]
['korean', 'air', 'deal', 'expected', 'finalized', 'next', 'several', 'weeks', "''", 'boeing', 'spokesman', 'bob', 'saling', 'said']
['boeing', 'said', 'final', 'agreement', 'expected', 'signed', 'next', 'weeks']
['shares', 'eds', 'closed', 'thursday', '$', '18.51', '6', 'cents', 'new

['wire', 'transfers', 'four', '40', 'accounts', 'open', 'beacon', 'hill', 'totaled', '$', '3.2', 'billion', '2001', '2002', 'morgenthau', 'said']
['2001', '2002', 'morgenthau', 'said', 'wire', 'transfers', 'four', 'beacon', 'hill', "'s", '40', 'accounts', 'totaled', '$', '3.2', 'billion']
['ran', 'last', 'year', 'democratic', 'nomination', 'texas', 'governor', 'lost', 'primary', 'multimillionaire', 'tony', 'sanchez']
['last', 'year', 'made', 'unsuccessful', 'bid', 'democratic', 'nomination', 'governor']
['wal-mart', 'told', 'top', '100', 'suppliers', "'ll", 'need', 'radio-frequency', 'id', 'systems', 'place', 'tracking', 'pallets', 'goods', 'supply', 'chain', 'jan.', '25', '2005']
['nation', "'s", 'largest', 'retailer', 'told', '100', 'top', 'suppliers', 'start', 'using', 'electronic', 'tags', 'pallets', 'goods', 'jan.', '25', '2005']
['harvey', 'taken', 'st.', 'luke', "'s", 'hospital', 'precautionary', 'neck', 'x-rays', 'came', 'back', 'negative']
['taken', 'hospital', 'precautionary'

['palm', 'said', 'wednesday', 'plans', 'buy', 'handspring', 'company', 'created', 'renegade', 'co-founder', 'jeff', 'hawkins']
['palm', 'wednesday', 'announced', 'plans', 'acquire', 'handspring', 'company', 'started', 'jeff', 'hawkins', 'regarded', 'many', 'father', 'handheld']
["'m", 'going', 'sponsoring', "'s", 'proposal', 'responding', 'negative', 'way', "''", 'said']
["'m", 'going', 'sponsoring', 'proposal', 'negative', "''"]
['government', 'recently', 'shelved', 'peace', 'talks', 'brokered', 'neighbouring', 'malaysia', 'spate', 'attacks', 'mindanao', 'including', 'three', 'deadly', 'bombings', 'blamed', 'milf']
['government', 'recently', 'shelved', 'peace', 'talks', 'milf', 'brokered', 'malaysia', 'string', 'attacks', 'including', 'three', 'bombings', 'mindanao']
['year-ago', 'quarter', 'company', 'earned', '$', '54.3', 'million', '22', 'cents', 'share']
['company', 'posted', 'profit', '$', '54.3', 'million', '22', 'cents', 'per', 'share', 'year-ago', 'period']
['along', 'boston',

['dynes', 'uc', 'san', 'diego', 'since', '1991', 'spending', '22', 'years', '&', 'bell', 'labs', 'worked', 'superconductors', 'materials']
['dynes', 'came', 'uc', 'san', 'diego', '1991', '22', 'years', 'physicist', '&', 'bell', 'labs']
['shares', 'ba', 'three', 'percent', '165-1/4', 'pence', '0933', 'gmt', 'low', '164', 'stronger', 'market']
['shares', 'ba', '1.5', 'percent', '168', 'pence', '1420', 'gmt', 'low', '164p', 'slightly', 'stronger', 'overall', 'london', 'market']
['house', 'senate', 'bills', 'would', 'end', 'ban', 'research', 'development', 'low-yield', 'nuclear', 'weapons']
['senate', 'agreed', 'tuesday', 'lift', '10-year-old', 'ban', 'research', 'development', 'low-yield', 'nuclear', 'weapons']
['meningitis', 'infection', 'fluid', 'person', "'s", 'spinal', 'cord', 'around', 'brain']
['meningitis', 'infection', 'spinal', 'cord', 'fluid', 'tissue', 'around', 'brain']
['chiron', 'aventis', 'pasteur', 'together', 'made', '80', 'million', 'doses', 'ordinarily', 'enough', 'u.s.

['people', 'thought', 'blood', 'pressure', 'fine', 'actually', 'need', 'start', 'exercising', 'eating', 'better', 'according', 'new', 'u.s.', 'guidelines', 'published', 'wednesday']
['people', 'thought', 'blood', 'pressure', 'fine', 'actually', 'may', 'well', 'way', 'hypertension', 'new', 'u.s.', 'guidelines', 'published', 'wednesday']
['stage', 'given', 'early', 'detection', 'outlook', 'instances', 'would', 'positive', "''", 'said']
['testing', 'still', 'way', 'stage', 'given', 'early', 'detection', 'outlook', 'instances', 'would', 'positive', "''", 'specialist', 'said', 'yesterday']
['amazon', 'new', 'york', 'attorney', 'general', "'s", 'office', 'already', 'settled', 'one', 'alleged', 'e-mail', 'forgers']
['amazon', 'also', 'reported', 'new', 'york', 'attorney', 'general', "'s", 'office', 'settled', 'civil', 'fraud', 'charges', 'one', 'spoofers', 'identified']
['marissa', 'jaret', 'winokur', 'tracy', 'best', 'actress', 'musical']
['leading', 'actress', 'nod', 'went', 'energetic', 'n

['america', 'friends', 'happen', '.', "''"]
['america', 'friends', 'happen', "''", 'said', 'loud', 'applause']
['bashir', 'also', 'felt', 'tried', 'opinion', 'rather', 'facts', 'law', 'added']
['bashir', 'felt', 'tried', 'opinion', 'facts', 'mahendradatta', 'told', 'reuters']
['cross-examination', "o'donnell", "'s", 'attorney', 'lorna', 'schofield', 'toepfer', 'conceded', 'ignored', 'many', 'suggestions', 'projects']
['schofield', 'got', 'toepfer', 'admit', 'cross-examination', 'ignored', 'many', "o'donnell", "'s", 'suggestions', 'projects']
['broad', 'standard', '&', 'poor', "'s", '500', 'index', '.spx', 'shed', '0.17', 'point', '0.02', 'percent', '934']
['standard', '&', 'poor', "'s", '500', 'index', 'slipped', '4.77', '0.5', 'percent', '929.62']
['crawled', 'narrow', 'winding', 'canyon', 'rappelled', '60-foot', 'cliff', 'walked', 'six', 'miles', 'near', 'canyonlands', 'national', 'park', 'southeastern', 'utah']
['may', '1', 'crawled', 'narrow', 'winding', 'canyon', 'rappelled', '60-

["'s", 'national', 'audubon', 'society', 'annual', 'christmas', 'bird', 'count', '104th', 'season']
['year', 'audubon', 'society', 'hosts', 'annual', 'christmas', 'bird', 'count']
['organizers', 'established', 'web', 'site', 'http', ':', '//defacerschallenge.com', 'listing', 'broken', 'english', 'rules', 'hackers', 'might', 'participate']
['removed', 'site', 'listed', 'broken', 'english', 'rules', 'hackers', 'might', 'participate']
['southwest', 'said', 'traffic', '4.6', 'percent', 'quarter', 'capacity', 'increase', '4.2']
['southwest', 'said', 'traffic', '4.6', 'percent', 'quarter', 'ended', '$', '2.2', 'billion', 'cash']
['belief', 'based', 'speculation', 'estrogen', 'prevents', 'cell', 'damage', 'improves', 'blood', 'flow']
['doctors', 'speculated', 'body', '’', 'estrogen', 'protects', 'cell', 'damage', 'improves', 'blood', 'flow']
['shortness', 'found', 'twice', 'often', 'hearing', 'loss']
['also', 'found', 'shortness', 'associated', 'family', 'history', 'hearing', 'loss']
['cisco'

['mr.', 'bryant', 'innocent', 'expects', 'completely', 'exonerated', "''", 'mackey', 'said', 'statement']
['lawyer', 'pamela', 'mackey', 'said', 'bryant', 'expects', 'completely', 'exonerated']
['standard', '&', 'poor', "'s", '500', 'index', 'rose', '15.66', '1,046.79', 'best', 'advance', 'since', 'oct.', '1', 'gained', '22.25']
['best', 'advance', 'since', 'oct.', '1', 'index', 'gained', '22.25']
['charles', 'cullen', '43', 'transferred', 'somerset', 'county', 'jail', 'somerville', 'anne', 'klein', 'forensic', 'center', '150-bed', 'psychiatric', 'treatment', 'facility', 'trenton']
['mr.', 'mask', 'said', 'cullen', 'would', 'taken', 'somerset', 'county', 'jail', 'thursday', 'moved', 'ann', 'klein', 'forensic', 'hospital', 'outside', 'trenton', 'psychiatric', 'care']
['stardust', 'designed', 'gather', 'thousands', 'dust', 'particles', 'streaming', 'wild', '2']
['scientists', 'believed', 'stardust', 'trapped', 'thousands', 'particles', 'dust']
['nine', 'seconds', 'later', 'spaceshipone',

['blue-chip', 'dow', 'jones', 'industrial', 'average', '.dji', 'tacked', '97', 'points', '1.14', 'percent', '8,699']
['blue-chip', 'dow', 'jones', 'industrial', 'average', '.dji', 'climbed', '164', 'points', '1.91', 'percent', '8,765.38', 'brushing', 'highest', 'levels', 'since', 'mid-january']
['according', 'arrest', 'warrant', 'bryant', '24', 'attacked', 'woman', 'june', '30']
['arrest', 'warrant', 'claimed', 'bryant', 'assaulted', 'woman', 'june', '30', 'hotel']
['kerry', 'viewed', 'favorably', '66', 'percent', 'polled', ';', 'dean', '57']
['florida', 'sen.', 'bob', 'graham', 'identifiable', '61', 'percent', 'polled']
['tech-laden', 'nasdaq', 'composite', 'index', '.ixic', 'shed', '8', 'points', '0.45', 'percent', '1,645']
['broader', 'standard', '&', 'poor', "'s", '500', 'index', '.spx', 'rose', '3.47', 'points', '0.36', 'percent', '977.59']
['twenty-four', 'players', 'broke', 'par', 'first', 'round', 'third', 'highest', 'figure', 'u.s.', 'open', 'history']
['result', '24', 'player

['27-year-old', 'rapper', "'s", 'attorney', 'civil', 'matter', 'mark', 'gann', 'return', 'calls', 'comment']
['rapper', "'s", 'lawyer', 'mark', 'gann', 'return', 'calls', 'comment']
['agents', 'confiscated', 'several', 'classified', 'documents', 'possession', 'interrogated']
['sources', 'say', 'agents', 'confiscated', 'several', "''", 'documents', 'carrying']
['also', 'hurting', 'news', 'general', 'motors', 'gm.n', 'issue', '$', '10', 'billion', 'debt', 'part', 'plug', 'hole', 'pension', 'plan']
['also', 'weighing', 'market', 'news', 'general', 'motors', 'gm.n', 'planned', 'issue', '$', '10', 'billion', 'debt', 'part', 'plug', 'hole', 'pension', 'plan']
['helicopter', 'owned', 'las', 'vegas-based', 'sundance', 'helicopters', 'inc.', 'according', 'sheriff', "'s", 'office']
['helicopter', 'burst', 'flames', 'upon', 'impact', 'according', 'mohave', 'county', 'sheriff', "'s", 'office']
['advised', 'certain', 'allegations', 'criminal', 'conduct', 'interposed', 'counsel', 'j.', 'michael', 'b

['communications', 'workers', 'union', 'represents', '2,300', 'comcast', 'employees', 'called', 'executive', 'pay', 'package', 'excessive', 'typical', 'employee', 'makes', '$', '27,000', 'annually']
['cwa', 'represents', '2,300', 'comcast', 'employees', 'called', 'excessive', 'typical', 'union', 'employee', 'makes', '$', '27,000', 'year']
['generated', '$', '4.47', 'billion', 'profit', '20', 'revenue', 'year', "'s", 'first', 'half', '53', 'percent', 'citigroup', 'totals']
['consumer', 'group', 'generated', '$', '4.47', 'billion', 'profit', '20', 'revenue', 'january', 'june', '53', '%', 'citigroup', "'s"]
['galloway', 'township', 'n.j.', '>', 'annika', 'sorenstam', 'michelle', 'wie', 'drew', 'crowds', 'angela', 'stanford', 'took', 'first', 'lpga', 'victory']
['galloway', 'township', 'n.j.', 'annika', 'sorenstam', 'drew', 'crowds', 'michelle', 'wie', 'got', 'publicity', 'angela', 'stanford', 'took', 'first', 'lpga', 'victory']
['saddam', 'gone', 'want', '(', 'u.s.', ')', 'occupation', 'e

['analysts', 'surveyed', 'thomson', 'first', 'call', 'expected', 'kodak', 'earn', '68', 'cents', 'share', 'quarter']
['kodak', 'expects', 'earnings', '5', 'cents', '25', 'share', 'quarter']
['colgate', 'shares', '30', 'cents', '$', '56', 'morning', 'trade', 'new', 'york', 'stock', 'exchange']
['colgate', 'shares', 'closed', 'monday', '$', '56.30', 'new', 'york', 'stock', 'exchange']
['several', 'shots', 'rang', 'wednesday', 'night', 'gators', 'killed']
['several', 'shots', 'rang', 'darkness', 'one', 'gator', 'killed', '11', 'p.m']
['corixa', "'s", 'stock', 'barely', 'flinched', 'news', 'dipping', '12', 'cents', 'close', '$', '6.88']
['shares', 'corixa', 'fell', '12', 'cents', '$', '6.88', 'nasdaq', 'stock', 'market']
['technology-laced', 'nasdaq', 'composite', 'index', '.ixic', '7.42', 'points', '0.45', 'percent', '1,653.44']
['broader', 'standard', '&', 'poor', "'s", '500', 'index', '<', '.spx', '>', '0.46', 'points', 'lower', '0.05', 'percent', '997.02']
['together', 'find', 'creativ

['studies', 'published', 'thursday', 'nature', 'british', 'weekly', 'science', 'journal']
['findings', 'published', 'november', '6', 'edition', 'journal', 'nature']
['survey', 'found', 'median', 'household', 'income', 'rose', '$', '51', 'accounting', 'inflation', '43,057']
['median', 'household', 'income', 'declined', '1.1', 'percent', '2001', '2002', '$', '42,409', 'accounting', 'inflation']
['bush', 'said', 'u.s.', 'european', 'union', 'leaders', 'annual', 'washington', 'summit', 'agreed', 'need', 'keep', 'iran', 'developing', 'nuclear', 'weapons']
['u.s.', 'european', 'leaders', 'pledged', 'wednesday', 'work', 'together', 'keep', 'iran', 'developing', 'nuclear', 'weapons', 'presenting', 'united', 'front', 'months', 'bitter', 'acrimony', 'iraq']
['company', 'also', 'said', 'would', 'cut', 'wholesale', 'prices', 'cassettes', 'change', 'suggested', 'retail', 'price', '$', '8.98']
['company', 'said', 'would', 'cut', 'wholesale', 'price', 'top-line', 'cds', '$', '9.09', '12.02']
['moved'

['hollywood', 'world', 'gearing', 'celebrate', 'legendary', 'entertainer', 'bob', 'hope', "'s", '100th', 'birthday', 'thursday']
['veteran', 'entertainer', 'bob', 'hope', 'celebrates', '100th', 'birthday', '-', 'many', 'years', 'showbusiness', 'thursday']
['navistar', 'shares', 'rose', 'penny', '$', '41.64', 'late', 'afternoon', 'new', 'york', 'stock', 'exchange', 'earlier', 'falling', 'low', '39.93']
['navistar', 'shares', '44', 'cents', '1.1', 'percent', '$', '41.19', 'new', 'york', 'stock', 'exchange', 'falling', 'low', '39.93']
['secretary', 'state', 'colin', 'powell', 'said', 'yesterday', 'contacts', 'iran', 'would', 'continue']
['colin', 'powell', 'secretary', 'state', 'said', 'contacts', 'iran', 'would', 'stop']
['advancers', 'outnumbered', 'decliners', 'nearly', '8', '3', 'nyse', '11', '5', 'nasdaq']
['declining', 'issues', 'outnumbered', 'advancers', 'nearly', '2', '1', 'new', 'york', 'stock', 'exchange']
['reagan', 'floating', 'airfield', 'flight', 'deck', 'covering', '4.5', 

['explained', 'found', 'antetonitrus', 'came', 'wits', '2001', 'post-doctoral', 'research', 'assistant', 'bristol', 'university', 'britain']
['explained', 'found', 'antetonitrus', 'came', 'wits', '2001', 'post-doctoral', 'research', 'assistant', 'england', "'s", 'university', 'bristol']
['.', '11', 'airline', 'world', 'air', 'canada', 'court', 'protection', 'creditors', 'since', 'april', '1']
['air', 'canada', 'largest', 'airline', '.', '11', 'world', 'court', 'protection', 'creditors', 'since', 'april', '1']
['security', 'official', "'s", 'backup', 'active', 'duty', 'lottery', 'association', 'replacement', 'new', 'jersey', 'strutt', 'said']
['security', 'official', "'s", 'backup', 'could', 'fill', 'active', 'military', 'duty', 'strutt', 'said']
['gonzales', 'found', 'guilty', 'using', 'excessive', 'force', 'spraying', 'olvera', 'pepper', 'spray']
['jury', 'also', 'found', 'gonzales', 'guilty', 'using', 'excessive', 'force', 'dousing', 'olvera-carrera', 'pepper', 'spray']
['bodies', 'm

['general', 'dynamics', 'sued', 'changed', 'retirement', 'benefits', '1997']
['workers', 'accuse', 'general', 'dynamics', 'reverse', 'age', 'discrimination', "''", 'change', 'retirement', 'benefits', '1997']
['tech-heavy', 'nasdaq', 'composite', 'index', '.ixic', '0.11', 'percent', '1.78', 'points', '1,594.13']
['tech-laced', 'nasdaq', 'composite', 'index', '.ixic', 'eased', '5.16', 'points', '0.32', 'percent', '1,590.75', 'breaking', 'six-day', 'string', 'gains']
['key', 'player', 'former', 'state', 'treasurer', 'paul', 'silvester', "'s", 'corruption', 'scheme', 'testified', 'tuesday', 'kickbacks', 'bribes', 'traded', 'business']
['key', 'figure', 'former', 'state', 'treasurer', 'paul', 'silvester', "'s", 'bribery', 'scheme', 'accused', 'wednesday', 'changing', 'story', 'alleged', 'corrupt', 'dealings', 'boston', 'investment', 'firm']
['mission', 'capps', 'ii', 'system', 'always', 'aviation', 'security', "''", 'said']
['mission', 'capps', 'ii', 'system', 'always', 'aviation', 'securit

['among', 'things', 'microsoft', 'comment', 'proposed', 'remedies', 'response']
['also', 'give', 'microsoft', 'opportunity', 'comment', 'remedies', 'proposed', 'commission']
['relief', 'sight', 'texans', 'saddled', 'skyrocketing', 'homeowners', 'insurance', 'premiums']
['texans', 'saddled', 'skyrocketing', 'homeowners', 'premiums', 'might', 'finally', 'getting', 'relief']
['six', 'democrats', 'two', 'republicans', 'running', 'seat', 'qualified', 'feb.', '3', 'primary', 'ballot']
['six', 'democrats', 'vying', 'succeed', 'jacques', 'qualified', 'feb.', '3', 'primary', 'ballot']
['technology-laced', 'nasdaq', 'composite', 'index', '.ixic', '1.55', 'points', '0.09', 'percent', '1,744.91']
['broader', 'standard', '&', 'poor', "'s", '500', 'index', 'rose', '3.42', 'points', '0.34', 'percent', '1,007.84']
['crews', 'worked', 'install', 'new', 'culvert', 'repave', 'highway', 'motorists', 'could', 'use', 'eastbound', 'lanes', 'travel']
['crews', 'worked', 'install', 'new', 'culvert', 'prepare',

['buoyed', 'advice', 'imparted', 'nicklaus', 'howell', 'shot', '8-under', '64', 'thursday', 'enter', 'today', "'s", 'round', 'one-stroke', 'lead', 'kenny', 'perry']
['buoyed', 'advice', 'imparted', 'nicklaus', 'howell', 'shot', '8-under', '64', 'one-stroke', 'lead', 'kenny', 'perry']
['jan.', '28', 'state', 'union', 'message', 'bush', 'said', 'british', 'government', 'learned', 'saddam', 'hussein', 'recently', 'sought', 'significant', 'quantities', 'uranium', 'africa', '.', "''"]
['state', 'union', 'address', 'january', 'bush', 'declared', 'british', 'government', 'learned', 'saddam', 'hussein', 'recently', 'sought', 'significant', 'quantities', 'uranium', 'africa', '.', "''"]
['$', '1.14', 'billion', 'quarter', 'last', 'year']
['net', 'revenue', 'rose', '$', '3.99', 'billion', '3.85', 'quarter', 'last', 'year']
["'s", 'sunday', 'night', 'comdex', 'must', 'time', 'yet', 'another', 'bill', 'gates', 'keynote']
["'s", 'bill', 'gates', 'comdex', 'keynote', 'must', 'time', 'new', 'tablet', 

['economy', 'shown', 'signs', 'sustainable', 'growth']
['economy', 'nonetheless', 'yet', 'exhibit', 'sustainable', 'growth']
['newly', 'unsealed', '32-count', 'indictment', 'alleges', 'money', 'laundering', 'conspiracy', 'strikes', 'one', 'top', 'targets', 'drug-trafficking', 'world', "''", 'jiménez', 'said']
['32-count', 'indictment', 'strikes', 'one', 'top', 'targets', 'drug', 'trafficking', 'world', "''", 'u.s.', 'attorney', 'marcos', 'jimenez', 'said']
['sunday', 'u.s.', 'soldier', 'killed', 'another', 'injured', 'munitions', 'dump', 'guarding', 'exploded', 'southern', 'iraq']
['soldier', 'killed', 'monday', 'another', 'wounded', 'convoy', 'ambushed', 'northern', 'iraq']
['california', "'s", 'lost', 'tax', 'revenue', 'mostly', 'due', 'international', 'corporate', 'shelters']
['commission', 'estimated', 'california', 'lost', '$', '937', 'million', 'corporate', 'tax', 'shelters', '2001']
['markets', 'u.s.', 'treasuries', 'inched', 'higher', 'declining', 'stocks', 'raised', 'appeal', 

['agency', 'yet', 'fully', 'formulate', 'strategy', 'creation', 'independent', 'engineering', 'technical', 'authority']
['calls', 'agency', 'plan', 'independent', 'safety', 'engineering', 'organization']
['worldcom', "'s", 'financial', 'troubles', 'came', 'light', 'last', 'year', 'company', 'subsequently', 'filed', 'bankruptcy', 'july', '2002']
['worldcom', "'s", 'accounting', 'problems', 'came', 'light', 'early', 'last', 'year', 'company', 'filed', 'bankruptcy', 'july', '2002', 'citing', 'massive', 'irregularities']
['benchmark', '10-year', 'note', 'recently', '17/32', 'yield', '4.067', 'percent']
['benchmark', 'treasury', '10-year', 'notes', 'gained', '17/32', 'yielding', '4.015', 'percent']
['asked', 'autopsy', 'reports', 'unsealed', 'portions', 'report', 'peterson', "'s", 'unborn', 'son', 'favorable', 'defense', 'leaked', 'media']
['prosecutors', 'about-face', 'may', 'asked', 'autopsy', 'reports', 'unsealed', 'portions', 'conner', 'peterson', "'s", 'report', 'favorable', 'defense',

['budget', 'line', 'finance', 'programmes', 'mine', 'clearance', 'prevention', 'years']
['budget', 'heading', 'funding', 'mine', 'clearance', 'prevention', 'programmes', 'many', 'years']
['point', 'view', 'illustrated', 'initial', 'aim', 'rapporteur', 'shared', 'many', 'colleagues', 'promote', 'regulations', 'codes', 'conduct', 'necessary', 'establish', 'insurers', 'forms', 'cost', 'option', 'guaranteeing', 'supply', 'good', 'quality', 'care', 'counter', 'risks', 'discriminatory', 'practices', 'selection', 'customers']
['viewpoint', 'illustrated', 'rapporteur', "'", 'initial', 'statement', 'shared', 'number', 'members', 'promote', 'regulations', 'codes', 'conduct', 'needed', 'order', 'establish', 'among', 'insurers', 'ways', 'organising', 'costs', 'mutual', 'basis', 'providing', 'everybody', 'guaranteed', 'access', 'high-quality', 'care', 'counter', 'risk', 'discriminatory', 'practices', 'developing', 'together', 'client', 'selection']
['case', 'tax', 'still', 'perceived', 'example', '

['european', 'court', 'human', 'rights', 'council', 'europe', 'also', 'considerable', 'experience', 'regard', 'forms', 'control', ';', 'take', 'basis']
['council', 'europe', 'along', 'court', 'human', 'rights', 'wealth', 'experience', 'forms', 'supervision', 'build']
['started', 'discussions', 'issue', 'making', 'clear', 'member', 'states', 'wish', 'maintain', 'ceiling', 'enlargement']
['started', 'exchange', 'ideas', 'question', 'emerged', 'member', 'states', 'want', 'maintain', 'ceiling', 'enlargement']
['firstly', 'simplification', 'clarification', 'treaties']
['firstly', 'simplification', 'clarification', 'treaties']
['thus', 'urgent', 'staff', 'interservice', 'group', 'quickly', 'strengthened', 'heart', 'secretary-general', 'commission', 'proposed', 'act', 'general', 'scope', 'accompanied', 'examination', 'college', 'basis', 'article', '299', '(', '2', ')', 'fiche', "d'impact", 'detailed']
['matter', 'urgency', 'therefore', 'staff', 'complement', 'interdepartmental', 'group', 'att

['must', 'obviously', 'determine', 'reasons', 'increase', 'incidence', 'bse', 'france', 'also', 'member', 'states']
['clearly', 'explanations', 'necessary', 'increased', 'incidence', 'bse', 'france', 'also', 'member', 'states']
['without', 'doubt', 'better', 'agreement', 'wrong', 'case', 'proposal', 'totally', 'unacceptable', 'europe', '.']
['without', 'doubt', 'better', 'agreement', 'poor', 'one', 'true', 'instance', 'american', 'proposal', 'third-rate', 'completely', 'unacceptable', 'europe']
['european', 'parliament', 'called', 'resolution', '16', 'march', '2000', 'initiatives', 'council', 'presidency', 'intend', 'take', 'play', 'active', 'role', 'order', 'ensure', 'full', 'implementation', 'un', 'peace', 'plan', '?']
['reiterating', 'calls', 'made', 'european', 'parliament', 'resolution', '16', 'march', '2000', 'initiatives', 'presidency', 'council', 'propose', 'take', 'view', 'playing', 'active', 'role', 'guarantee', 'full', 'complete', 'application', 'un', 'peace', 'plan', '?']
[

['secondly', 'must', 'say', "'no", "'", 'concept', 'would', 'admitted', 'big', 'bang', 'time', 'large', 'group', 'countries', 'contradiction', 'principle', 'admission', 'country', 'merits']
['secondly', 'say', 'big', 'bang', 'idea', 'incorporating', 'candidate', 'countries', 'simultaneously', 'large', 'group', 'since', 'would', 'militate', 'principle', 'country', 'admitted', 'merits']
['secondly', 'must', 'say', "'no", "'", 'big', 'bang', 'concept', 'would', 'admit', 'time', 'large', 'group', 'countries', 'odds', 'principle', 'admission', 'country', 'according', 'merits']
['secondly', 'say', 'big', 'bang', 'idea', 'incorporating', 'candidate', 'countries', 'simultaneously', 'large', 'group', 'since', 'would', 'militate', 'principle', 'country', 'admitted', 'merits']
['unfortunately', 'final', 'objective', 'european', 'constitution', 'would', 'exactly', 'opposite', 'course', 'agree', '.']
['unfortunately', 'ultimate', 'objective', 'european', 'constitution', 'would', 'precisely', 'oppos

['development', 'type', 'hope', 'states', 'european', 'union']
['kind', 'future', 'would', 'also', 'like', 'see', 'countries', 'within', 'european', 'union']
['even', 'want', 'incorporate', 'treaty', 'proves', 'text', 'set', 'aside']
['even', 'intend', 'incorporate', 'treaty', 'evidence', 'indeed', 'text', 'exercise', 'always', 'destined', 'laid', 'aside']
['unipersonnelle', 'company', 'society', 'fewer', '250', 'employees', 'project', 'must', 'possible', 'receive', 'financing', 'guarantee', '120', '%', 'also', 'basis', 'good', 'idea']
['one-man', 'company', 'fewer', '250', 'employees', 'project', 'need', 'provide', '120', '%', 'loan', 'guarantees', 'form', 'land', 'receive', 'financing', ';', 'also', 'possible', 'funded', 'basis', 'good', 'idea']
['find', 'little', 'strange', 'try', 'force', 'commission', 'motion', 'resolution', 'ask', 'time', 'draw', 'green', 'paper', 'current', 'state', 'voluntary', 'insurance', 'supplementary', 'sickness']
['find', 'rather', 'odd', 'people', 'alrea

['know', 'france', 'principle', 'whole', 'seem', 'best', 'way', 'combat', 'phenomenon', '.']
['know', 'france', 'whole', 'herd', 'slaughter', 'seem', 'best', 'way', 'forward']
['urgency', 'decided', 'put', 'item', 'agenda']
['action', 'needed', 'quickly', 'decided', 'include', 'item', 'agenda']
['done', ':', 'avoided', 'budgetary', 'crisis', 'returning', 'article', '272', ';', 'financial', 'perspective', 'maintained', 'even', 'used', 'flexibility', 'instrument']
['done', ':', 'avoided', 'budgetary', 'crisis', 'going', 'article', '272', ';', 'financial', 'perspective', 'maintained', 'even', 'though', 'used', 'flexibility', 'instrument']
['establishment', 'provisional', 'unit', 'judicial', 'cooperation', 'step', 'right', 'direction', 'european', 'parliament', 'wants', 'long', 'time', 'ago']
['step', 'right', 'direction', 'european', 'parliament', 'striving', 'long', 'time', 'creation', 'provisional', 'unit', 'judicial', 'cooperation']
['appeal', 'urgently', 'french', 'presidency', 'every

['position', 'defended', 'rapporteur', 'shared', 'european', 'people', "'s", 'party']
['also', 'position', 'advocated', 'rapporteur', 'shared', 'european', 'people', "'", 'party']
['approval', 'minutes', 'previous', 'session']
['approval', 'minutes', 'previous', 'sitting']
['(', 'house', 'accepts', 'oral', 'amendment', ')']
['(', 'parliament', 'accepted', 'oral', 'amendment', ')']
['thirdly', 'acceptance', 'enlargement', 'positive', 'prospects', 'various', 'countries', 'eu', 'whole', 'dependent', '–', 'example', 'denmark', 'us', 'show', 'elimination', 'social', 'democratic', 'deficits', 'european', 'policy']
['thirdly', 'acceptance', 'enlargement', 'positive', 'prospects', 'individual', 'countries', 'eu', 'whole', 'basically', 'depend', '-', 'denmark', 'demonstrated', 'dismantling', 'social', 'democratic', 'deficits', 'european', 'policy']
['faced', 'criticism', 'especially', 'ireland', 'france', 'british', 'chancellor', 'exchequer', 'gave', 'different', 'interpretation', ':', 'tax', '

['faced', 'potential', 'risk', 'important', 'put', 'practice', 'precautionary', 'principle']
['faced', 'potential', 'risk', 'important', 'apply', 'precautionary', 'principle']
['speak', 'one', 'voice', 'grace', 'within', 'wto']
['shall', 'speak', 'one', 'disagreeable', 'voice', 'wto']
['pleased', 'proposed', 'arrangements', 'respect', 'committee', 'preparatory', 'decided', 'general', 'assembly', 'regard', 'mode', 'accreditation', 'governmental', 'organizations', 'process', 'extraordinary', 'session', 'september', '2001']
['welcome', 'arrangements', 'proposed', 'respect', 'preparatory', 'committee', 'adopted', 'general', 'assembly', 'regarding', 'procedures', 'registration', 'non-governmental', 'organisations', 'involvement', 'process', 'special', 'session', 'september', '2001']
['disgrace', 'think', 'european', 'union', 'presents', 'champion', 'modernisation', 'economic', 'life', '!']
['shameful', 'state', 'affairs', 'consider', 'eu', 'champion', 'modernised', 'business', 'practice']
[

['cooperation', 'support', 'kind', 'welcome', 'madam', 'president', 'ladies', 'gentlemen', 'french', 'presidency', 'thank']
['french', 'presidency', 'thanks', 'madam', 'president', 'ladies', 'gentlemen', 'cooperation', 'support', 'also', 'warm', 'reception', 'given', 'us']
['rule', 'also', 'apply', 'récréationnelles', 'dangerous', 'drugs', 'far', 'consumer', 'one', 'risks']
['rule', 'applied', 'equally', 'recreational', 'dangerous', 'drugs', 'one', 'person', 'consuming', 'drug', 'likely', 'affected']
['want', 'help', 'indonesia', 'intervene', 'money', 'concrete', 'actions', 'words']
['want', 'help', 'indonesia', 'must', 'act', 'money', 'deeds', 'merely', 'words']
['parliament', 'however', 'also', 'fought', 'reduction', 'funds', 'allocated', 'innovative', 'measures', 'compensated', 'use', 'framework', 'flexibilisation', 'defined', 'declaration', 'financial', 'perspective']
['house', 'also', 'fought', 'however', 'reduction', 'funds', 'available', 'innovative', 'measures', 'offset', 'mean

['done', ':', 'avoided', 'budgetary', 'crisis', 'returning', 'article', '272', ';', 'financial', 'perspectives', 'maintained', 'even', 'used', 'flexibility', 'instrument']
['done', ':', 'avoided', 'budgetary', 'crisis', 'going', 'article', '272', ';', 'financial', 'perspective', 'maintained', 'even', 'though', 'used', 'flexibility', 'instrument']
['rule', 'moreover', 'apply', 'drugs', 'récréationnelles', 'dangerous', 'provided', 'consumer', 'run', 'risks']
['rule', 'applied', 'equally', 'recreational', 'dangerous', 'drugs', 'one', 'person', 'consuming', 'drug', 'likely', 'affected']
['finally', 'word', 'fourth', 'pillar', 'dedicated', 'equal', 'opportunities', 'reinforces', 'integrated', 'approach', 'equality', 'gender', 'mainstreaming', 'specific', 'guidelines']
['finally', 'word', 'fourth', 'pillar', 'devoted', 'equal', 'opportunities', 'reinforces', 'integrated', 'approach', 'equality', 'gender', 'mainstreaming', 'specific', 'guidelines']
['speak', 'one', 'voice', 'disgracieuse', 'w

['speak', 'single', 'voice', 'language‒describes', 'within', 'wto']
['shall', 'speak', 'one', 'disagreeable', 'voice', 'wto']
['cooperation', 'support', 'also', 'reception', 'madam', 'president', 'ladies', 'gentlemen', 'europe', 'french', 'presidency', 'would', 'like', 'thank']
['french', 'presidency', 'thanks', 'madam', 'president', 'ladies', 'gentlemen', 'cooperation', 'support', 'also', 'warm', 'reception', 'given', 'us']
['priority', 'struggle', 'affirmation', 'fundamental', 'rights', 'found', 'expression', 'political', 'choices']
['priority', 'fight', 'order', 'ensure', 'affirmation', 'fundamental', 'rights', 'put', 'practice', 'political', 'decisions']
['would', 'also', 'wished', 'report', 'court', 'auditors', 'little', 'convivial', 'presented', 'clear', 'recommendations', '-', 'example', 'one', 'two', 'per', 'chapter']
['also', 'liked', 'court', 'auditors', "'", 'report', 'little', 'user-friendly', 'provided', 'number', 'clear', 'recommendations', 'example', 'one', 'two', 'per',

['currently', 'working', 'obtain', 'undeclared', 'employment', 'decent', 'social', 'conditions']
['present', 'engaged', 'illicit', 'work', 'would', 'obtain', 'decent', 'social', 'conditions']
['would', 'like', 'say', 'house', 'afternoon', 'must', 'lose', 'sight', 'fact', 'european', 'union', 'spends', '1', '%', 'gdp', 'member', 'states', 'bulk', 'money', 'administered']
['would', 'like', 'say', 'parliament', 'afternoon', 'lose', 'sight', 'fact', 'eu', 'spends', '1', '%', 'gdp', 'much', 'administered', 'member', 'states']
['spain', 'developed', 'way', 'positive', 'extremely', 'difficult', 'neighbourhood', 'always', 'existed', 'europe', 'north', 'africa', 'morocco']
['spain', 'done', 'magnificent', 'job', 'turning', 'round', 'difficult', 'neighbourly', 'relations', 'europe', 'north', 'africa', 'morocco', 'suffered', 'course', 'history']
['mr', 'president', 'voted', 'favour', 'report', 'garcía-margallo', 'marfil', 'concerns', 'taxation', 'services', 'provided', 'electronic', 'means']
['mr

['feira', 'recognised', 'quality', 'potential', 'candidates', 'accession', 'countries', 'participating', 'stabilisation', 'association', 'process']
['feira', 'council', 'considered', 'countries', 'participating', 'stabilisation', 'association', 'process', 'potential', 'applicants', 'membership', 'european', 'union']
['spain', 'developed', 'remarkably', 'positive', 'difficult', 'neighbourhood', 'always', 'existed', 'europe', 'north', 'africa', 'morocco']
['spain', 'done', 'magnificent', 'job', 'turning', 'round', 'difficult', 'neighbourly', 'relations', 'europe', 'north', 'africa', 'morocco', 'suffered', 'course', 'history']
['recent', 'incidents', 'greek', 'minority', 'himara', 'make', 'imperative', 'need', 'promote', 'measures', 'democratisation', 'strengthening', 'institutions', 'respect', 'human', 'rights', 'albania']
['urgent', 'need', 'following', 'recent', 'attacks', 'greek', 'minority', 'himara', 'promote', 'democratisation', 'measures', 'strengthen', 'institutions', 'respect', 

['create', 'clear', 'binding', 'framework', 'member', 'states', 'various', 'systems', 'meet', 'requirements', 'longer', 'period', 'put', 'test']
['define', 'clear', 'binding', 'framework', 'member', 'states', 'longer', 'evaluation', 'period', 'various', 'support', 'systems', 'required', 'satisfy', 'requirements']
['resolution', 'nice', 'summit', 'adopted', 'reflect']
['resolution', 'nice', 'voted', 'today', 'reflect']
['things', 'getting', 'worse', 'matter', 'women', 'distant', 'countries', 'accepted', 'work', 'necessity', 'alternative', 'continue', 'provide', 'vital', 'needs']
['worst', 'situation', 'women', 'concerned', 'also', 'come', 'distant', 'countries', 'taken', 'work', 'desperation', 'way', 'continuing', 'provide']
['know', 'france', 'principle', 'slaughter', 'whole', 'herd', 'applied', 'seem', 'best', 'way', 'combat', 'phenomenon']
['know', 'france', 'whole', 'herd', 'slaughter', 'seem', 'best', 'way', 'forward']
['unfortunately', 'final', 'objective', 'european', 'constituti

['find', 'little', 'strange', 'force', 'commission', 'motion', 'resolution', 'ask', 'time', 'draw', 'green', 'paper', 'current', 'state', 'voluntary', 'insurance', 'supplementary', 'sickness']
['find', 'rather', 'odd', 'people', 'already', 'trying', 'tie', 'commission', "'s", 'hands', 'relation', 'proposal', 'directive', 'calling', 'present', 'green', 'paper', 'current', 'situation', 'regard', 'optional', 'supplementary', 'health', 'insurance', 'schemes']
['believe', 'austrian', 'model', '’', '–', 'namely', 'bilateral', 'majority', 'decision-making', '-', 'future', 'instrument', 'take', 'decisions', 'ignoring', 'european', 'institutions', 'start', 'new', 'institution', 'group', 'get', 'round', 'unanimity', 'rule', 'council', '?']
['think', 'austrian', 'model', 'i.e', '.', 'bilateral', 'majority', 'resolutions', 'used', 'future', 'way', 'passing', 'bypass', 'european', 'institutions', 'forming', 'new', 'institution', 'group', 'order', 'circumvent', 'unanimity', 'council', '?']
['accepte

['things', 'worse', 'comes', 'women', "'s", 'distant', 'countries', 'accepted', 'work', 'need', 'option', 'continue', 'provide', 'vital', 'needs']
['worst', 'situation', 'women', 'concerned', 'also', 'come', 'distant', 'countries', 'taken', 'work', 'desperation', 'way', 'continuing', 'provide']
['furthermore', 'would', 'like', 'chance', 'prove', 'europe', 'able', 'develop', 'constitutional', 'equality', 'serbia', 'montenegro', 'yugoslavia', 'recognising', 'genuine', 'democratic', 'principles']
['hand', 'would', 'like', 'given', 'chance', 'prove', 'europe', 'able', 'develop', 'constitutional', 'equality', 'serbia', 'montenegro', 'within', 'yugoslavia', 'recognising', 'genuine', 'principles', 'democracy']
['continue', 'put', 'pressure', '.', 'money', 'earn', 'distribution', 'beyond', 'limit', '150', 'grammes', 'enables', 'strengthen', 'position']
['keep', 'pressure', 'strengthening', 'position', 'revenue', 'generating', 'delivering', 'mail', '150', 'grams']
['wonder', 'whether', 'police'

['lastly', 'subject', 'discussed', 'commission', 'invited', 'strength', 'make', 'use', 'procedure', 'accelerated', 'put', 'uvre', 'legislation']
['lastly', 'controversially', 'committee', 'strongly', 'called', 'use', 'fast-track', 'procedure', 'implement', 'legislation']
['safeguarding', 'universal', 'service', 'compensation', 'fund', 'would', 'make', 'possible', 'initiate', 'private', 'profits', 'benefit', 'public', 'probably', 'failure']
['unlikely', 'planned', 'protection', 'universal', 'provision', 'services', 'means', 'compensatory', 'fund', 'result', 'private', 'profits', 'ploughed', 'back', 'public', 'last']
['parliament', 'however', 'also', 'fund', 'actions', 'use', 'framework', 'defined', 'statement', 'financial', 'perspective', '.']
['house', 'also', 'fought', 'however', 'reduction', 'funds', 'available', 'innovative', 'measures', 'offset', 'means', 'resources', 'flexibility', 'instrument', 'demand', 'recorded', 'declaration', 'financial', 'perspective', 'interinstitutional',

['view', 'criticism', 'especially', 'ireland', 'france', 'chancellor', 'exchequer', 'given', 'new', 'interpretation', ':', 'tax', 'users', 'including', 'british', 'lorry', 'drivers']
['changed', 'face', 'criticism', 'especially', 'ireland', 'france', 'different', 'spin', 'put', 'british', 'chancellor', 'exchequer', ':', 'portrayed', 'user', 'charge', 'including', 'lorry', 'drivers']
['lesson', 'must', 'keep', 'assembly', 'debate', 'morning', 'forced', 'conclude', 'maritime', 'laws', 'world', 'state', 'mess', 'get', 'task', 'put', 'order', '.']
['lesson', 'parliament', 'morning', 'must', 'conclude', 'maritime', 'laws', 'throughout', 'world', 'state', 'shambles', 'begin', 'process', 'putting', 'right']
['currently', 'working', 'black', 'market', 'obtain', 'employment', 'decent', 'social', 'conditions']
['present', 'engaged', 'illicit', 'work', 'would', 'obtain', 'decent', 'social', 'conditions']
['addition', 'would', 'like', 'chance', 'prove', 'europe', 'position', 'alone', 'cause', 'ser

['however', 'would', 'like', 'say', 'house', 'afternoon', 'lose', 'sight', 'fact', 'european', 'union', 'spends', '1', '%', 'gdp', 'member', 'states', 'money', 'managed']
['would', 'like', 'say', 'parliament', 'afternoon', 'lose', 'sight', 'fact', 'eu', 'spends', '1', '%', 'gdp', 'much', 'administered', 'member', 'states']
['furthermore', 'would', 'like', 'able', 'demonstrate', 'europe', 'develop', 'constitutional', 'equality', 'serbia', 'montenegro', 'within', 'yugoslavia', 'recognising', 'true', 'democratic', 'principles']
['hand', 'would', 'like', 'given', 'chance', 'prove', 'europe', 'able', 'develop', 'constitutional', 'equality', 'serbia', 'montenegro', 'within', 'yugoslavia', 'recognising', 'genuine', 'principles', 'democracy']
['indeed', 'absolutely', 'certain', 'definition', 'fair', 'price', 'proposed', 'better', 'another', 'various', 'definitions', 'used', 'currently', 'member', 'states', 'enough', 'amply']
['means', 'certain', 'proposed', 'definition', 'equitable', 'price', 

['secondly', 'must', 'say', "'no", "'", 'concept', 'bang', 'would', 'like', 'see', 'time', 'admit', 'large', 'group', 'countries', 'contradiction', 'principle', 'admission', 'country', 'according', 'merits']
['secondly', 'say', 'big', 'bang', 'idea', 'incorporating', 'candidate', 'countries', 'simultaneously', 'large', 'group', 'since', 'would', 'militate', 'principle', 'country', 'admitted', 'merits']
['course', 'important', 'point', 'fact', 'started', 'heading', 'four', 'approach', 'know', 'yet', 'well', 'managed', 'solve', 'problem', 'particular', 'council', 'wish', 'take', 'part', 'sufficiently', 'reflexion']
['course', 'one', 'crucial', 'event', 'namely', 'start', 'made', 'category', 'four', 'looking', 'ways', 'deal', 'yet', 'entirely', 'clear', 'whether', 'managed', 'resolve', 'partly', 'council', 'refuses', 'sufficient', 'input', 'attendant', 'thought', 'processes']
['nigeria', 'encouraging', 'violence', 'employment', 'go', 'pay', 'soldiers', 'draw', 'demonstrators', 'basis', 'w

['want', 'help', 'indonesia', 'must', 'intervene', 'money', 'concrete', 'actions', 'words', '.']
['want', 'help', 'indonesia', 'must', 'act', 'money', 'deeds', 'merely', 'words']
['bit', 'compel', 'commission', 'motion', 'resolution', 'ask', 'time', 'establish', 'green', 'paper', 'current', 'state', 'voluntary', 'insurance', 'health', '.']
['find', 'rather', 'odd', 'people', 'already', 'trying', 'tie', 'commission', "'s", 'hands', 'relation', 'proposal', 'directive', 'calling', 'present', 'green', 'paper', 'current', 'situation', 'regard', 'optional', 'supplementary', 'health', 'insurance', 'schemes']
['course', 'important', 'point', 'fact', 'initiated', 'within', 'category', 'four', 'approach', 'yet', 'know', 'well', 'managed', 'solve', 'problem', 'least', 'council', 'wish', 'participate', 'enough', 'food', 'thought']
['course', 'one', 'crucial', 'event', 'namely', 'start', 'made', 'category', 'four', 'looking', 'ways', 'deal', 'yet', 'entirely', 'clear', 'whether', 'managed', 'resolv

['disgrace', 'believed', 'european', 'union', 'set', 'champion', 'modernisation', 'economic', 'life']
['shameful', 'state', 'affairs', 'consider', 'eu', 'champion', 'modernised', 'business', 'practice']
['mr', 'president', 'speech', 'shall', 'focus', 'report', 'mrs', 'lalumière', 'think', 'well', 'thought-out', 'clearly', 'formulated']
['mr', 'president', 'would', 'like', 'focus', 'speech', 'mrs', 'lalumière', "'", 'report', 'think', 'clearly', 'worded', 'well', 'put', 'together']
['company', 'unipersonnelle', 'society', 'relying', 'less', '250', 'employees', 'project', 'must', 'possible', 'receive', 'funding', 'guarantee', '120', '%', 'also', 'simple', 'basis', 'good', 'idea']
['one-man', 'company', 'fewer', '250', 'employees', 'project', 'need', 'provide', '120', '%', 'loan', 'guarantees', 'form', 'land', 'receive', 'financing', ';', 'also', 'possible', 'funded', 'basis', 'good', 'idea']
['must', 'true', 'brain', 'coordination', 'network', 'national', 'agencies', 'turn', 'activate', 

['objectives', 'primary', 'objective', 'secondary', 'represent', 'stone', 'hitting', 'enabling', 'us', 'judge', 'concrete', 'proposals', 'presented', 'commission']
['objectives', 'main', 'one', 'secondary', 'ones', 'touchstone', 'concrete', 'proposals', 'submitted', 'commission', 'must', 'judged']
['contrary', 'power', 'national', 'democracies', 'increasingly', 'restrict', 'right', 'veto', 'giving', 'parliament', 'eu']
['contrary', 'influence', 'national', 'democracies', 'reduced', 'restriction', 'right', 'veto', 'fact', 'european', 'parliament', 'given', 'power']
['party', 'issues', 'serious', 'reservations', 'connection', 'regulation', 'sales', 'products', 'consumption', 'means', 'community', 'legislation', 'like', 'reserves', 'concept', 'mutual', 'recognition', 'standards']
['-', 'party', 'serious', 'reservations', 'community', 'law', 'applying', 'sale', 'consumer', 'products', 'concept', 'mutual', 'recognition', 'standards']
['find', 'france', 'exemplary', 'committed', 'take', 'ste

['thank', 'much', 'mr', 'commissioner', '.']
['thank', 'much', 'commissioner']
['urgency', 'decided', 'put', 'issue', 'agenda']
['action', 'needed', 'quickly', 'decided', 'include', 'item', 'agenda']
['lesson', 'us', 'within', 'house', 'debate', 'morning', 'forced', 'conclude', 'laws', 'sea', 'world', 'state', 'chaos', 'must', 'get', 'work', 'bring', 'order']
['lesson', 'parliament', 'morning', 'must', 'conclude', 'maritime', 'laws', 'throughout', 'world', 'state', 'shambles', 'begin', 'process', 'putting', 'right']
['either', 'part', 'club', 'must', 'therefore', 'abide', 'decision']
['either', 'club', 'particularly', 'important', 'accept']
['right', 'government', 'remove', 'arbitrarily', 'constitution', 'defining', 'characteristic', 'tyranny']
['right', 'government', 'arbitrarily', 'set', 'aside', 'constitution', 'defining', 'characteristic', 'tyranny']
['indeed', 'convinced', 'definition', 'fair', 'price', 'proposed', 'better', 'another', 'different', 'definitions', 'used', 'member',

In [19]:
scores_train[0:5]

['4', '4', '3', '3', '2']

## Models

In this section we fit our feature set to a model.

### Decision Tree

In [20]:
dt_classifier = DecisionTreeClassifier(random_state=14, max_depth=8)
dt_classifier.fit(data,scores_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=14,
            splitter='best')

In [21]:
print(f'Nodes: {dt_classifier.tree_.node_count}')
print(f'Max Depth: {dt_classifier.tree_.max_depth}')
print(f'Accuracy: {dt_classifier.score(data, scores_train)}')

Nodes: 239
Max Depth: 8
Accuracy: 0.7109164420485176


### XGBoost

In [22]:
xgboost_model = XGBClassifier(booster='gbtree', 
                       n_estimators=1000,
                       n_jobs=4,
                       learning_rate=.05,
                       max_depth=4,
                       random_state=42,
                       gamma=.05,
                       early_stopping_rounds = 5)



In [28]:
xgboost_model = XGBClassifier(n_jobs=4, n_estimators=1000, gamma=.05, random_state=42)
xgboost_model.fit(np.asarray(data), np.asarray(scores_train))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0.05,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=4,
       nthread=None, objective='multi:softprob', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

### Random Forest

In [29]:
rf_classifier = RandomForestClassifier(random_state=14, n_estimators=100)
rf_classifier.fit(data,scores_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=14, verbose=0, warm_start=False)

# Testing

In [None]:
s1_arr_dev, s2_arr_dev, scores_dev, s1_tokens_dev, s2_tokens_dev, s1_lemmas_dev,s2_lemmas_dev = preprocess("./data/dev-set.txt")
dev_data = pipeline(s1_arr_dev, s2_arr_dev, s1_tokens_dev, s2_tokens_dev, s1_lemmas_dev,s2_lemmas_dev)
dt_dev_predictions = dt_classifier.predict(dev_data)
rf_dev_predictions = rf_classifier.predict(dev_data)
xgb_dev_predictions = xgboost_model.predict(np.asarray(dev_data))
dev_data[0:5]

['leaders', 'benefit', 'aujourd', "'", 'hui', 'new', 'luck', 'let', "'s", 'therefore', 'seize']
['leaders', 'given', 'new', 'chance', 'let', 'us', 'hope', 'seize']
['amendment', '7', 'proposing', 'certain', 'changes', 'references', 'paragraphs']
['amendment', '7', 'proposes', 'certain', 'changes', 'references', 'paragraphs']
['would', 'like', 'remind', 'among', 'allies', 'strong', 'tax']
['let', 'remind', 'allies', 'include', 'fervent', 'supporters', 'tax']
['vote', 'take', 'place', '5.30pm']
['vote', 'take', 'place', 'today', '5.30', 'p.m']
['fishermen', 'inactive', 'tired', 'disappointed']
['fishermen', 'inactive', 'tired', 'disappointed']
['majority', 'voting', 'parliament', 'go', 'back', 'article', '272']
['neither', 'qualified', 'majority', 'within', 'house', 'revert', 'article', '272']
['effect', 'augmenting', 'potency', 'big', 'countries', 'detriment', 'babies']
['increases', 'power', 'big', 'countries', 'expense', 'small']
['fishers', 'inactive', 'tired', 'disappointed']
['fish

['mr', 'van', 'orden', 'report', '(', 'a5-0241', '/', '2000', ')']
['van', 'orden', 'report', '(', 'a5-0241/2000', ')']
['mr.', 'president', 'cashman', 'report/ratio', 'summarize', 'words', ':', 'capacity', 'citizens', 'bureaucracy']
['mr', 'president', 'cashman', 'report', 'summarised', 'four', 'words', ':', 'citizens', "'", 'power', 'bureaucracy']
['amendment', '7', 'proposes', 'certain', 'modifications', 'references', 'paragraphs']
['amendment', '7', 'proposes', 'certain', 'changes', 'references', 'paragraphs']
['users', 'losers', 'employees', 'european', 'competitiveness', 'growth', 'régresseront']
['consumers', 'lose', 'employees', 'europe', 'competitive', 'strength', 'growth']
['duty', 'continue', 'support', 'latvia', 'question', 'integration', 'russian', 'population']
['job', 'continue', 'support', 'latvia', 'integration', 'russian', 'population']
['(', 'parliament', 'adopted', 'legislative', 'resolution', ')']
['(', 'parliament', 'adopted', 'legislative', 'resolution', ')']
['r

['maybe', 'could', 'avoided', 'catastrophe']
['perhaps', 'could', 'avoided', 'catastrophe']
['tunisia']
['tunisia']
['mr', 'president', 'report', 'cashman', "'s", 'summarised', 'words', ':', 'power', 'people', 'tape']
['mr', 'president', 'cashman', 'report', 'summarised', 'four', 'words', ':', 'citizens', "'", 'power', 'bureaucracy']
['unanimity', 'council', 'therefore', 'covert', 'inability', 'act']
['unanimous', 'decisions', 'hence', 'inherent', 'incapacity', 'act', 'remain', 'largely', 'norm', 'council']
['fishermen', 'inactive', 'tired', 'disappointed']
['fishermen', 'inactive', 'tired', 'disappointed']
['mister', 'president', 'anxious', 'take', 'defense', 'workers', 'sector']
['mr', 'president', 'rise', 'defence', 'workers', 'industry']
['(', 'parliament', 'adopts', 'legislative', 'resolution', ')']
['(', 'parliament', 'adopted', 'legislative', 'resolution', ')']
['users', 'losers', 'employees', 'european', 'competitiveness', 'growth', 'régresseront']
['consumers', 'lose', 'employ

['part', 'parliamentarians', 'rather', 'present', 'honourably']
['one', 'members', 'attends', 'sittings', 'quite', 'faithfully']
['mister', 'president', 'report', 'cashman', 'summarize', 'words', ':', 'strength', 'citizens', 'bureaucracy']
['mr', 'president', 'cashman', 'report', 'summarised', 'four', 'words', ':', 'citizens', "'", 'power', 'bureaucracy']
['european', 'union', 'involved', 'soon']
['european', 'union', 'got', 'something', 'quickly']
['one', 'members', 'quite', 'honourably']
['one', 'members', 'attends', 'sittings', 'quite', 'faithfully']
['fishermen', 'inactive', 'tired', 'disappointed']
['fishermen', 'inactive', 'tired', 'disappointed']
['right', 'european', 'union', 'entails', 'quickly']
['european', 'union', 'got', 'something', 'quickly']
['mr', 'president', 'want', 'stand', 'workers', 'area']
['mr', 'president', 'rise', 'defence', 'workers', 'industry']
['unit', 'must', 'balance']
['must', 'balance', 'whole']
['keep', 'l', "'", 'unanimity', 'council', 'therefore', '

['others', 'unfortunately', 'diverge', 'response', 'rancune', 'gained']
['unfortunately', 'others', 'separate', 'basis', 'accumulated', 'hatred']
['fishermen', 'inactive', 'tired', 'disappointed']
['fishermen', 'inactive', 'tired', 'disappointed']
['mr', 'president', 'wish', 'take', 'protection', 'workers', 'sector']
['mr', 'president', 'rise', 'defence', 'workers', 'industry']
['vote', 'take', 'place', '17', ':', '30']
['vote', 'take', 'place', 'today', '5.30', 'p.m']
['standards', 'hardly', 'comparable', 'still', 'less', 'transferable']
['standards', 'scarcely', 'comparable', 'let', 'alone', 'transferable']
['effect', 'increasing', 'power', 'large', 'countries', 'detriment', 'small']
['increases', 'power', 'big', 'countries', 'expense', 'small']
['could', 'perhaps', 'able', 'prevent', 'disaster']
['perhaps', 'could', 'avoided', 'catastrophe']
['whole', 'must', 'balance']
['must', 'balance', 'whole']
['already', 'explained', 'us', 'second', 'reading', 'crisis', 'root', 'amendment', 'd

['point', 'procedure', 'opens', 'door', 'arbitrary']
['provision', 'could', 'open', 'door', 'wide', 'arbitrariness']
['qualified', 'majority', 'parliament', 'go', 'back', 'article', '272']
['neither', 'qualified', 'majority', 'within', 'house', 'revert', 'article', '272']
['specific', 'aid', 'market', 'premium', 'grass', 'essential']
['selective', 'aid', 'market', 'support', 'grass', 'subsidy', 'essential']
['duty', 'continue', 'support', 'latvia', 'issue', 'integration', 'russian', 'people']
['job', 'continue', 'support', 'latvia', 'integration', 'russian', 'population']
['vote', 'take', 'place', '17h30']
['vote', 'take', 'place', 'today', '5.30', 'p.m']
['could', 'perhaps', 'avoided', 'catastrophe']
['perhaps', 'could', 'avoided', 'catastrophe']
['tunisia']
['tunisia']
['standards', 'hardly', 'comparable', 'still', 'less', 'transferable']
['standards', 'scarcely', 'comparable', 'let', 'alone', 'transferable']
['might', 'avoid', 'disaster']
['perhaps', 'could', 'avoided', 'catastrophe

['already', 'explained', 'second', 'reading', 'crisis', 'located', 'basis', 'modification', 'directive']
['already', 'explained', 'second', 'reading', 'crisis', 'underlying', 'directive', 'amendment']
['report/ratio', 'maij-weggen', '(', 'a5-0323/2000', ')']
['maij-weggen', 'report', '(', 'a5-0323/2000', ')']
['make', 'point', 'recalling', 'among', 'allies', 'enthusiasts', 'tax']
['let', 'remind', 'allies', 'include', 'fervent', 'supporters', 'tax']
['users', 'losers', 'employees', 'european', 'competitiveness', 'growth', 'diminish']
['consumers', 'lose', 'employees', 'europe', 'competitive', 'strength', 'growth']
['already', 'explained', 'second', 'reading', 'crisis', 'basis', 'amendment', 'directive']
['already', 'explained', 'second', 'reading', 'crisis', 'underlying', 'directive', 'amendment']
['users', 'losers', 'employees', 'european', 'competitiveness', 'growth', 'diminish']
['consumers', 'lose', 'employees', 'europe', 'competitive', 'strength', 'growth']
['mr', 'president', 'wo

['butterflies', 'housed', 'earlier', 'light', 'cycle', '1', 'a.m.', 'p.m.', 'flew', 'toward', 'southeast']
['butterflies', 'exposed', 'earlier', 'light', 'cycle', '1am', '1pm', 'orientated', 'towards', 'south-east']
['good', 'reputation', "''", 'rt', 'jones', 'analyst', 'juli', 'niemann', 'said', 'grant']
['rt', 'jones', 'analyst', 'juli', 'niemann', 'said', 'grant', 'one', 'pulling']
['france', 'infuriated', 'washington', 'leading', 'charge', 'u.n.', 'approval', 'war', 'also', 'sought', 'changes']
['key', 'question', 'whether', 'france', 'infuriated', 'washington', 'leading', 'charge', 'u.n.', 'authorization', 'war', 'would', 'vote', 'yes', "''", 'abstain']
['mel', 'gibson', 'negotiating', 'newmarket', 'films', 'distribute', 'embattled', 'biblical', 'epic', 'passion', 'christ', "''", 'united', 'states']
['mel', 'gibson', "'s", 'passion-stirring', 'biblical', 'epic', 'passion', 'christ', "''", 'open', 'united', 'states', 'feb.', '25', '-', 'ash', 'wednesday', 'roman', 'catholic', 'cale

['wire', 'transfers', 'four', '40', 'accounts', 'open', 'beacon', 'hill', 'totaled', '$', '3.2', 'billion', '2001', '2002', 'morgenthau', 'said']
['2001', '2002', 'wire', 'transfers', '4', 'company', "'s", '40', 'accounts', 'totaled', '$', '3.2', 'billion', 'prosecutors', 'said']
['state', 'university', 'stony', 'brook', 'launched', 'study', '1996', 'earlier', 'studies', 'indicated', 'possible', 'connection']
['stony', 'brook', 'university', 'launched', 'study', '1996', 'earlier', 'studies', 'indicated', 'possible', 'connection', 'electromagnetic', 'fields', 'cancer']
['earlier', 'bashir', "'s", 'supporters', 'defied', 'police', 'order', 'cried', 'allahu', 'akbar', '(', 'god', 'greatest', ')', "''", 'walked', 'seat']
['earlier', 'defied', 'police', 'order', 'cried', 'allahu', 'akbar', "''", '(', 'god', 'greatest', ')', 'bashir', 'walked', 'seat', 'tightly', 'guarded', 'courtroom']
['spokesman', 'james', 'howe', 'said', 'five', 'children', 'aged', '4', '17', 'taken', 'hospital', 'neck',

In [None]:
# make sure our lengths match up
print(len(scores_dev))
print(len(dt_dev_predictions))
print(len(rf_dev_predictions))
print(len(xgb_dev_predictions))

In [None]:
def calc_accuracy(predictions, scores):
    correct = 0
    for i in range(0, len(predictions)): 
        if predictions[i] == scores[i]:
            correct += 1
    return correct / len(predictions) 



    
print('DECISION TREE: ', calc_accuracy(dt_dev_predictions, scores_dev))
print('RANDOM FOREST: ', calc_accuracy(rf_dev_predictions, scores_dev))
print('XGBOOST: ', calc_accuracy(xgb_dev_predictions, scores_dev))