In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

D:\Anaconda\setup\lib\site-packages\numpy\.libs\libopenblas.IPBC74C7KURV7CB2PKT5Z5FNR3SIBV4J.gfortran-win_amd64.dll
D:\Anaconda\setup\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
  stacklevel=1)


In [5]:
# Load the token-level NER corpus; the encoding is given explicitly because
# the file is not UTF-8.
df = pd.read_csv(
    'E:\\data\\nlp\\ner\\ner_dataset.csv',
    encoding="ISO-8859-1",
)
# Optional: uncomment to iterate on a 100k-row subset.
#df = df[:100000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [6]:
# Count the missing values in every column.
print(df.isnull().sum())
# Forward fill: the sentence label is only present on the first token of each
# sentence, so propagate the most recent valid value down to the rows below.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`.
df = df.ffill()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64


In [7]:
# Number of distinct values in each column, in the order
# (sentence labels, words, POS tags, NER tags).
tuple(df[column].nunique() for column in ('Sentence #', 'Word', 'POS', 'Tag'))

(47959, 35178, 42, 17)

In [8]:
# Count how many tokens carry each tag and list the tags from most to least frequent.
(
    df.groupby('Tag')
      .size()
      .reset_index(name='num')
      .sort_values('num', ascending=False)
)

Unnamed: 0,Tag,num
16,O,887908
2,B-geo,37644
7,B-tim,20333
5,B-org,20143
14,I-per,17251
6,B-per,16990
13,I-org,16784
3,B-gpe,15870
10,I-geo,7414
15,I-tim,6528


In [9]:
# Import sklearn-crfsuite, the CRF package that exposes a scikit-learn-style API
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [10]:
# Data preprocessing: regroup the flat token table into per-sentence sequences.
class SentenceTransform(object):
    """Group a token-level frame into sentences of (word, POS, tag) triples.

    After construction, ``sentences`` holds every sentence as a list of
    triples and ``get_next()`` walks the sentences by their number.
    """

    def __init__(self, data):
        """Group ``data`` by its 'Sentence #' column.

        Args:
            data: DataFrame with 'Sentence #', 'Word', 'POS' and 'Tag' columns.
        """
        self.n_sent = 1       # number of the sentence that get_next() returns next
        self.data = data
        self.empty = False    # not read anywhere in this class; kept for compatibility

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the group, as plain Python values.
            return list(zip(s['Word'].values.tolist(),
                            s['POS'].values.tolist(),
                            s['Tag'].values.tolist()))

        # Select the token columns explicitly so the grouping column is not
        # handed to `apply` (deprecated behaviour in recent pandas releases).
        # NOTE: groupby sorts its keys, so the result follows the string order
        # of the sentence labels rather than their order in the file.
        self.grouped = (
            self.data.groupby('Sentence #')[['Word', 'POS', 'Tag']].apply(agg_func)
        )
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence labelled 'Sentence: <n_sent>' and advance the counter.

        Returns:
            The list of (word, POS, tag) triples, or None when no sentence
            with that label exists (the counter is then left unchanged).
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # (including KeyboardInterrupt) is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [11]:
# Convert the token table into sentence form, then preview the first
# sentence and the first three entries of the full sentence list.
t_result = SentenceTransform(df)
sentences = t_result.sentences

sent = t_result.get_next()
print(sent)
print(sentences[:3])

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]
[[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops

In [12]:
# Feature extraction
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: sequence of tuples whose first two items are (word, postag).
        i: index of the token to describe.

    Returns:
        dict of features for the token itself plus its left and right
        neighbours; 'BOS' / 'EOS' flags stand in for a missing neighbour.
    """
    token, pos = sent[i][0], sent[i][1]

    # Features of the token itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # (key prefix, boundary flag, neighbour exists?, neighbour index)
    neighbours = (
        ('-1:', 'BOS', i > 0, i - 1),
        ('+1:', 'EOS', i < len(sent) - 1, i + 1),
    )
    for prefix, edge_flag, present, j in neighbours:
        if not present:
            feats[edge_flag] = True
            continue
        near_word, near_pos = sent[j][0], sent[j][1]
        feats[prefix + 'word.lower()'] = near_word.lower()
        feats[prefix + 'word.istitle()'] = near_word.istitle()
        feats[prefix + 'word.isupper()'] = near_word.isupper()
        feats[prefix + 'postag'] = near_pos
        feats[prefix + 'postag[:2]'] = near_pos[:2]

    return feats

def sent2features(sent):
    """Return the list of per-token feature dicts for every position in ``sent``."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label of every (token, postag, label) triple in ``sent``."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token of every (token, postag, label) triple in ``sent``."""
    tokens = []
    for word, _pos, _tag in sent:
        tokens.append(word)
    return tokens

In [19]:
# Build the feature and label sequences for every sentence, then hold out
# 25% of the sentences as a test set (fixed seed for a repeatable split).
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0,
)
print(y_train[:3])

[['O', 'B-gpe', 'O', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O']]


In [20]:
# Train the CRF on the training split using the L-BFGS algorithm.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation strengths;
# confirm against the sklearn-crfsuite documentation.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', max_iterations=100,
    c1=0.1, c2=0.1,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [21]:
# Collect the sorted list of distinct tag labels.
classes = np.unique(df.Tag.values).tolist()
print(classes)
# Drop the "O" tag by value instead of popping the last element: pop() only
# works while "O" happens to sort last, and would otherwise silently remove a
# real entity tag.
classes = [tag for tag in classes if tag != 'O']
print(classes)

['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']
['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim']


In [22]:
# Predict tags for the test sentences and compute the weighted F1 over the
# tags listed in `classes`.
y_pred = crf.predict(X_test)
F1 = metrics.flat_f1_score(
    y_test, y_pred,
    labels=classes,
    average='weighted',
)
print("F1:{0}".format(F1))

F1:0.8512087682284608


In [23]:
# Per-tag precision / recall / F1 on the test split, restricted to `classes`.
report = metrics.flat_classification_report(y_test, y_pred, labels=classes)
print(report)

             precision    recall  f1-score   support

      B-art       0.45      0.12      0.19       111
      B-eve       0.47      0.37      0.41        79
      B-geo       0.86      0.91      0.89      9480
      B-gpe       0.97      0.94      0.96      3987
      B-nat       0.85      0.48      0.62        58
      B-org       0.80      0.74      0.77      5041
      B-per       0.85      0.83      0.84      4283
      B-tim       0.92      0.88      0.90      5240
      I-art       0.17      0.05      0.08        83
      I-eve       0.36      0.29      0.32        73
      I-geo       0.82      0.80      0.81      1931
      I-gpe       0.94      0.62      0.74        52
      I-nat       0.75      0.47      0.58        19
      I-org       0.81      0.80      0.81      4284
      I-per       0.85      0.90      0.87      4349
      I-tim       0.83      0.74      0.78      1673

avg / total       0.86      0.85      0.85     40743



In [24]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned line per ((label_from, label_to), weight) entry.

    Args:
        trans_features: iterable of ((label_from, label_to), weight) pairs.
    """
    for pair, weight in trans_features:
        label_from, label_to = pair
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

# Rank every learned transition weight once (highest first), then show the
# 20 strongest and the 20 weakest transitions.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:20])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

Top likely transitions:
B-nat  -> I-nat   7.123397
B-art  -> I-art   6.029598
I-art  -> I-art   5.938002
B-eve  -> I-eve   5.914369
I-eve  -> I-eve   5.640564
I-gpe  -> I-gpe   4.932274
I-tim  -> I-tim   4.588277
B-geo  -> I-geo   4.531746
I-org  -> I-org   4.411837
B-tim  -> I-tim   4.309793
B-org  -> I-org   4.176296
B-gpe  -> I-gpe   3.810566
I-geo  -> I-geo   3.647574
O      -> O       3.573804
B-per  -> I-per   3.560070
I-nat  -> I-nat   3.312426
I-per  -> I-per   2.715547
O      -> B-per   1.585798
O      -> B-tim   1.506052
I-geo  -> B-art   1.251252

Top unlikely transitions:
B-tim  -> I-org   -4.118340
B-geo  -> B-geo   -4.165929
I-per  -> I-org   -4.192177
B-per  -> I-org   -4.459603
B-tim  -> B-tim   -4.482683
B-gpe  -> I-per   -4.581899
O      -> I-art   -4.804808
B-gpe  -> I-geo   -4.906594
I-org  -> I-per   -4.908821
B-geo  -> I-per   -4.991151
B-geo  -> I-org   -5.113602
B-org  -> I-per   -5.313064
B-gpe  -> I-org   -5.609349
I-per  -> B-per   -6.042720
B-gpe  -> B-gpe  

In [25]:
import eli5

# Display the trained CRF's weights with eli5, limited via `top=10`.
# NOTE(review): judging by the rendered output this shows the transition
# matrix plus the ten highest-weighted features per tag; confirm in eli5 docs.
eli5.show_weights(crf, top=10)

  from numpy.core.umath_tests import inner1d


From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.574,0.035,-4.805,0.554,-3.781,0.912,-7.657,-0.164,-3.646,0.51,-2.27,0.808,-7.384,1.586,-6.592,1.506,-7.163
B-art,-0.236,0.0,6.03,0.0,-0.213,-0.816,-1.342,-1.397,-0.653,0.0,0.0,0.0,-2.214,-1.299,-2.235,0.001,-1.332
I-art,-0.293,-0.46,5.938,0.0,0.0,-0.238,-1.239,-0.84,-0.299,0.0,0.0,-1.093,-1.77,-1.005,-2.11,-0.981,-1.088
B-eve,-0.553,0.0,-0.274,-0.139,5.914,-1.211,-1.255,-1.188,-0.393,-0.356,0.0,-1.554,-1.838,-1.973,-1.961,0.352,-1.435
I-eve,0.18,0.0,0.0,-2.128,5.641,-0.935,-0.898,-0.408,0.0,0.0,0.0,-0.906,-1.33,-1.322,-1.378,-1.824,-1.304
B-geo,0.266,0.656,-2.33,-1.173,-2.149,-4.166,4.532,-0.119,-4.064,-0.392,-0.621,-0.524,-5.114,-1.685,-4.991,1.222,-3.887
I-geo,-0.009,1.251,-1.402,-0.358,-1.242,-3.082,3.648,-1.95,-2.4,0.0,-0.45,-0.662,-3.861,-1.056,-3.729,0.446,-2.941
B-gpe,0.537,-1.409,-2.08,-1.288,-2.477,-0.142,-4.907,-6.237,3.811,-0.648,-0.891,0.847,-5.609,-0.609,-4.582,-0.426,-3.476
I-gpe,0.158,0.0,0.0,0.0,0.0,-0.268,-1.286,-0.989,4.932,0.0,0.0,-1.993,-1.328,-0.276,-1.177,-0.711,-0.73
B-nat,-0.478,0.0,0.0,0.0,0.0,-0.409,-0.397,-0.749,0.0,-0.336,7.123,-0.293,-0.886,-0.763,-1.579,0.0,-0.243

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+7.691,word.lower():last,,,,,,,,,,,,,,,
+7.523,word.lower():month,,,,,,,,,,,,,,,
+5.784,word.lower():chairman,,,,,,,,,,,,,,,
+5.626,word.lower():columbia,,,,,,,,,,,,,,,
+5.623,BOS,,,,,,,,,,,,,,,
+5.309,word.lower():year,,,,,,,,,,,,,,,
+5.289,word.lower():internet,,,,,,,,,,,,,,,
… 7405 more positive …,… 7405 more positive …,,,,,,,,,,,,,,,
… 4430 more negative …,… 4430 more negative …,,,,,,,,,,,,,,,
-4.742,word.lower():multi-party,,,,,,,,,,,,,,,

Weight?,Feature
+7.691,word.lower():last
+7.523,word.lower():month
+5.784,word.lower():chairman
+5.626,word.lower():columbia
+5.623,BOS
+5.309,word.lower():year
+5.289,word.lower():internet
… 7405 more positive …,… 7405 more positive …
… 4430 more negative …,… 4430 more negative …
-4.742,word.lower():multi-party

Weight?,Feature
+4.864,word.lower():spanish
+4.326,word.lower():twitter
+4.113,+1:word.lower():enkhbayar
+3.903,+1:word.lower():boots
+3.851,word.lower():nevirapine
+3.802,-1:word.lower():engine
+3.698,word.lower():spaceshipone
+3.683,word.lower():english
+3.284,word.lower():russian
+3.051,word[-3:]:One

Weight?,Feature
+2.721,+1:word.lower():came
+2.690,-1:word.lower():boeing
+2.589,+1:word.lower():gained
+2.228,-1:word.lower():cajun
+2.078,+1:word.lower():reports
+1.971,+1:word.lower():expands
+1.901,-1:word.lower():hitler
+1.878,+1:word.lower():teshome
+1.863,word.lower():pound
+1.821,+1:word.lower():rival

Weight?,Feature
+3.819,word.lower():games
+3.675,word.lower():ramadan
+3.462,-1:word.lower():war
+3.139,+1:word.lower():men
+3.044,word.lower():katrina
+2.911,-1:word.lower():wars
+2.855,-1:word.lower():first
+2.832,+1:word.lower():dean
+2.831,word.lower():olympic
+2.827,word[-3:]:pic

Weight?,Feature
+3.910,+1:word.lower():mascots
+3.281,word.lower():games
+2.853,word.lower():series
+2.492,+1:word.lower():tore
+2.439,+1:word.lower():rally
+2.391,+1:word.lower():era
+2.304,+1:word.lower():without
+2.275,word.lower():dean
+2.246,+1:word.lower():caused
+2.162,+1:word.lower():suicide

Weight?,Feature
+5.597,word.lower():mid-march
+5.188,word.lower():caribbean
+5.071,word.lower():martian
+4.984,word.lower():beijing
+4.703,word.lower():europe
+4.295,word.lower():quake-zone
+4.244,+1:word.lower():phoned
+4.231,word.lower():burma
+4.198,word.lower():paris
… 5117 more positive …,… 5117 more positive …

Weight?,Feature
+3.939,word.lower():holiday
+3.891,+1:word.lower():regional
+3.399,word.lower():city
+3.226,word.lower():shogunate
+3.226,-1:word.lower():tokugawa
+3.085,+1:word.lower():produced
+2.948,+1:word.lower():block
+2.874,word.lower():caribbean
+2.856,word.lower():forces
+2.684,-1:word.lower():christmas

Weight?,Feature
+6.730,word.lower():nepal
+6.567,word.lower():afghan
+6.291,word.lower():german
+6.092,word.lower():niger
+5.244,word.lower():israeli
+4.969,word.lower():palestinian
+4.837,word.lower():iranian
+4.747,word.lower():spaniard
+4.669,word.lower():croats
+4.622,word.lower():iraqi

Weight?,Feature
+4.590,+1:word.lower():mayor
+3.811,-1:word.lower():bosnian
+3.676,-1:word.lower():democratic
+3.592,-1:word.lower():soviet
+3.004,word.lower():cypriots
+2.960,+1:word.lower():returned
+2.825,word.lower():city
+2.777,+1:word.lower():health
+2.623,+1:word.lower():also
+2.602,+1:word.lower():under

Weight?,Feature
+5.120,word.lower():katrina
+5.042,word.lower():marburg
+3.775,word.lower():rita
+2.746,word.lower():ebola
+2.703,word.lower():leukemia
+2.692,word[-3:]:mia
+2.643,word.lower():h5n1
+2.643,word[-3:]:5N1
+2.642,word[-2:]:N1
+2.629,word[-3:]:ita

Weight?,Feature
+2.541,+1:word.lower():relief
+2.310,word.lower():rita
+2.300,word[-3:]:ita
+2.066,-1:word.lower():hurricanes
+1.731,+1:word.lower():rita
+1.707,word.lower():flu
+1.671,+1:word.lower():slammed
+1.654,word[-2:]:ta
+1.543,-1:postag:NN
+1.467,word[-2:]:lu

Weight?,Feature
+7.065,word.lower():philippine
+6.078,word.lower():al-qaida
+5.902,word.lower():hamas
+5.164,-1:word.lower():rice
+4.919,word.lower():taleban
+4.645,word.lower():hezbollah
+4.512,word.lower():reuters
+4.485,word.lower():conocophillips
+4.275,-1:word.lower():senator
+4.271,-1:word.lower():nepal

Weight?,Feature
+3.963,+1:word.lower():attained
+3.871,+1:word.lower():hilary
+3.821,-1:word.lower():english
+3.718,-1:word.lower():associated
+3.516,-1:word.lower():people
+3.475,word.lower():times
+3.420,+1:word.lower():ohlmert
+3.416,word.lower():member-countries
+3.344,-1:word.lower():decathlon
… 5665 more positive …,… 5665 more positive …

Weight?,Feature
+6.325,word.lower():president
+5.998,word.lower():obama
+5.816,word.lower():vice
+4.753,word.lower():prime
+4.555,word.lower():clinton
+4.457,word.lower():senator
+4.401,word.lower():hall
+4.336,+1:word.lower():acceded
+4.226,word.lower():johnston
+4.136,word.lower():milosevic

Weight?,Feature
+3.605,+1:word.lower():recep
+3.219,-1:word.lower():condoleezza
+3.112,+1:word.lower():fighters
+3.034,+1:word.lower():gao
+2.849,-1:postag:NN
+2.707,+1:word.lower():timothy
+2.696,+1:word.lower():legally
+2.535,-1:word.lower():'
+2.510,-1:word.lower():michael
… 4430 more positive …,… 4430 more positive …

Weight?,Feature
+6.771,word.lower():multi-candidate
+6.320,word.lower():january
+6.297,word.lower():february
+5.796,word.lower():2000
+5.752,word.lower():weekend
+5.692,word[-3:]:Day
+5.516,word.lower():one-year
+5.348,word.lower():august
+5.324,+1:word.lower():week
+4.879,word.lower():june

Weight?,Feature
+4.467,+1:word.lower():stocky
+4.319,-1:word.lower():this
+4.135,+1:word.lower():month
+4.065,word.lower():working-age
+3.883,+1:word.lower():old
+3.619,+1:word.lower():jose
+3.569,-1:word.lower():second
+3.534,-1:word.lower():past
+3.495,+1:word.lower():reflected
+3.467,+1:word.lower():early
