## Data Prep

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import sqlite3 as sqlite
con = sqlite.connect('test.sqlite3')

In [3]:
df = pd.read_sql_query("SELECT * from Documents", con)
con.close()

In [4]:
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,ED_ENC_NUM,NOTE_TEXT,Score,Category
12529,12341,"\nBELGIUM CUTS TREASURY CERTIFICATE RATES BRUSSELS, April 3 - The Belgian National Bank cut interest\nrates on one, two and three-month treasury certificates to 7.30\npct from 7.40 pct effectiv...",0,
19625,19626,"\nAVON <AVP> SEES HIGHER 4TH QTR, 1987 EARNINGS NEW YORK, Oct 20 - Avon Products Inc, which earlier\nreported lower third quarter profits, said its fourth quarter\nand full year earnings will e...",0,
4522,4145,"\nLAMSON AND SESSIONS CO <LMS> 4TH QTR LOSS NEW YORK, March 11 -\n Oper shr loss 26 cts vs profit five cts\n Oper net loss 1,506,000 vs profit 312,000\n Revs 42 mln vs 27.9 mln\n Ye...",0,
572,573,"\nALCAN ALUMINIUM LTD <AL> SETS STOCK SPLIT MONTREAL, March 2 - Alcan Aluminium Ltd said its board\ndeclared a three-for-two stock split, subject to shareholder\napproval at the April 23 annual...",0,
17891,17829,"\nMITSUBISHI HEAVY BUILDS ENERGY-SAVING TANKER TOKYO, June 16 - Mitsubishi Heavy Industries Ltd <MITH.T>\nsaid it began building the world's most advanced energy-saving\ntanker, which consumes ...",0,
12285,12097,"\nBRITISH TELECOM TO SELL EQUIPMENT IN NORTH AMERICA LONDON, April 2 - British Telecommunications Plc <BTY.L>\nwill market electronic data transmission equipment of its own\ndesign in North Ame...",0,
9026,8838,"\nNASD PRESIDENT LEAVES FOR HAMBRECHT AND QUIST NEW YORK, March 24 - <Hambrecht and Quist Group> said\nGordon Macklin has resigned as president of the <National\nAssociation of Securities Deale...",0,
12359,12171,"\nNO EXTENSION ON U.S. DAIRY HERD BUYOUT - LYNG WASHINGTON, April 2 - U.S. Agriculture Secretary Richard\nLyng said he would not agree to an extension of the 18-month\nwhole dairy herd buyout p...",0,
3316,2939,"\nGERMAN SECURITIES PURCHASES SET RECORD IN JANUARY FRANKFURT, March 6 - Purchases of West German bonds in\nJanuary reached a record 13 billion marks' worth as a result of\ninvestment from othe...",0,
14581,14393,"\nJAPAN, BRITAIN TO EXCHANGE SECURITIES MARKET INFO TOKYO, April 8 - Japan and Britain have signed a memorandum\non exchange of securities market information to protect\ninvestors and promote t...",0,


In [5]:
#df.columns
df_unlabeled = df[(df[u'Category'].isnull())]
df_unlabeled.count()


ED_ENC_NUM    20791
NOTE_TEXT     20791
Score         20791
Category          0
dtype: int64

In [6]:
df_labeled = df[(df[u'Category'] == 1) | (df[u'Category'] == 2)]
df_labeled.count()


ED_ENC_NUM    80
NOTE_TEXT     80
Score         80
Category      80
dtype: int64

In [7]:
def ConvertCategoryColToBinVal(df, true_val):
    df_labeled['Category'] = df_labeled['Category'].apply(lambda x: x == true_val)
    
ConvertCategoryColToBinVal(df_labeled, 1)
#df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [8]:
plt.axis('equal')
plt.pie(
    df_labeled.Category.value_counts().tolist(), 
    labels=['False', 'True'], 
    autopct='%1.1f%%', 
    colors=("#E13F29", "#D69A80"));

In [9]:
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\user-
[nltk_data]     old\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [item for item in tokens if item.isalpha()]
    stems = stem_tokens(tokens, stemmer)
    return stems

### Train/Test Split

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_labeled['NOTE_TEXT'], df_labeled['Category'], test_size=0.2)
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape

(64L,) (64L,)
(16L,) (16L,)


### CountVectorizer

In [56]:
vectorizer = CountVectorizer(tokenizer=tokenize, ngram_range=(1,2))

wcounts = vectorizer.fit_transform(X_train)

In [57]:
wcounts

<64x12760 sparse matrix of type '<type 'numpy.int64'>'
	with 20355 stored elements in Compressed Sparse Row format>

In [58]:
feats = vectorizer.get_feature_names()

feats[:200]

[u'a',
 u'a billion',
 u'a bioengin',
 u'a biotech',
 u'a biotechnolog',
 u'a bizarr',
 u'a boston',
 u'a broadbas',
 u'a build',
 u'a bullish',
 u'a bundesbank',
 u'a call',
 u'a candid',
 u'a caretak',
 u'a chanc',
 u'a clear',
 u'a clinic',
 u'a clone',
 u'a close',
 u'a combin',
 u'a comment',
 u'a common',
 u'a compani',
 u'a competit',
 u'a complic',
 u'a compromis',
 u'a concern',
 u'a condit',
 u'a consensu',
 u'a consolid',
 u'a control',
 u'a correspond',
 u'a crosscurr',
 u'a cure',
 u'a cut',
 u'a day',
 u'a decis',
 u'a definit',
 u'a develop',
 u'a differ',
 u'a difficult',
 u'a direct',
 u'a discoveri',
 u'a distribut',
 u'a diuret',
 u'a dosag',
 u'a dozen',
 u'a draft',
 u'a drop',
 u'a drought',
 u'a drug',
 u'a drugfre',
 u'a faithful',
 u'a fatal',
 u'a fee',
 u'a few',
 u'a file',
 u'a final',
 u'a firm',
 u'a fiscal',
 u'a five',
 u'a fix',
 u'a flurri',
 u'a foreign',
 u'a formal',
 u'a former',
 u'a frequent',
 u'a friendli',
 u'a full',
 u'a further',
 u'a gene

### LogisticRegression using CountVectorizer

In [59]:
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(wcounts, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [60]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(vectorizer.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

('AUC: ', 1.0)


In [62]:
# get the feature names as numpy array
feature_names = np.array(vectorizer.get_feature_names())
print(len(feature_names))
# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()
print(len(sorted_coef_index))

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

12760
12760
Smallest Coefs:
[u'blah' u'blah blah' u'billion' u'vs' u'at' u'oct' u'reuter' u'pct' u'in'
 u'eight']

Largest Coefs: 
[u'drug' u'market' u'said' u'the drug' u'a' u'aid' u'germani' u'test'
 u'german' u'that']


### LogisticRegression using Tfidf

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn vocabulary and idf from training set.
# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
# When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. 
vectorizer = TfidfVectorizer(min_df=5).fit(X_train)
feature_names = np.array(vectorizer.get_feature_names())
print(len(feature_names))
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

382
Smallest tfidf:
[u'limited' u'set' u'yet' u'little' u'probably' u'until' u'less' u'already'
 u'half' u'where']

Largest tfidf: 
[u'blah' u'loss' u'billion' u'the' u'oil' u'25' u'week' u'south' u'rate'
 u'vaccine']


In [89]:
# Transform documents to document-term matrix. Returns a Tf-idf-weighted document-term matrix
X_train_vectorized = vectorizer.transform(X_train)
#print(X_train_vectorized)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vectorizer.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

382
('AUC: ', 0.88888888888888884)


In [91]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
[u'blah' u'billion' u'government' u'oct' u'19' u'oil' u'at' u'minister'
 u'year' u'statement']

Largest Coefs: 
[u'drug' u'the' u'aids' u'fda' u'said' u'drugs' u'vaccine' u'of' u'that'
 u'to']


### LogisticRegression using CountVectorizer with n-grams

In [93]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

492

In [94]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

('AUC: ', 0.94444444444444442)


In [95]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
[u'blah' u'blah blah' u'reuter' u'billion' u'19' u'will' u'april' u'oct'
 u'oct 19' u'at']

Largest Coefs: 
[u'drug' u'german' u'products' u'germany' u'said' u'aids' u'had' u'to the'
 u'vaccine' u'said the']


### NaiveBayes using CountVectorizer with n-grams

In [99]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vectorizer = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vectorizer.transform(X_train)
feats = vectorizer.get_feature_names()
len(feats)

492

In [100]:
from sklearn.naive_bayes import MultinomialNB

clf_nb = MultinomialNB(alpha=0.1)

clf_nb.fit(X_train_vectorized, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [98]:
pf = [(clf_nb.feature_log_prob_[1, i], feats[i]) for i in range(len(feats))]
pf.sort(reverse=True)
for p in pf[:25]:
    print 'Positive word %.2f: %s' % (p[0], p[1])

Positive word -2.43: the
Positive word -3.29: of
Positive word -3.35: to
Positive word -3.47: in
Positive word -3.53: said
Positive word -3.70: and
Positive word -3.79: drug
Positive word -4.13: for
Positive word -4.21: that
Positive word -4.43: it
Positive word -4.57: is
Positive word -4.64: of the
Positive word -4.70: the drug
Positive word -4.70: aids
Positive word -4.71: in the
Positive word -4.74: be
Positive word -4.75: would
Positive word -4.80: with
Positive word -4.81: on
Positive word -4.86: as
Positive word -4.93: was
Positive word -4.93: said the
Positive word -4.96: by
Positive word -5.00: new
Positive word -5.00: fda


In [102]:
# Convert the text to arrays of numbers
X_text_vectorized = vectorizer.transform(X_test)

In [103]:
X_text_vectorized

<16x531 sparse matrix of type '<type 'numpy.int64'>'
	with 1147 stored elements in Compressed Sparse Row format>

In [104]:
# Predict the values of the test set
predictions = clf_nb.predict(X_text_vectorized)


In [105]:
# Look at the first 20 predictions
print(predictions.tolist())
print(y_test.tolist())

[True, True, False, False, True, False, True, False, True, True, False, False, False, True, False, False]
[True, True, False, False, True, False, True, False, True, True, False, False, False, False, False, False]


In [106]:
def render_confusion_matrix(ytrue, ypred):
    return pd.crosstab(pd.Series(ytrue), pd.Series(ypred), rownames=['Actual'], colnames=['Predicted'], margins=True)

In [107]:
#y_test.tolist()
render_confusion_matrix(y_test.tolist(), predictions.tolist())

Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,9,1,10
True,0,6,6
All,9,7,16


In [108]:
from sklearn.metrics import classification_report, accuracy_score
print
print 'Accuracy: ', accuracy_score(y_test, predictions)
print
print classification_report(y_test, predictions)


Accuracy:  0.9375

             precision    recall  f1-score   support

      False       1.00      0.90      0.95        10
       True       0.86      1.00      0.92         6

avg / total       0.95      0.94      0.94        16



In [109]:
iwrong_predictions = [i for i,v in enumerate(zip(y_test, predictions)) if v[0] != v[1]]

iwrong_predictions

[13]

In [111]:
proba = clf_nb.predict_proba(X_text_vectorized)
log_proba = clf_nb.predict_log_proba(X_text_vectorized)

In [112]:
diff_prob = proba[:,1] - proba[:,0]
diff_log_proba = log_proba[:,1] - log_proba[:,0]

In [113]:
print 'diff_prob:\n'
print 'mean:',np.mean(diff_prob)
print 'std:', np.std(diff_prob)

print '\ndiff_log_prob:\n'
print 'mean:', np.mean(diff_log_proba)
print 'std:', np.std(diff_log_proba)

diff_prob:

mean: -0.114946661721
std: 0.981629220822

diff_log_prob:

mean: 50.5160502051
std: 101.729117462


In [115]:
# Plot  histogram.
plt.hist(diff_log_proba, range=[-500, 500], bins=30, normed=True, alpha=0.5)
plt.axvline(0, color='r')

<matplotlib.lines.Line2D at 0x1f040390>