## Financial and technology articles taken from [webhose.io](https://webhose.io/datasets)

In [1]:
import pandas as pd
import json,urllib2
import glob
%matplotlib inline

## Take a look at one JSON file

In [2]:
# Parse one sample article directly from the open file handle.
with open('data/news_0000001.json', 'r') as sample_file:
    d = json.load(sample_file)

In [3]:
# Show which fields each article record carries.
print(d.keys())

[u'organizations', u'language', u'uuid', u'thread', u'author', u'url', u'ord_in_thread', u'title', u'locations', u'highlightText', u'entities', u'persons', u'external_links', u'text', u'crawled', u'highlightTitle', u'published']


In [4]:
# The article body lives under the 'text' key.
print(d['text'])

Published By: Finger Lake Times - Today 
NEW YORK (AP) — Jail watchdogs have expressed skepticism over a plan to mandate all New York City inmates wear uniforms....


## Define a function to open a file and get the text

In [5]:
def getText(f):
    """Return the value stored under ``'text'`` in the JSON file at path ``f``.

    Raises ``KeyError`` if the parsed document has no ``'text'`` entry.
    """
    with open(f, 'r') as handle:
        document = json.load(handle)
    return document['text']

In [9]:
%time financeTexts=map(getText,glob.glob('../financial_news/data/news_*json'))

CPU times: user 7.57 s, sys: 2.75 s, total: 10.3 s
Wall time: 21.6 s


In [10]:
len(financeTexts)

87157

In [11]:
%time techTexts=map(getText,glob.glob('../tech_news/data/news*json'))

CPU times: user 2.49 s, sys: 841 ms, total: 3.33 s
Wall time: 6.6 s


In [12]:
len(techTexts)

22292

## Combine tech and financial news into one dataframe

In [13]:
# Start the frame with the finance articles; the scalar label is broadcast to every row.
df = pd.DataFrame({'text': financeTexts, 'category': 'finance'})

In [14]:
# Build the combined frame from both sources in one go, so re-running this cell
# never appends the tech rows a second time. ignore_index=True gives every row a
# unique 0..N-1 label instead of restarting at 0 for the tech articles.
df=pd.concat([pd.DataFrame(data={'text':financeTexts,'category':'finance'}),
              pd.DataFrame(data={'text':techTexts,'category':'tech'})],
             ignore_index=True)

In [15]:
df.head()

Unnamed: 0,category,text
0,finance,Published By: Montgomery Advertiser: Sports - ...
1,finance,( Source : Ministry of Foreign Affairs of the ...
2,finance,( Source : Engineers Australia ) Sponsored edi...
3,finance,Published By: CBS News - Today \nWill Grier sa...
4,finance,"Updated: 3:56 a.m. Friday, Oct. 2, 2015 | Post..."


In [16]:
df.shape

(109449, 2)

In [17]:
df.head()

Unnamed: 0,category,text
0,finance,Published By: Montgomery Advertiser: Sports - ...
1,finance,( Source : Ministry of Foreign Affairs of the ...
2,finance,( Source : Engineers Australia ) Sponsored edi...
3,finance,Published By: CBS News - Today \nWill Grier sa...
4,finance,"Updated: 3:56 a.m. Friday, Oct. 2, 2015 | Post..."


## Build up a pipeline

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn import preprocessing

## Binarise the category labels

In [19]:
lb = preprocessing.LabelBinarizer()

In [20]:
# Learn the category labels and encode them in a single call.
df['category_bin']=lb.fit_transform(df['category'])

## Test Naive Bayes Classifier for our baseline

In [33]:
# Three-stage baseline pipeline: token counts, TF-IDF re-weighting,
# then a multinomial Naive Bayes classifier.
steps = [
    ('vectorise', CountVectorizer()),
    ('transform', TfidfTransformer()),
    ('clf', MultinomialNB()),
]

In [34]:
pipe=Pipeline(steps)

In [35]:
# Hold out 25% of the articles for evaluation. The fixed seed makes the split
# repeatable, and stratifying on the label keeps the finance/tech ratio the
# same in the training and test parts (the two classes are not equally sized).
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category_bin'],
    test_size=0.25, random_state=42, stratify=df['category_bin'])

In [36]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vectorise', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
     ...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [37]:
pred=pipe.predict(X_test)

In [38]:
# f1_score computes the F1 measure, not accuracy, so label the number accordingly.
print('F1 score = %.3f' % f1_score(y_test,pred))

Accuracy = 0.677


## Write out model

In [60]:
import pickle
# Pickle output is a byte stream, so the file has to be opened in binary mode;
# text mode can corrupt the data on some platforms and fails on Python 3.
with open('model.out','wb') as outFile:
    pickle.dump(pipe,outFile)

In [71]:
lb

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

## Video 4.3

## Grid Search

In [73]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [74]:
pipe.named_steps.keys()

['clf', 'transform', 'vectorise']

In [75]:
# Hyper-parameters to sweep, keyed as <pipeline step name>__<parameter name>.
param_grid = {
    'vectorise__stop_words': [None, 'english'],
    'vectorise__binary': [True, False],
    # Further candidates, currently disabled:
    # 'vectorise__min_df': [1, 5, 10],
    # 'clf__class_weight': [None, 'balanced'],
    # 'transform__norm': ['l1', 'l2'],
}

In [76]:
# Search every combination in param_grid, scoring candidates with F1 and
# running two jobs in parallel.
# Timing notes from the original run: n_jobs=1 took 10.33 (unit not recorded);
# no timing was recorded for n_jobs=-1.
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    n_jobs=2
)

In [77]:
%time res=grid_search.fit(df['text'],df['category_bin'])

CPU times: user 33.3 s, sys: 5.15 s, total: 38.5 s
Wall time: 5min 25s


In [78]:
res.best_params_

{'vectorise__binary': True, 'vectorise__stop_words': None}

In [79]:
# Best mean cross-validated F1 found by the grid search.
print('Best score = %.3f' % (res.best_score_,))

Best score = 0.857


## Compare classifiers

In [None]:
CountVectorizer()

In [72]:
# Repeat the grid search with three alternative classifiers in the final
# pipeline step. Each estimator gets a fixed random_state so the reported
# scores are repeatable between runs.
# NOTE: the loop rebinds steps, pipe, grid_search and res; once it finishes they
# refer to the last classifier tried, not to the Naive Bayes baseline above.
for clf in [SGDClassifier(random_state=42),
            LogisticRegression(random_state=42),
            RandomForestClassifier(random_state=42)]:
    print(clf.__class__)
    steps=[('vectorise',CountVectorizer(decode_error='ignore')),
           ('transform',TfidfTransformer()),
           ('clf',clf)]
    pipe=Pipeline(steps)

    grid_search = GridSearchCV(pipe, param_grid=param_grid,n_jobs=-1,
                               scoring=make_scorer(f1_score))

    res=grid_search.fit(df['text'],df['category_bin'])

    print('Best score = %.3f' % res.best_score_)
    print(res.best_params_)
    print('')

<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>




KeyboardInterrupt: 