## Financial and technology articles taken from [webhose.io](https://webhose.io/datasets)

In [5]:
import pandas as pd
import json
import glob
%matplotlib inline

In [8]:
ls data

[0m[01;36;40mfinancial_news[0m/  [01;36;40mtech_news[0m/


## Take a look at one JSON file

In [9]:
# Load one sample article to inspect its structure.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (the files are assumed to be UTF-8 JSON).
with open('data/financial_news/09/news_0000001.json','r',encoding='utf-8') as inFile:
    d=json.load(inFile)

In [11]:
print (d.keys())

dict_keys(['organizations', 'uuid', 'thread', 'author', 'url', 'ord_in_thread', 'title', 'locations', 'entities', 'highlightText', 'language', 'persons', 'text', 'external_links', 'published', 'crawled', 'highlightTitle'])


In [12]:
print( d['text'])

BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets 
mostly fell in light volumes on Tuesday as energy shares tracked 
falls in global oil prices, while weaknesses in banking shares 
amid concerns about loans to an ailing steel firm sent the Thai 
index to a one-week closing low. 
Bangkok's SET index shed nearly 1 percent after four 
sessions of gains. The index closed at 1,379.32, its lowest 
closing since Sept. 15. 
Shares of Krung Thai Bank Pcl, the most actively 
traded by turnover, dropped 2.8 percent to a near one-month low, 
reflecting potential impact of loans to Sahaviriya Steel 
Industries Pcl on the bank's earnings. 
Maybank Kim Eng Securities downgraded Krung Thai Bank to 
"hold" from "buy". 
"Even as exposure to SSI loans will be fully provisioned, 
KTB's NPL coverage will still be lower than 130 percent, the 
desired level we think and hence the need for more provisioning 
in the following quarters," the broker said in a report. 
SSI shares plunged 20 percent and S

## Define a function to open a file and get the text

In [13]:
def getText(f):
    """Return the value stored under the ``'text'`` key of a JSON file.

    Parameters
    ----------
    f : str
        Path to a JSON file whose top-level object contains a ``'text'`` key.

    Returns
    -------
    The value of the ``'text'`` entry (the article body for the news files
    used in this notebook).

    Raises
    ------
    KeyError
        If the decoded JSON object has no ``'text'`` key.
    """
    # Decode as UTF-8 explicitly: without it, open() falls back to the
    # platform's locale encoding and non-ASCII articles can fail to decode
    # (or decode incorrectly) on machines whose default is not UTF-8.
    with open(f,'r',encoding='utf-8') as inFile:
        return json.load(inFile)['text']

In [14]:
ls data/tech_news/

[0m[01;36;40m09[0m/  [01;36;40m10[0m/


In [15]:
%time financeTexts = list(map(getText,glob.glob('data/financial_news/*/news_*json')))

CPU times: user 5.92 s, sys: 16.6 s, total: 22.5 s
Wall time: 42.6 s


In [16]:
len(financeTexts)

47851

In [17]:
%time techTexts= list(map(getText,glob.glob('data/tech_news/*/news_*json')))

CPU times: user 5.22 s, sys: 14.5 s, total: 19.8 s
Wall time: 37.4 s


In [18]:
len(techTexts)

41476

## Combine tech and financial news into one dataframe

In [19]:
df=pd.DataFrame(data={'text':financeTexts,'category':'finance'})

In [20]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# combined frame is built with pd.concat instead.
# Both halves are constructed from the source lists here, which keeps the cell
# idempotent: re-running it never stacks the tech rows a second time.
# ignore_index=True gives the result a unique 0..n-1 index instead of repeating
# each half's own 0-based labels.
df=pd.concat([pd.DataFrame(data={'text':financeTexts,'category':'finance'}),
              pd.DataFrame(data={'text':techTexts,'category':'tech'})],
             ignore_index=True)

In [22]:
df.head()

Unnamed: 0,text,category
0,"Union Bank of India: Quotes , News BSE 159.75 ...",finance
1,"HOUSTON, July 29 Anadarko Petroleum Corp has n...",finance
2,"WASHINGTON, July 29 Greece's international cre...",finance
3,"BRASILIA, July 28 Brazil's government said on ...",finance
4,Digital Media Twitter is 'too difficult to use...,finance


In [23]:
df = df[['category','text']]  # switch

In [24]:
df.head()

Unnamed: 0,category,text
0,finance,"Union Bank of India: Quotes , News BSE 159.75 ..."
1,finance,"HOUSTON, July 29 Anadarko Petroleum Corp has n..."
2,finance,"WASHINGTON, July 29 Greece's international cre..."
3,finance,"BRASILIA, July 28 Brazil's government said on ..."
4,finance,Digital Media Twitter is 'too difficult to use...


In [25]:
df.shape

(89327, 2)

## Build up a pipeline

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn import preprocessing

  from numpy.core.umath_tests import inner1d


## Binarise the category labels

In [27]:
lb = preprocessing.LabelBinarizer()

In [28]:
# Fit the binarizer on the category labels and store the encoded labels
# in a new column, in a single call.
df['category_bin'] = lb.fit_transform(df['category'])

## Test Naive Bayes Classifier for our baseline

### pipeline steps
1. Vectorize: pass raw data into our `CountVectorizer()`
    - takes data, removes punctuation, splits into words & counts those words
2. Transform: applies TFIDF
    - turns those raw counts into TFIDF scaled counts
3. Classification: scaled feature vectors are passed to a Naive Bayes classifier

In [29]:
# The pipeline has three named stages:
# token counts -> TF-IDF weighting -> Naive Bayes classifier.
steps = [
    ('vectorise', CountVectorizer()),
    ('transform', TfidfTransformer()),
    ('clf', MultinomialNB()),
]

In [30]:
pipe=Pipeline(steps)

In [31]:
# Hold out 25% of the articles for evaluation.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category_bin'], test_size=0.25, random_state=42)

In [32]:
%%time 
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vectorise', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [33]:
pred=pipe.predict(X_test)

In [34]:
# The metric computed here is f1_score, not accuracy, so label it accordingly.
print('F1 score = %.3f' % f1_score(y_test, pred))

Accuracy = 0.819


## Write out model

In [36]:
import pickle
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling fails; the bare open(...) used before
# left the handle open until garbage collection.
with open('data/models/model.out','wb') as outFile:
    pickle.dump(pipe, outFile)

In [None]:
# import pickle
# with open('data/models/model.out','w') as outFile:
#     pickle.dump(pipe,outFile)

In [37]:
ls data/models

[0m[01;32mmodel.out[0m*


## Video 4.3

## Grid Search
- considers all of the options available in the parameter grid, evaluating every combination with cross-validation

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [39]:
param_grid = dict(vectorise__min_df=[1,5,10])
# Grid over the vectoriser's min_df: ignore terms that occur in fewer than
# 1, 5 or 10 documents respectively.

In [40]:
df.category.value_counts()

finance    47851
tech       41476
Name: category, dtype: int64

In [None]:
#pipe.named_steps.keys()

In [None]:
# param_grid = dict(vectorise__stop_words=[None,'english'],\
#                   vectorise__binary=[True,False],\
#                   #vectorise__min_df=[1,5,10],\
#                   #clf__class_weight=[None,'balanced'],\
#                   #transform__norm=['l1','l2']
#                  )

In [42]:
# Search over param_grid for the pipeline, scoring each candidate by F1
# and using n_jobs=2.
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    n_jobs=2,
)
# Timing notes from the original run: n_jobs=1 took 10.33 (unit not recorded);
# the n_jobs=-1 timing was never filled in.

In [43]:
%%time 
res=grid_search.fit(df['text'],df['category_bin'])

Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/home/salas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/salas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/salas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/salas/miniconda3/envs/nlp/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
KeyboardInterrupt
Process ForkPoolWorker-2:
Traceback (most recent call last):
  File "/home/salas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/salas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/salas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/pool

KeyboardInterrupt: 

In [None]:
res.best_params_

In [None]:
res

In [None]:
print ('Best score = %.3f' % res.best_score_)

## Compare classifiers

In [None]:
CountVectorizer()

In [None]:
%%time
# Benchmark three alternative classifiers behind the same
# vectorise -> TF-IDF front end, tuning each one over the shared param_grid
# and printing its best cross-validated F1 score and parameters.
#
# random_state is pinned on the estimators because SGDClassifier and
# RandomForestClassifier are stochastic; without a seed the printed scores
# differ from run to run.
#
# NOTE: the loop rebinds `steps`, `pipe`, `grid_search` and `res`, so the
# Naive Bayes pipeline fitted earlier is no longer reachable as `pipe`
# once this cell has run.
for clf in [SGDClassifier(random_state=42),
            LogisticRegression(random_state=42),
            RandomForestClassifier(random_state=42)]:
    print(clf.__class__)
    steps = [('vectorise', CountVectorizer()),
             ('transform', TfidfTransformer()),
             ('clf', clf)]
    pipe = Pipeline(steps)
    # Tell the vectoriser to ignore decoding errors rather than raise.
    pipe.set_params(vectorise__decode_error='ignore')

    grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1,
                               scoring=make_scorer(f1_score))

    res = grid_search.fit(df['text'], df['category_bin'])

    print('Best score = %.3f' % res.best_score_)
    print(res.best_params_)
    print('')