## Financial and technology articles taken from [webhose.io](https://webhose.io/datasets)

In [6]:
import pandas as pd
import json
import glob
%matplotlib inline

In [7]:
ls 

[31m4.2_news_section_classification.ipynb[m[m* [31mapp.py[m[m*
[31m4.2_news_section_classification.py[m[m*    [34mdata[m[m/


## Take a look at one JSON file

In [8]:
# Load a single article so its structure can be inspected in the next cells.
# The encoding is passed explicitly: without it open() falls back to the
# platform default, which can fail or mis-decode non-ASCII text.
# Assumes the dataset files are UTF-8 (the standard JSON encoding).
with open('data/financial_news/2015-09/news_0000001.json', 'r', encoding='utf-8') as inFile:
    d = json.load(inFile)

In [10]:
# List the top-level fields of the article record loaded above.
print (d.keys())

dict_keys(['organizations', 'uuid', 'thread', 'author', 'url', 'ord_in_thread', 'title', 'locations', 'entities', 'highlightText', 'language', 'persons', 'text', 'external_links', 'published', 'crawled', 'highlightTitle'])


In [11]:
# Show the value of the 'text' field of the article record.
print( d['text'])

BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets 
mostly fell in light volumes on Tuesday as energy shares tracked 
falls in global oil prices, while weaknesses in banking shares 
amid concerns about loans to an ailing steel firm sent the Thai 
index to a one-week closing low. 
Bangkok's SET index shed nearly 1 percent after four 
sessions of gains. The index closed at 1,379.32, its lowest 
closing since Sept. 15. 
Shares of Krung Thai Bank Pcl, the most actively 
traded by turnover, dropped 2.8 percent to a near one-month low, 
reflecting potential impact of loans to Sahaviriya Steel 
Industries Pcl on the bank's earnings. 
Maybank Kim Eng Securities downgraded Krung Thai Bank to 
"hold" from "buy". 
"Even as exposure to SSI loans will be fully provisioned, 
KTB's NPL coverage will still be lower than 130 percent, the 
desired level we think and hence the need for more provisioning 
in the following quarters," the broker said in a report. 
SSI shares plunged 20 percent and S

## Define a function to open a file and get the text

In [13]:
def getText(f):
    """Return the ``'text'`` field of the JSON document stored at path ``f``.

    Parameters
    ----------
    f : str
        Path to a JSON file whose top-level object contains a ``'text'`` key.

    Returns
    -------
    The value stored under the ``'text'`` key.

    Raises
    ------
    KeyError
        If the document has no ``'text'`` key.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding (JSON files are UTF-8 by convention).
    with open(f, 'r', encoding='utf-8') as inFile:
        return json.load(inFile)['text']

In [32]:
ls data/tech_news/

[34m2015-09[m[m/ [34m2015-10[m[m/


In [29]:
%time financeTexts = list(map(getText,glob.glob('data/financial_news/*/news_*json')))

CPU times: user 13.1 s, sys: 9.82 s, total: 22.9 s
Wall time: 36.6 s


In [30]:
# Number of finance article texts that were loaded.
len(financeTexts)

47851

In [33]:
%time techTexts= list(map(getText,glob.glob('data/tech_news/*/news_*json')))

CPU times: user 11.4 s, sys: 10.6 s, total: 22 s
Wall time: 33.8 s


In [34]:
# Number of tech article texts that were loaded.
len(techTexts)

41476

## Combine tech and financial news into one dataframe

In [35]:
# One row per finance article; the scalar 'finance' label is broadcast to every row.
df = pd.DataFrame({'text': financeTexts, 'category': 'finance'})

In [36]:
# Combine finance and tech articles into one frame.
# - DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat
#   is the supported replacement.
# - Both halves are built from the source lists here, so re-running this cell
#   does not append the tech rows a second time.
# - ignore_index=True gives the combined frame a unique 0..n-1 index instead
#   of repeating the labels of the two inputs.
df = pd.concat(
    [
        pd.DataFrame({'text': financeTexts, 'category': 'finance'}),
        pd.DataFrame({'text': techTexts, 'category': 'tech'}),
    ],
    ignore_index=True,
)

In [37]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,text,category
0,BEIJING Oct 23 Average new home prices in Chin...,finance
1,0 COMMENTS HONG KONG—A scandal involving a gov...,finance
2,"LIMA Oct 9 Ukraine's finance minister, Natalia...",finance
3,Lotte founding family clash escalates as eld...,finance
4,HARP For Homeowners: The Expiring Program That...,finance


In [39]:
# Reorder the columns so the label comes before the article text.
df = df.loc[:, ['category', 'text']]

In [40]:
# Preview the frame again to confirm the new column order.
df.head()

Unnamed: 0,category,text
0,finance,BEIJING Oct 23 Average new home prices in Chin...
1,finance,0 COMMENTS HONG KONG—A scandal involving a gov...
2,finance,"LIMA Oct 9 Ukraine's finance minister, Natalia..."
3,finance,Lotte founding family clash escalates as eld...
4,finance,HARP For Homeowners: The Expiring Program That...


In [41]:
# (rows, columns) of the frame.
df.shape

(89327, 2)

## Build up a pipeline

In [43]:
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn import preprocessing

## Binarise the category labels

In [44]:
# Encoder used below to turn the string category labels into a numeric column.
lb = preprocessing.LabelBinarizer()

In [45]:
# Learn the label set and encode it in a single call; the encoded labels are
# stored next to the original strings in a new 'category_bin' column.
df['category_bin'] = lb.fit_transform(df['category'])

## Test Naive Bayes Classifier for our baseline

### pipeline steps
1. Vectorize: pass raw data into our `CountVectorizer()`
    - takes data, removes punctuation, splits into words & counts those words
2. Transform: applies TFIDF
    - turns those raw counts into TFIDF scaled counts
3. Classification: scaled feature vectors are passed to a Naive Bayes classifier

In [46]:
# The three pipeline stages, in execution order.
steps = [
    ('vectorise', CountVectorizer()),   # raw text -> token counts
    ('transform', TfidfTransformer()),  # token counts -> TF-IDF weights
    ('clf', MultinomialNB()),           # Naive Bayes classifier
]

In [47]:
# Chain the steps into a single estimator.
pipe=Pipeline(steps)

In [48]:
# Hold out 25% of the articles for evaluation. random_state is fixed so the
# split, and therefore the score reported below, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category_bin'], test_size=0.25, random_state=42
)

In [49]:
# Fit every pipeline stage on the training split.
pipe.fit(X_train,y_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Pipeline(memory=None,
     steps=[('vectorise', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [50]:
# Predict labels for the held-out test split.
pred=pipe.predict(X_test)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [51]:
# The value printed is the F1 score of the predictions, not accuracy, so the
# label says so.
print('F1 score = %.3f' % f1_score(y_test, pred))

Accuracy = 0.820


## Write out model

In [53]:
import os
import pickle

# Persist the fitted pipeline to disk.
# The output directory is created if it does not exist yet, and the file is
# opened in a `with` block so the handle is flushed and closed even if
# pickling raises.
os.makedirs('data/models', exist_ok=True)
with open('data/models/model.out', 'wb') as outFile:
    pickle.dump(pipe, outFile)

In [54]:
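# Alternative form using a context manager (pickle writes bytes, so the file
# must be opened in binary mode 'wb', not 'w'):
# import pickle
# with open('data/models/model.out','wb') as outFile:
#     pickle.dump(pipe,outFile)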

In [56]:
ls data/models

model.out


## Video 4.3

## Grid Search
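- considers all of the options available: every combination of the parameter values listed in `param_grid` is evaluated with cross-validation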

In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [58]:
# Candidate values for the vectoriser's min_df: ignore terms that occur in
# fewer than 1, 5 or 10 documents.
param_grid = {'vectorise__min_df': [1, 5, 10]}

In [63]:
# Check how many articles fall in each category (class balance).
df.category.value_counts()

finance    47851
tech       41476
Name: category, dtype: int64

In [59]:
#pipe.named_steps.keys()

In [None]:
# param_grid = dict(vectorise__stop_words=[None,'english'],\
#                   vectorise__binary=[True,False],\
#                   #vectorise__min_df=[1,5,10],\
#                   #clf__class_weight=[None,'balanced'],\
#                   #transform__norm=['l1','l2']
#                  )

In [64]:
# Grid search over param_grid for the pipeline, scored by F1, running two jobs
# in parallel.
# Timing notes from earlier runs: n_jobs=1 took 10.33 (unit not recorded);
# n_jobs=-1 was never measured.
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    n_jobs=2,
)

In [65]:
# Run the grid search on the full dataset; %time reports how long it takes.
%time res=grid_search.fit(df['text'],df['category_bin'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


CPU times: user 56.9 s, sys: 25.1 s, total: 1min 21s
Wall time: 11min 37s


In [66]:
# Parameter setting that achieved the best score in the search.
res.best_params_

{'vectorise__min_df': 5}

In [70]:
# Display the configuration of the fitted grid-search object.
res

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorise', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'vectorise__min_df': [1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [67]:
# Score of the best candidate; the search was configured with
# make_scorer(f1_score), so this is an F1 value.
print ('Best score = %.3f' % res.best_score_)

Best score = 0.829


## Compare classifiers

In [68]:
# Display a default-constructed CountVectorizer to see its parameter values.
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [69]:
%%time
for clf in [SGDClassifier(),LogisticRegression(),RandomForestClassifier()]:
    print( clf.__class__)
    steps=[('vectorise',CountVectorizer()),('transform',TfidfTransformer()),\
           ('clf',clf)]
    pipe=Pipeline(steps)
    pipe.set_params(vectorise__decode_error='ignore')
    
    grid_search = GridSearchCV(pipe, param_grid=param_grid,n_jobs=-1,\
                           scoring=make_scorer(f1_score))

    res=grid_search.fit(df['text'],df['category_bin'])
    
    print ( 'Best score = %.3f' % res.best_score_)
    print ( res.best_params_)
    print('')



<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>


KeyboardInterrupt: 

Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/Users/franciscosalas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/franciscosalas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/franciscosalas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 133, in worker
    completed += 1
KeyboardInterrupt
Process ForkPoolWorker-6:
Traceback (most recent call last):
  File "/Users/franciscosalas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/franciscosalas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/franciscosalas/miniconda3/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 133, in worker
    completed += 1
KeyboardInterrupt
