Skip to content

Commit

Permalink
Merge cfd0071 into 5b85d99
Browse files Browse the repository at this point in the history
  • Loading branch information
abhijithneilabraham committed Apr 2, 2019
2 parents 5b85d99 + cfd0071 commit fe5c564
Show file tree
Hide file tree
Showing 4 changed files with 229 additions and 0 deletions.
14 changes: 14 additions & 0 deletions examples/09-malayalam/malayalam news classification/Readme.md
@@ -0,0 +1,14 @@
Malayalam News Classification
=============================

This example shows how to run a search query for online news in the Malayalam language and classify the search results into the following categories:
Business, Entertainment, Sports, Kerala, India.
<br>Google is used as the search engine. Open the code sample <b>pattern_news.py</b> and enter a search keyword in Malayalam.</br>

<br>Before running the program, download the folder <b>News</b> from the following link and place it in the same folder as the sample code.</br>
<br>

[Download Link to the news file](https://drive.google.com/open?id=1HPtrsoL9cX70rZ31lWgmPCjdbZfN_zjG)</br>



@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 2 02:17:22 2019
@author: abhijithneilabraham
"""


from pattern.web import Bing, SEARCH, plaintext, Google
from ulmfit import ULMFiT

# Google API license key. The original script referenced `key` without ever
# defining it, which raised NameError on the first statement. Put your own
# key here.
# NOTE(review): pattern.web is documented to fall back to a shared default
# key when the license is None - confirm for the installed pattern version.
key = None

engine = Google(license=key)
search_key = 'സഞ്ജു സാംസൺ'

# Collect the plain text of every search hit.
# Only the *printed* form goes through repr(); the stored text must stay raw,
# because repr() adds quotes and escapes zero-width joiners (frequent in
# Malayalam) into literal "\u200d" sequences that the classifier's
# preprocessing can no longer strip.
searched = []
for result in engine.search(search_key, type=SEARCH, start=1):
    text = plaintext(result.text)
    print(repr(text))
    searched.append(text)
print(len(searched))

# Classify each hit and print its top-ranked category.
model = ULMFiT("news/")
for text in searched:
    prediction = model.predict(text)
    print(prediction['intent'])

172 changes: 172 additions & 0 deletions examples/09-malayalam/malayalam news classification/ulmfit.py
@@ -0,0 +1,172 @@
import numpy as np
from fastai.text import *
from fastai.lm_rnn import get_rnn_classifer
import html
from nltk import word_tokenize


class Tokenizer():
    """Word tokenizer with the interface of the fastai tokenizer.

    Tokenization is delegated to NLTK's ``word_tokenize``; the method names
    (``spacy_tok``, ``proc_text``, ``proc_all``, ``proc_all_mp``) are kept so
    this class can be used where the fastai one is expected.
    """

    def __init__(self, lang='en'):
        """Create a tokenizer.

        Args:
            lang: language code. It is stored for reference only; the
                tokenization itself does not use it.
        """
        self.lang = lang

    def spacy_tok(self, x):
        """Return the list of word tokens of the string ``x``."""
        return word_tokenize(x)

    def proc_text(self, s):
        """Tokenize a single text ``s`` and return its tokens."""
        return self.spacy_tok(s)

    @staticmethod
    def proc_all(ss, lang):
        """Tokenize every text in ``ss``; returns one token list per text."""
        tok = Tokenizer(lang)
        return [tok.proc_text(s) for s in ss]

    @staticmethod
    def proc_all_mp(ss, lang='en'):
        """Tokenize partitions of texts in parallel worker processes.

        Args:
            ss: sequence of partitions, each partition being a sequence of
                texts (every partition is handed to ``proc_all``).
            lang: language code forwarded to every worker.

        Returns:
            A flat list holding one token list per input text, in input order.
        """
        # Nothing to do: do not spawn a process pool for empty input.
        if len(ss) == 0:
            return []
        # num_cpus() // 2 is 0 on a single-CPU machine, and the executor
        # rejects a worker count of 0, so always use at least one worker.
        ncpus = max(1, num_cpus() // 2)
        with ProcessPoolExecutor(ncpus) as e:
            batches = e.map(Tokenizer.proc_all, ss, [lang] * len(ss))
            # Flatten in one pass (sum(..., []) re-copies the list each step).
            return [tokens for batch in batches for tokens in batch]


class ULMFiT:
    """Inference wrapper around a trained fastai RNN text classifier.

    The constructor loads three artifacts from the model directory:
    ``news_lm/tmp/itos.pkl`` (vocabulary), ``models/clas_2.h5`` (classifier
    weights) and ``news_clas/l2i.npy`` (label/index mapping).  ``predict``
    then classifies a single piece of text.
    """

    # Literal substitutions applied by ``fixup``, in this exact order. The
    # order matters because later entries act on the output of earlier ones
    # (e.g. '\\"' must be handled before the bare '"').
    _FIXUP_REPLACEMENTS = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '),
        ('#36;', '$'), ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"),
        ('\\"', '"'), ('<unk>', 'u_n'), (' @.@ ', '.'), (' @-@ ', '-'),
        ('\\', ' \\ '), ('\u200d', ''), ('\xa0', ' '), ('\u200c', ''),
        ('“', ' '), ('”', ' '), ('"', ' '), ('\u200b', ''),
    )

    def __init__(self, model: str):
        """Load the vocabulary, label mapping and classifier weights.

        Args:
            model: path of the directory that contains the trained model
                files listed in the class docstring.
        """
        model_path = Path(model)
        itos_filename = model_path/"news_lm"/"tmp"/'itos.pkl'
        trained_classifier_filename = model_path/'models'/'clas_2.h5'
        label2index = model_path/"news_clas"/"l2i.npy"
        # The .npy file stores a pickled dict, which recent NumPy releases
        # refuse to load unless allow_pickle=True is passed explicitly.
        # SECURITY: unpickling executes arbitrary code - only load model
        # files obtained from a trusted source.
        label_to_index = np.load(label2index, allow_pickle=True).item()
        # Despite its name, ``l2i`` maps index -> label (the stored mapping
        # is inverted here) so predictions can be turned back into names.
        self.l2i = {index: label for label, index in label_to_index.items()}
        # load_model() reads self.l2i, so it must run after the line above.
        self.stoi, self.model = self.load_model(itos_filename, trained_classifier_filename)
        self.re1 = re.compile(r' +')

    def load_model(self, itos_filename, classifier_filename):
        """Load the classifier and the int to string mapping.

        Requires ``self.l2i`` to be set (its size is the number of classes).

        Args:
            itos_filename (str): The filename of the int to string mapping
                file (usually called itos.pkl)
            classifier_filename (str): The filename of the trained classifier
        Returns:
            string to int mapping, trained classifier model
        """
        # Load the int to string mapping; the context manager makes sure the
        # file handle is closed.
        # SECURITY: pickle.load runs arbitrary code - trusted files only.
        with Path(itos_filename).open('rb') as itos_file:
            itos = pickle.load(itos_file)
        # Turn it into a string to int mapping; unknown tokens map to id 0.
        stoi = collections.defaultdict(lambda: 0, {str(v): int(k) for k, v in enumerate(itos)})

        # Hyper-parameters needed to rebuild the network before its weights
        # are loaded (original note: they are not otherwise used here).
        bptt, em_sz, nh, nl = 70, 400, 1150, 3
        dps = np.array([0.4, 0.5, 0.05, 0.3, 0.4])*0.5
        num_classes = len(self.l2i)  # number of classes we want to predict
        vs = len(itos)               # vocabulary size

        model = get_rnn_classifer(bptt, 20*70, num_classes, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
                                  layers=[em_sz*3, 50, num_classes], drops=[dps[4], 0.1],
                                  dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])

        # Load the trained weights; the map_location callback returns the
        # storage unchanged instead of moving it to another device.
        model.load_state_dict(torch.load(classifier_filename, map_location=lambda storage, loc: storage))

        # Put the classifier into evaluation mode.
        model.reset()
        model.eval()

        return stoi, model

    def softmax(self, x):
        """Row-wise softmax of a NumPy array.

        A 1-D input is treated as a single row, so the result is always 2-D
        and every row sums to 1.  The row maximum is subtracted before
        exponentiation so that large scores do not overflow.
        Example: softmax(np.array([0, 200, 10])) is approximately [[0, 1, 0]].
        """
        if x.ndim == 1:
            x = x.reshape((1, -1))
        max_x = np.max(x, axis=1).reshape((-1, 1))
        exp_x = np.exp(x - max_x)
        return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))

    def fixup(self, x):
        """Normalise raw text before tokenization.

        Steps, in order: literal clean-ups (broken HTML entities, zero-width
        characters, quotes), removal of bracketed spans and HTML tags,
        replacement of Latin-letter runs with ``ENG`` and of numbers with
        ``NUM``, HTML unescaping, and collapsing of repeated spaces.

        NOTE(review): presumably this mirrors the preprocessing used when the
        model was trained, so the patterns are kept exactly as they were -
        including the unescaped ``.`` in the number pattern, which also
        consumes the character that follows a number.
        """
        for old, new in self._FIXUP_REPLACEMENTS:
            x = x.replace(old, new)
        # Raw strings: same patterns as before, minus invalid-escape warnings.
        x = re.sub(r'[\(\[].*?[\)\]]', '', x)
        x = re.sub(r'<[^<]+?>', '', x)
        x = re.sub(r'[A-Za-z]+', 'ENG ', x)
        x = re.sub(r'\d+.?(\d+)?', 'NUM ', x).replace("(", "").replace(")", "")
        return self.re1.sub(' ', html.unescape(x))

    def predict_text(self, stoi, model, text):
        """Run the classifier on one text.

        Args:
            stoi: mapping from token string to integer id.
            model: the classifier returned by ``load_model``.
            text: raw input text.

        Returns:
            A tuple ``(probabilities, processed_text)``: the softmax of the
            model output for the input, and the text after ``fixup``.
        """
        # NOTE(review): no beginning-of-text / field prefix tokens are added
        # here, although the original comments mentioned them - confirm this
        # matches how the classifier was trained.
        input_str = self.fixup(text)

        # Tokenize, then turn every token into its integer id.
        tokens = Tokenizer().proc_text(input_str)
        encoded = [stoi[token] for token in tokens]

        # The model is fed a [x, 1] array, x being the number of tokens.
        # int64 is requested explicitly because NumPy's default integer type
        # is platform-dependent, while index tensors are normally int64.
        ary = np.array(encoded, dtype=np.int64).reshape(-1, 1)

        # Wrap the array for the model, run it, and convert back to NumPy.
        variable = Variable(torch.from_numpy(ary))
        predictions = model(variable)
        numpy_preds = predictions[0].data.numpy()

        return self.softmax(numpy_preds[0])[0], input_str

    def predict(self, text):
        """Classify ``text`` and rank all labels by confidence.

        Returns:
            dict with the keys ``"intent"`` (the best label as
            ``{"confidence", "name"}``), ``"intent_ranking"`` (the remaining
            labels, best first; the top label is *not* repeated there) and
            ``"processed_text"`` (the text after ``fixup``).
        """
        output, fixed_text = self.predict_text(self.stoi, self.model, text)
        # Confidences are rounded to six decimals via fixed-point formatting.
        intent_ranking = [
            {"confidence": float(format(out, 'f')), "name": self.l2i[i]}
            for i, out in enumerate(output)
        ]
        intent_ranking = sorted(intent_ranking, key=lambda e: e['confidence'], reverse=True)
        intent = {
            "intent": intent_ranking.pop(0),
            "intent_ranking": intent_ranking,
        }
        intent.update({"processed_text": fixed_text})
        return intent
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 1 00:54:22 2019
@author: abhijithneilabraham
"""

17 changes: 17 additions & 0 deletions pattern/web/__init__.py
Expand Up @@ -16,6 +16,7 @@
from builtins import str, bytes, dict, int, chr
from builtins import map, filter, zip
from builtins import object, range, next
from translate import Translator

from .utils import get_url_query, get_form_action, stringify_values, json_iter_parse

Expand Down Expand Up @@ -2146,6 +2147,22 @@ def f(v):
else:
self._pagination[k] = id
return results



def translated(self, lang, query):
    """ Returns the texts of the search results for the given query,
        translated to the language specified by lang.
        The search is run with start=1 and count=10, and the text of each
        result is translated separately: the translator works on strings,
        so the list of results cannot be passed to it as a whole.
        The return value is a list with one translated string per result.
        Usage: Twitter().translated("German", "cat")
    """
    translator = Translator(to_lang=lang)
    return [translator.translate(result.text)
            for result in self.search(query, start=1, count=10)]


def profile(self, query, start=1, count=10, **kwargs):
""" Returns a list of results for the given author id, alias or search query.
Expand Down

0 comments on commit fe5c564

Please sign in to comment.