In [1]:
import pandas as pd
import numpy as np 
import string
import random

import nltk
from nltk.corpus import brown
from nltk.corpus import reuters

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer

In [2]:
#load 10k reuters documents
len(reuters.fileids())

10788

In [3]:
#view text from one doc
reuters.raw(fileids=['test/14826'])[0:201]

"ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n  Mounting trade friction between the\n  U.S. And Japan has raised fears among many of Asia's exporting\n  nations that the row could inflict far-reaching"

In [4]:
exclude = set(string.punctuation)
alldocslist = []

for index, i in  enumerate(reuters.fileids()):
    text = reuters.raw(fileids=[i])
    text = ''.join(ch for ch in text if ch not in exclude)
    alldocslist.append(text)
    
print(alldocslist[1])

CHINA DAILY SAYS VERMIN EAT 712 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showed vermin consume between seven and 12 pct of Chinas grain
  stocks the China Daily said
      It also said that each year 1575 mln tonnes or 25 pct of
  Chinas fruit output are left to rot and 21 mln tonnes or up
  to 30 pct of its vegetables The paper blamed the waste on
  inadequate storage and bad preservation methods
      It said the government had launched a national programme to
  reduce waste calling for improved technology in storage and
  preservation and greater production of additives The paper
  gave no further details
  




In [5]:
#tokenize words
plot_data = [[]] * len(alldocslist)
for doc in alldocslist:
    text = doc
    tokentext = word_tokenize(text)
    plot_data[index].append(tokentext)

(plot_data[0][1])

['CHINA',
 'DAILY',
 'SAYS',
 'VERMIN',
 'EAT',
 '712',
 'PCT',
 'GRAIN',
 'STOCKS',
 'A',
 'survey',
 'of',
 '19',
 'provinces',
 'and',
 'seven',
 'cities',
 'showed',
 'vermin',
 'consume',
 'between',
 'seven',
 'and',
 '12',
 'pct',
 'of',
 'Chinas',
 'grain',
 'stocks',
 'the',
 'China',
 'Daily',
 'said',
 'It',
 'also',
 'said',
 'that',
 'each',
 'year',
 '1575',
 'mln',
 'tonnes',
 'or',
 '25',
 'pct',
 'of',
 'Chinas',
 'fruit',
 'output',
 'are',
 'left',
 'to',
 'rot',
 'and',
 '21',
 'mln',
 'tonnes',
 'or',
 'up',
 'to',
 '30',
 'pct',
 'of',
 'its',
 'vegetables',
 'The',
 'paper',
 'blamed',
 'the',
 'waste',
 'on',
 'inadequate',
 'storage',
 'and',
 'bad',
 'preservation',
 'methods',
 'It',
 'said',
 'the',
 'government',
 'had',
 'launched',
 'a',
 'national',
 'programme',
 'to',
 'reduce',
 'waste',
 'calling',
 'for',
 'improved',
 'technology',
 'in',
 'storage',
 'and',
 'preservation',
 'and',
 'greater',
 'production',
 'of',
 'additives',
 'The',
 'paper',


In [6]:
#example
plot_data[0][1][0:10]

['CHINA',
 'DAILY',
 'SAYS',
 'VERMIN',
 'EAT',
 '712',
 'PCT',
 'GRAIN',
 'STOCKS',
 'A']

In [7]:
#all words lowercase
for x in range (len(reuters.fileids())):
    lowers = [word.lower() for word in plot_data[0][x]]
    plot_data[0][x] = lowers

plot_data[0][1][0:10]

['china',
 'daily',
 'says',
 'vermin',
 'eat',
 '712',
 'pct',
 'grain',
 'stocks',
 'a']

In [8]:
stop_words = set(stopwords.words('english'))
for x in range(len(reuters.fileids())):
    filtered_sentence = [w for w in plot_data[0][x] if not w in stop_words]
    plot_data[0][x] = filtered_sentence

plot_data[0][1][0:10]

['china',
 'daily',
 'says',
 'vermin',
 'eat',
 '712',
 'pct',
 'grain',
 'stocks',
 'survey']

In [9]:
#stemming
my_stemmer = SnowballStemmer("english")
stemmed_sentence = [my_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]

porter_stemmer = PorterStemmer()
my_stemmer = SnowballStemmer("english")
stemmed_sentence = [porter_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]

['ltaha',
 'automot',
 'technolog',
 'corp',
 'year',
 'net',
 'shr',
 '43',
 'ct',
 'vs']

### inverted index

In [10]:
l = plot_data[0]
flatten = [item for sublist in l for item in sublist]
words = flatten
words_unique = set(words)
words_unique = list(words_unique)

In [11]:
import math
from textblob import TextBlob as tb

def tf(word, doc):
    return doc.count(word) / len(doc)

def n_containing(word, doclist):
    return sum(1 for doc in doclist if word in doc)

def idf(word, doclist):
    return math.log(len(doclist) / (0.01 + n_containing(word, doclist)))

def tfidf(word, doc, doclist):
    return (tf(word, doc) * idf(word, doclist))

In [12]:
#dictionary of words

import re
import numpy as np

plottest = plot_data[0][0:1000]

worddic = {}

for doc in plottest:
    for word in words_unique:
        if word in doc:
            word = str(word)
            index = plottest.index(doc)
            positions = list(np.where(np.array(plottest[index]) == word)[0])
            idfs = tfidf(word, doc, plottest)
            try:
                worddic[word].append([index, positions, idfs])
            except:
                worddic[word] = []
                worddic[word].append([index, positions, idfs])

In [13]:
# the index create a dic where each word is a KEY and a list
# of doc indexes, word positions, and td-idf score as VALUES

worddic['china']

[[1, [0, 23], 0.1131500878815288],
 [13, [0], 0.06694713532990454],
 [14, [160], 0.013213250394060107],
 [28, [51], 0.05821490028687352],
 [40, [3, 15, 59, 79], 0.14740653650621185],
 [236, [86], 0.04414096834938761],
 [281, [70], 0.0565750439407644],
 [293, [13, 21], 0.11642980057374704],
 [302, [33], 0.059952658504392124],
 [342, [55, 146], 0.05391715597039292],
 [567, [2], 0.06925565723783228],
 [569, [1014, 1072, 1221], 0.009248261212112677],
 [612, [20], 0.01998421950146404],
 [710, [0, 7, 34], 0.17464470086062053],
 [720, [0, 16], 0.23628400704672192],
 [721, [0, 6, 27, 78, 82], 0.2028701070603168],
 [733, [179], 0.021595850106420823],
 [736, [0, 5, 21, 83], 0.13732745708698368]]

In [14]:
# save the dic
np.save('worddic_1000.npy', worddic)

### the search engine

In [15]:
from collections import Counter

def search(searchsentence):
    try:
        searchsentence = searchsentence.lower()
        try:
            words = searchsentence.split(' ')
        except:
            words = list(words)
        enddic = {}
        idfdic = {}
        closedic = {}

        realwords = []
        for word in words:
            if word in list(worddic.keys()):
                realwords.append(word)
        words = realwords
        numwords = len(words)

        for word in words:
            for indpos in worddic[word]:
                index = indpos[0]
                amount = len(indpos[1])
                idfscore = indpos[2]
                enddic[index] = amount
                idfdic[index] = idfscore
                fullcount_order = sorted(enddic.items(), key = lambda x:x[1], reverse=True)
                fullidf_order = sorted(idfdic.items(), key = lambda x:x[1], reverse=True)

        combo = []
        alloptions = {k: worddic.get(k, None) for k in (words)}
        for worddex in list(alloptions.values()):
            for indexpos in worddex:
                for indexz in indexpos:
                    combo.append(indexz)
        comboindex = combo[::3]
        combocount = Counter(comboindex)
        for key in combocount:
            combocount[key] = combocount[key] / numwords
        combocount_order = sorted(combocount.items(), key = lambda x:x[1], reverse=True)

        if len(words) > 1:
            x = []
            y = []
            for record in [worddic[z] for z in words]:
                for index in record:
                    x.append(index[0])
            for i in x:
                if x.count(i) > 1:
                    y.append(i)
            y = list(set(y))

            closedic = {}
            for wordbig in [worddic[x] for x in words]:
                for record in wordbig:
                    if record[0] in y:
                        index = record[0]
                        positions = record[1]
                        try:
                            closedic[index].append(positions)
                        except:
                            closedic[index] = []
                            closedic[index].append(positions)
            
            x = 0
            fdic = {}
            for index in y:
                csum = []
                for seqlist in closedic[index]:
                    while x > 0:
                        secondlist = seqlist
                        x = 0
                        sol = [1 for i in firstlist if i + 1 in secondlist]
                        csum.append(sol)
                        fsum = [item for sublist in csum for item in sublist]
                        fsum = sum(fsum)
                        fdic[index] = fsum
                        fdic_order = sorted(fdic.items(), key = lambda x:x[1], reverse=True)
                    while x == 0:
                        firstlist = seqlist
                        x = x + 1
        else:
            fdic_order = 0
        

        return(searchsentence, words, fullcount_order, combocount_order, fullidf_order, fdic_order)
    
    except:
        return("")
    

#test search

search('indonesia crude palm oil')[1]


['indonesia', 'crude', 'palm', 'oil']

In [16]:
search('indonesia crude palm oil')[1][1:10]

['crude', 'palm', 'oil']

In [17]:
# save metrics to dataframe for use in ranking and machine learning 
result1 = search('china daily says what')
result2 = search('indonesia crude palm oil')
result3 = search('price of nickel')
result4 = search('north yemen sugar')
result5 = search('nippon steel')
result6 = search('China')
result7 = search('Gold')
result8 = search('trade')
df = pd.DataFrame([result1,result2,result3,result4,result5,result6,result7,result8])
df.columns = ['search term', 'actual_words_searched','num_occur','percentage_of_terms','td-idf','word_order']
df

Unnamed: 0,search term,actual_words_searched,num_occur,percentage_of_terms,td-idf,word_order
0,china daily says what,"[china, daily, says]","[(183, 5), (40, 4), (569, 3), (710, 3), (342, ...","[(1, 1.0), (13, 0.6666666666666666), (14, 0.66...","[(675, 0.5095658223243495), (135, 0.4367707048...","[(1, 3), (293, 1), (720, 1), (721, 1), (736, 0..."
1,indonesia crude palm oil,"[indonesia, crude, palm, oil]","[(33, 13), (621, 12), (34, 11), (209, 8), (123...","[(4, 1.0), (6, 1.0), (209, 0.5), (281, 0.5), (...","[(762, 0.48707909813666866), (266, 0.434203698...","[(34, 6), (4, 5), (660, 5), (6, 4), (268, 2), ..."
2,price of nickel,"[price, nickel]","[(572, 19), (639, 8), (108, 7), (148, 7), (736...","[(724, 1.0), (4, 0.5), (7, 0.5), (20, 0.5), (2...","[(50, 0.24460301234499893), (537, 0.2066299280...","[(724, 0)]"
3,north yemen sugar,"[north, yemen, sugar]","[(700, 12), (96, 8), (494, 7), (296, 6), (525,...","[(30, 1.0), (758, 1.0), (47, 0.666666666666666...","[(494, 0.3808351739278394), (30, 0.35115970582...","[(758, 2), (30, 2), (851, 0), (47, 0)]"
4,nippon steel,"[nippon, steel]","[(40, 9), (253, 8), (444, 7), (223, 2), (435, ...","[(40, 1.0), (123, 0.5), (223, 0.5), (253, 0.5)...","[(223, 0.5682589478261134), (40, 0.42228417223...","[(40, 5)]"
5,china,[china],"[(721, 5), (40, 4), (736, 4), (569, 3), (710, ...","[(1, 1.0), (13, 1.0), (14, 1.0), (28, 1.0), (4...","[(720, 0.23628400704672192), (721, 0.202870107...",0
6,gold,[gold],"[(997, 6), (20, 5), (797, 5), (341, 4), (347, ...","[(8, 1.0), (12, 1.0), (20, 1.0), (32, 1.0), (2...","[(304, 0.30902054113001826), (20, 0.2575171176...",0
7,trade,[trade],"[(0, 15), (169, 10), (544, 10), (761, 8), (273...","[(285, 2.0), (701, 2.0), (713, 2.0), (923, 2.0...","[(223, 0.24728127372797265), (449, 0.247281273...",0


In [18]:
print(alldocslist[1])

CHINA DAILY SAYS VERMIN EAT 712 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showed vermin consume between seven and 12 pct of Chinas grain
  stocks the China Daily said
      It also said that each year 1575 mln tonnes or 25 pct of
  Chinas fruit output are left to rot and 21 mln tonnes or up
  to 30 pct of its vegetables The paper blamed the waste on
  inadequate storage and bad preservation methods
      It said the government had launched a national programme to
  reduce waste calling for improved technology in storage and
  preservation and greater production of additives The paper
  gave no further details
  




### ranking

In [19]:
def rank(term):
    results = search(term)

    num_score = results[2]
    per_score = results[3]
    tfscore = results[4]
    order_score = results[5]

    final_candidates = []

    try:
        first_candidates = []

        for candidates in order_score:
            if candidates[1] > 1:
                first_candidates.append(candidates[0])
        
        second_candidates = []

        for match_candidates in per_score:
            if match_candidates[1] == 1:
                second_candidates.append(match_candidates[0])
            if match_candidates[1] == 1 and match_candidates[0] in first_candidates:
                final_candidates.append(match_candidates[0])
        
        t3_order = first_candidates[0:3]
        for each in t3_order:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates), each)

        final_candidates.insert(len(final_candidates), tfscore[0][0])
        final_candidates.insert(len(final_candidates), tfscore[1][0])

        t3_per = second_candidates[0:3]
        for each in t3_per:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates), each)
        
        othertops = [num_score[0][0], per_score[0][0], tfscore[0][0], order_score[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates), top)
        
    except:
        othertops = [num_score[0][0], num_score[1][0], num_score[2][0], per_score[0][0], tfscore[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates), top)
    
    for index, results in enumerate(final_candidates):
        if index < 5:
            print("RESULT", index + 1, ":", alldocslist[results][0:500], "...")

In [20]:
rank('indonesia crude palm oil')

RESULT 1 : INDONESIA SEES CPO PRICE RISING SHARPLY
  Indonesia expects crude palm oil CPO
  prices to rise sharply to between 450 and 550 dlrs a tonne FOB
  sometime this year because of better European demand and a fall
  in Malaysian output Hasrul Harahap junior minister for tree
  crops told Indonesian reporters
      Prices of Malaysian and Sumatran CPO are now around 332
  dlrs a tonne CIF for delivery in Rotterdam traders said
      Harahap said Indonesia would maintain its exports despite
  making r ...
RESULT 2 : INDONESIAN COMMODITY EXCHANGE MAY EXPAND
  The Indonesian Commodity Exchange is
  likely to start trading in at least one new commodity and
  possibly two during calendar 1987 exchange chairman Paian
  Nainggolan said
      He told Reuters in a telephone interview that trading in
  palm oil sawn timber pepper or tobacco was being considered
      Trading in either crude palm oil CPO or refined palm oil
  may also be introduced But he said the question was still
  being

In [21]:
rank('china')

RESULT 1 : CHINA CHILE TO BUILD COPPER TUBE PLANT IN CHINA
  Chinas stateowned Beijing NonFerrous
  Metals Industrial Corp and ltWrought Copper Ltd of Chile signed
  a contract to jointly build a copper tube plant on the
  outskirts of Peking the China Daily said
      The BeijingSantiago Copper Tube Co involves an investment
  of 993 mln dlrs and will on completion have a production
  capacity of 5000 tonnes of copper tubes a year it said
      It said Chile will supply copper at preferential rates to
  t ...
RESULT 2 : NIPPON STEEL DENIES CHINA SEEKING JAPANESE PLANTS
  Nippon Steel Corp ltNSTCT denied local
  newspaper reports that China has been seeking to buy steel
  plants from Japanese firms which plan to suspend output under
  the recently announced rationalisation program
      The Mainichi Shimbun quoted Nippon Steel as saying that
  Chinas State Planning Commission and some Chinese firms have
  asked Japanese makers to sell them steel works and rolling
  mills to expand stee