In [1]:
import requests
import re
import os
import sys
from os import listdir
from os.path import isfile, isdir, join
import string
from math import log, sqrt
import numpy as np

In [2]:
# One HTTP session shared by every Wikipedia API request made in the cells below.
S = requests.Session()

In [3]:
# Endpoint of the MediaWiki Action API on the Turkish Wikipedia.
URL = "https://tr.wikipedia.org/w/api.php"

# Query parameters for the `allcategories` listing: JSON output, up to 500
# categories per request, with `acmin` set to 100.
PARAMS = dict(
    action="query",
    format="json",
    list="allcategories",
    acmin=100,
    aclimit=500,
)

In [4]:
# Download category names from the `allcategories` listing (at most 100
# requests) and write them to wiki_categories.txt, one name per line.
# Names containing a run of four digits are skipped.

# Work on a copy so the continuation marker does not leak into the shared
# PARAMS dict; re-running this cell then starts from the first page again.
params = dict(PARAMS)

# The context manager closes the file even if a request fails; UTF-8 is pinned
# because category names contain non-ASCII (Turkish) characters.
with open("wiki_categories.txt", 'w', encoding="utf-8") as f:
    for _ in range(100):
        R = S.get(url=URL, params=params)
        # Fail loudly on an HTTP error instead of on an unrelated KeyError below.
        R.raise_for_status()
        DATA = R.json()

        CATEGORIES = DATA["query"]["allcategories"]

        for cat in CATEGORIES:
            cat_name = cat["*"]
            if not re.search("[0-9]{4}", cat_name):
                f.write(cat_name+'\n')

        # Keep paging while the API reports that more results are available.
        if "continue" in DATA:
            params["acfrom"] = DATA["continue"]["accontinue"]
        else:
            break

In [5]:
def read_categories(filename="tmp_cat.txt"):
    """Read category names, one per line, from a UTF-8 text file.

    Args:
        filename: Path of the category list. Defaults to "tmp_cat.txt",
            the file this function has always read.

    Returns:
        list[str]: The non-blank lines of the file, in file order.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
    # mark, so the first category name does not start with "\ufeff".
    with open(filename, 'r', encoding="utf-8-sig") as f:
        lines = f.read().splitlines()
    # A blank line would otherwise become an empty category name.
    return [line for line in lines if line.strip()]

In [None]:
# Load the category names to process and echo them for a quick visual check.
categories = read_categories()
print(categories)

In [7]:
# For every category, create categories/<name_with_underscores>/ and store the
# "<pageid> <title>" of its member pages in titles.txt (at most 5 requests of
# 100 members each). Members whose title starts with "Kategori:" are skipped.
os.makedirs("categories", exist_ok=True)
for cat in categories:
    cat = cat.replace("\ufeff", "")
    cat_dir = "categories/"+cat.replace(' ','_')
    os.makedirs(cat_dir, exist_ok=True)

    PARAMS = {
        "action": "query",
        "list": "categorymembers",
        "format": "json",
        "cmtitle": "Category:"+cat,
        "cmlimit": "100"
    }

    # The context manager closes the file even if a request fails; UTF-8 is
    # pinned because page titles contain non-ASCII (Turkish) characters.
    with open(os.path.join(cat_dir,"titles.txt"),'w', encoding="utf-8") as title_file:
        for _ in range(5):
            R = S.get(url=URL, params=PARAMS)
            # Fail loudly on an HTTP error instead of on a KeyError below.
            R.raise_for_status()
            DATA = R.json()

            PAGES = DATA["query"]["categorymembers"]

            for page in PAGES:
                title = page["title"]
                ID = str(page["pageid"])
                if not title.startswith("Kategori:"):
                    title_file.write(ID+' '+title+'\n')

            # Keep paging while the API reports more members.
            if "continue" in DATA:
                PARAMS["cmcontinue"] = DATA["continue"]["cmcontinue"]
            else:
                break

In [8]:
def read_titles(filename):
    """Read a titles file whose lines look like "<id> <title words...>".

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        tuple[list[str], list[str]]: Parallel lists of ids and titles, in
        file order. Whitespace runs inside a title are collapsed to a single
        space. Blank lines are skipped.
    """
    IDs = []
    titles = []
    # The context manager guarantees the handle is closed.
    with open(filename, 'r', encoding="utf-8") as f:
        for l in f:
            # split() already discards the trailing newline.
            fields = l.split()
            if not fields:
                continue
            IDs.append(fields[0])
            titles.append(' '.join(fields[1:]))
    return IDs,titles

In [None]:
# For every category, fetch the plain-text intro extract of each listed page
# and append it to categories/<name>/linear.txt as
#   <doc id="..." title="...">\n<extract>\n</doc>\n\n
for cat in categories:
    cat = cat.replace("\ufeff", "")
    print("Processing category",cat)
    cat_dir = "categories/"+cat.replace(' ','_')
    title_file = os.path.join(cat_dir,"titles.txt")
    IDs, titles = read_titles(title_file)

    # The context manager closes the file even if a request fails; UTF-8 is
    # pinned because titles and extracts contain non-ASCII characters.
    with open(os.path.join(cat_dir,"linear.txt"),'w', encoding="utf-8") as content_file:
        for page_id, title in zip(IDs, titles):
            PARAMS = {
                "action": "query",
                "prop": "extracts",
                "format": "json",
                "exintro": True,
                "explaintext": True,
                "redirects": True,
                "titles": title
            }

            R = S.get(url=URL, params=PARAMS)
            # Fail loudly on an HTTP error instead of on a KeyError below.
            R.raise_for_status()
            DATA = R.json()

            PAGES = DATA.get("query", {}).get("pages", {})

            for page in PAGES:
                # An entry without an "extract" key (e.g. a page that no
                # longer exists) is skipped instead of aborting the crawl.
                extract = PAGES[page].get("extract")
                if extract is None:
                    continue
                content_file.write("<doc id=\""+page_id+"\" title=\""+title+"\">\n")
                content_file.write(extract+'\n')
                content_file.write("</doc>\n\n")

In [None]:
# Count character n-grams of a user-chosen size in every category's
# linear.txt and write them, most frequent first, to linear.<n>.ngrams
# as "<ngram>\t<count>" lines.
d = './categories'
# Sorted so the processing order does not depend on the file system.
catdirs = sorted(join(d,o) for o in listdir(d) if isdir(join(d,o)))

ng = int(input("The size of n-grams: "))
if ng < 1:
    # A non-positive size would make the slicing below produce garbage.
    raise ValueError("The size of n-grams must be a positive integer")

for cat in catdirs:
    ngrams = {}
    with open(join(cat,'linear.txt'),'r', encoding="utf-8") as f:
        for l in f:
            # Skip the <doc ...> / </doc> marker lines; only text is counted.
            if "<doc id" not in l and "</doc" not in l:
                l = l.rstrip('\n').lower()
                for i in range(len(l)-ng+1):
                    ngram = l[i:i+ng]
                    ngrams[ngram] = ngrams.get(ngram, 0) + 1

    with open(join(cat,"linear."+str(ng)+".ngrams"),'w', encoding="utf-8") as ngramfile:
        for k in sorted(ngrams, key=ngrams.get, reverse=True):
            ngramfile.write(k+'\t'+str(ngrams[k])+'\n')

In [26]:
def contain_punctuation(s):
    """Tell whether `s` contains an ASCII punctuation mark or a space.

    Args:
        s: The text to inspect.

    Returns:
        bool: True if any character of `string.punctuation`, or ' ',
        occurs in `s`; False otherwise.
    """
    for mark in string.punctuation + ' ':
        if mark in s:
            return True
    return False

In [27]:
def normalise_tfs(tfs,total):
    """Divide every value of `tfs` by `total`, in place.

    Args:
        tfs: Dict mapping a term to its raw frequency; modified in place.
        total: The divisor applied to every frequency.

    Returns:
        dict: The same `tfs` object, now holding the divided values.
    """
    scaled = {term: count / total for term, count in tfs.items()}
    tfs.update(scaled)
    return tfs

In [28]:
def log_idfs(idfs,num_cats):
    """Replace every count in `idfs` by log(num_cats / count), in place.

    Args:
        idfs: Dict mapping a term to a count; modified in place.
        num_cats: The numerator of the ratio whose natural log is taken.

    Returns:
        dict: The same `idfs` object, now holding the log values.
    """
    for term in list(idfs):
        idfs[term] = log(num_cats / idfs[term])
    return idfs

In [None]:
# Build per-category TF-IDF scores from the *.ngrams files, pick a shared
# vocabulary (up to TOP_NGRAMS_PER_CATEGORY new n-grams per category, without
# punctuation or spaces), then write one tf_idfs.txt per category and the
# sorted vocabulary to vocab_file.txt.
TOP_NGRAMS_PER_CATEGORY = 100

cat_tfs = {}
cat_tf_idfs = {}
idfs = {}

for cat in catdirs:
    tfs = {}
    sum_freqs = 0
    ngram_files = [join(cat,f) for f in listdir(cat) if isfile(join(cat, f)) and '.ngrams' in f]
    for ngram_file in ngram_files:
        with open(ngram_file,'r', encoding="utf-8") as f:
            for l in f:
                l = l.rstrip()
                if not l:
                    # A blank line carries no count and would crash int().
                    continue
                # The count is the last tab-separated field; everything
                # before it is the n-gram, which may itself contain tabs.
                ngram, _, freq = l.rpartition('\t')
                freq = int(freq)
                tfs[ngram] = freq
                sum_freqs+=freq
                # Incremented once per n-gram line read for a category.
                idfs[ngram] = idfs.get(ngram, 0) + 1

    tfs = normalise_tfs(tfs,sum_freqs)
    cat_tfs[cat] = tfs

idfs = log_idfs(idfs, len(catdirs))

vocab=[]
# Mirror of `vocab` as a set: constant-time membership tests while `vocab`
# itself keeps its insertion order.
vocab_seen = set()

for cat in catdirs:
    tfs = cat_tfs[cat]
    tf_idfs = {ngram: tf * idfs[ngram] for ngram,tf in tfs.items()}
    cat_tf_idfs[cat] = tf_idfs

    c = 0
    for k in sorted(tf_idfs, key=tf_idfs.get, reverse=True):
        if c == TOP_NGRAMS_PER_CATEGORY:
            break
        if k not in vocab_seen and not contain_punctuation(k):
            vocab.append(k)
            vocab_seen.add(k)
            c+=1

print("VOCAB SIZE:",len(vocab))

# Sorted once and reused for every output file.
sorted_vocab = sorted(vocab)

for cat in catdirs:
    tf_idfs = cat_tf_idfs[cat]
    with open(join(cat,'tf_idfs.txt'),'w', encoding="utf-8") as f:
        for ngram in sorted_vocab:
            # N-grams absent from this category get an explicit 0.0.
            f.write(ngram+' '+str(tf_idfs.get(ngram, 0.0))+'\n')


with open("./vocab_file.txt",'w', encoding="utf-8") as vocab_file:
    for ngram in sorted_vocab:
        vocab_file.write(ngram+'\n')

In [30]:
def read_vocab2():
    """Read ./vocab_file.txt and build both index mappings.

    Returns:
        tuple[dict[int, str], dict[str, int]]: `i_to_ngrams` maps a 0-based
        line number to the n-gram on that line, and `ngrams_to_i` is the
        reverse mapping.
    """
    i_to_ngrams = {}
    ngrams_to_i = {}
    with open('./vocab_file.txt','r', encoding="utf-8") as f:
        for c, l in enumerate(f):
            l = l.rstrip()
            i_to_ngrams[c] = l
            # Fill the reverse mapping; assigning `c` to the name itself
            # would replace the dict with a bare integer.
            ngrams_to_i[l] = c
    return i_to_ngrams, ngrams_to_i

In [31]:
def read_vocab():
    """Return the lines of vocab_file.txt (read as UTF-8) as a list of strings."""
    handle = open('vocab_file.txt', mode='r', encoding="utf-8")
    try:
        contents = handle.read()
    finally:
        handle.close()
    return contents.splitlines()

In [None]:
# Turn each category's tf_idfs.txt into a dense vector over the sorted
# vocabulary and write "<category dir> <v0> <v1> ..." lines to
# category_vectors.txt.
catdirs = [join(d,o) for o in listdir(d) if isdir(join(d,o))]
vocab = sorted(vocab)
print(vocab)

# Position lookup built once, replacing a linear vocab.index() scan per line.
vocab_index = {ngram: pos for pos, ngram in enumerate(vocab)}

with open('./category_vectors.txt','w', encoding="utf-8") as vector_file:
    for cat in catdirs:
        print(cat)
        vec = np.zeros(len(vocab))
        with open(join(cat,'tf_idfs.txt'),'r', encoding="utf-8") as f:
            for l in f:
                fields = l.rstrip('\n').split()
                if not fields:
                    continue
                # The last field is the score; everything before it is the n-gram.
                ngram = ' '.join(fields[:-1])
                tf_idf = float(fields[-1])
                # An n-gram missing from the vocabulary raises KeyError, which
                # signals a tf_idfs.txt that does not match the current vocab.
                vec[vocab_index[ngram]] = tf_idf

        vector_file.write(cat+' '+' '.join([str(v) for v in vec])+'\n')

In [33]:
def cosine_similarity(v1, v2):
    """Cosine of the angle between two vectors.

    Args:
        v1: First vector (any 1-D sequence accepted by `np.dot`).
        v2: Second vector, of the same length as `v1`.

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        or 0.0 when either vector has zero norm.
    """
    num = np.dot(v1, v2)
    norm_product = sqrt(np.dot(v1, v1)) * sqrt(np.dot(v2, v2))
    if norm_product == 0:
        # Dividing by zero would yield NaN (plus a runtime warning), and NaN
        # scores make any ranking built on this function arbitrary.
        return 0.0
    return num / norm_product

In [34]:
def read_queries(query_file):
    """Return the lines of `query_file` (read as UTF-8) as a list of strings.

    Args:
        query_file: Path of the text file holding one query per line.

    Returns:
        list[str]: The file's lines without their line terminators.
    """
    with open(query_file, encoding="utf-8") as query_handle:
        raw_text = query_handle.read()
    return raw_text.splitlines()

In [35]:
def read_category_vectors():
    """Load the category vectors stored in ./category_vectors.txt.

    Each non-blank line is "<category> <v0> <v1> ...", split on whitespace.

    Returns:
        dict[str, np.ndarray]: Maps the first field of each line to a float
        array built from the remaining fields.
    """
    vectors = {}
    # The context manager guarantees the handle is closed.
    with open('./category_vectors.txt','r', encoding="utf-8") as f:
        for l in f:
            fields = l.split()
            if not fields:
                # A blank line (e.g. at the end of the file) has no category.
                continue
            vectors[fields[0]] = np.array([float(v) for v in fields[1:]])
    return vectors

In [36]:
def get_ngrams(l,n):
    """Count the character n-grams of a lower-cased string.

    Args:
        l: The input text.
        n: The n-gram length.

    Returns:
        dict[str, int]: Maps each length-`n` substring of `l.lower()` to its
        number of occurrences, in order of first appearance.
    """
    text = l.lower()
    counts = {}
    for start in range(len(text) - n + 1):
        piece = text[start:start + n]
        counts[piece] = counts.get(piece, 0) + 1
    return counts


In [37]:
def mk_vector(vocab,tfs):
    """Project term weights onto a vocabulary as a dense vector.

    Args:
        vocab: Sequence of terms; a term's position is its vector index.
        tfs: Dict mapping a term to its weight.

    Returns:
        np.ndarray: Float vector of length `len(vocab)` holding the weight of
        each vocabulary term found in `tfs` and 0 elsewhere. Terms of `tfs`
        that are not in `vocab` are ignored.
    """
    vec = np.zeros(len(vocab))
    # Build the position lookup once instead of scanning `vocab` for every
    # term. setdefault keeps the first position of a repeated term, matching
    # list.index().
    positions = {}
    for pos, term in enumerate(vocab):
        positions.setdefault(term, pos)
    for t,f in tfs.items():
        pos = positions.get(t)
        if pos is not None:
            vec[pos] = f
    return vec

In [38]:
def tf_idf_calc(d):
    """Compute TF-IDF scores per page and the vocabulary they are defined on.

    Args:
        d: Dict mapping a page name to a dict of {ngram: count}.

    Returns:
        tuple: `(final_tf_idfs, vocab)` where `vocab` is a list of n-grams in
        selection order (for each page, up to 100 not-yet-selected n-grams
        with the highest TF-IDF that contain no punctuation or space), and
        `final_tf_idfs` maps each page to a dict giving the TF-IDF of every
        vocabulary n-gram (0.0 when the page lacks it), keyed in sorted order.
    """
    page_tfs = {}
    page_tf_idfs = {}
    idfs = {}
    for page, p_ngrams in d.items():
        tfs = {}
        sum_freqs = 0
        for ngram, f in p_ngrams.items():
            freq = int(f)
            tfs[ngram] = freq
            sum_freqs += freq
            # Number of pages in which the n-gram occurs.
            idfs[ngram] = idfs.get(ngram, 0) + 1
        page_tfs[page] = normalise_tfs(tfs, sum_freqs)
    idfs = log_idfs(idfs, len(d))

    vocab = []
    # Mirror of `vocab` as a set: constant-time membership tests while the
    # returned list keeps its insertion order.
    vocab_seen = set()
    for page in d:
        tf_idfs = {ngram: tf * idfs[ngram] for ngram, tf in page_tfs[page].items()}
        page_tf_idfs[page] = tf_idfs
        c = 0
        for k in sorted(tf_idfs, key=tf_idfs.get, reverse=True):
            if c == 100:
                break
            if k not in vocab_seen and not contain_punctuation(k):
                vocab.append(k)
                vocab_seen.add(k)
                c += 1

    # Sorted once and reused for every page.
    sorted_vocab = sorted(vocab)
    final_tf_idfs = {}
    for page in d:
        tf_idfs = page_tf_idfs[page]
        final_tf_idfs[page] = {ngram: tf_idfs.get(ngram, 0.0) for ngram in sorted_vocab}
    return final_tf_idfs, vocab

In [None]:
# Answer every query in query_file.txt: pick the category whose vector is most
# similar to the query's n-gram vector, then print the page of that category
# whose TF-IDF vector is most similar to the query, with its score.

def _rank(score):
    """Sort key that places an undefined (NaN) similarity below any real one."""
    return float("-inf") if np.isnan(score) else score


vectors = read_category_vectors()
queries = read_queries("query_file.txt")

for q in queries:
    print("\nQUERY:",q)
    # Query n-grams of sizes 4, 5 and 6 merged into one dict.
    q_ngrams = {}
    for i in range(4,7):
        q_ngrams.update(get_ngrams(q,i))
    print(q_ngrams)
    qvec = mk_vector(vocab,q_ngrams)

    # Checked before any similarity is computed: a query sharing no n-gram
    # with the vocabulary cannot be ranked against anything.
    if np.count_nonzero(qvec) == 0 or not vectors:
        print("No results found")
        continue

    cosines = {cat: cosine_similarity(vec,qvec) for cat,vec in vectors.items()}
    main_cat = max(cosines, key=lambda cat: _rank(cosines[cat]))

    # Parse <doc id="..." title="...">  text...  </doc> records into
    # {title: text}.
    output = {}
    with open(join(main_cat,'linear.txt'),'r', encoding="utf-8") as f:
        title = None
        doc_lines = []
        for l in f:
            if title is None:
                # Capture the title attribute explicitly so titles containing
                # "=" are read in full.
                m = re.match(r'<doc id="[^"]*" title="(.*)">', l)
                if m:
                    title = m.group(1)
                    doc_lines = []
            elif "</doc>" in l:
                output[title] = "".join(doc_lines)
                title = None
            else:
                doc_lines.append(l)
        if title is not None:
            # File ended inside a record: keep what was read instead of
            # waiting forever for a closing tag.
            output[title] = "".join(doc_lines)

    # NOTE(review): page n-grams use the size `ng` chosen in the n-gram
    # counting cell, while the query n-grams above have sizes 4-6; the two
    # can only overlap when 4 <= ng <= 6.
    f_output = {title: get_ngrams(text.replace("\n", ""), ng) for title, text in output.items()}

    final_measures, page_vocab = tf_idf_calc(f_output)
    page_index = {ngram: pos for pos, ngram in enumerate(page_vocab)}
    vecs = {}
    for page, ngrams in final_measures.items():
        vec = np.zeros(len(page_vocab))
        for ngram,tf_idf in ngrams.items():
            vec[page_index[ngram]] = float(tf_idf)
        vecs[page] = vec

    page_qvec = mk_vector(page_vocab,q_ngrams)
    page_cosines = {page: cosine_similarity(vec,page_qvec) for page,vec in vecs.items()}

    # Print only the best-matching page; nothing is printed when the category
    # has no pages.
    if page_cosines:
        best_page = max(page_cosines, key=lambda page: _rank(page_cosines[page]))
        print(best_page,page_cosines[best_page])