Skip to content

Commit

Permalink
Merge pull request #17 from AayushSabharwal/master
Browse files Browse the repository at this point in the history
Updated the test cases
  • Loading branch information
descentis committed Sep 18, 2020
2 parents 64adfa8 + 8227ac6 commit 6ba6359
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 38 deletions.
72 changes: 37 additions & 35 deletions kdap/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,33 @@
@author: descentis
"""
import xml.etree.ElementTree as ET
import math
import copy
import glob
import numpy as np
from multiprocessing import Process, Lock, Manager
from datetime import datetime
import math
import os
import re
import sqlite3
import string
import os
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime
from multiprocessing import Process, Lock, Manager
from os.path import expanduser

import mwparserfromhell
from nltk.tokenize import word_tokenize
import copy
from kdap.converter.wikiConverter import wikiConverter
import numpy as np
import wikipedia
import sqlite3
from bx.misc.seekbzip2 import SeekableBzip2File
from internetarchive import download
from pyunpack import Archive
from os.path import expanduser
from kdap.wikiextract.wikiExtract import wikiExtract
from mwviews.api import PageviewsClient
from nltk.tokenize import word_tokenize
from pyunpack import Archive

from kdap.converter.qaConverter import qaConverter
from kdap.wikiextract.knolml_wikiextractor import QueryExecutor
from kdap.converter.wikiConverter import wikiConverter
from kdap.converter.wiki_clean import getCleanText
from collections import Counter
from kdap.wiki_graph import graph_creater as gp
from kdap.wikiextract.wikiExtract import wikiExtract


class instances(object):
Expand Down Expand Up @@ -264,12 +265,12 @@ def get_text(self, *args, **kwargs):

if self.instance_attrib['Body']['Text'].get('text') is not None:
di['text'] = self.instance_attrib['Body']['Text']['text']

clean = False
if kwargs.get('clean') is not None:
clean = kwargs['clean']
if clean:
di['text'] = getCleanText(di['text'])

'''
qe = QueryExecutor()
qe.setOutputFileDirectoryName('lol')
Expand Down Expand Up @@ -778,7 +779,8 @@ def get_wiki_article_by_class(self, *args, **kwargs):
articles = self.display_data(
"select article_nm from article_desc where article_id in " + article_id + ";", conn)
else:
articles = self.download_dataset(sitename='wikipedia', category_list=['WikiProject Mathematics articles'],
articles = self.download_dataset(sitename='wikipedia',
category_list=['WikiProject Mathematics articles'],
download=False)

if kwargs.get('wiki_class') is not None:
Expand Down Expand Up @@ -1134,7 +1136,7 @@ def __countRev(self, *args, **kwargs):
else:
kwargs['revisionLength']['answers'] += 1
l.release()

total_rev += 1
for ch1 in elem:
if 'TimeStamp' in ch1.tag:
Expand All @@ -1159,8 +1161,8 @@ def __countRev(self, *args, **kwargs):
total_rev_dict[t.year] = 1
else:
total_rev_dict[t.year] += 1
#yet to include the daily edits
# yet to include the daily edits

elem.clear()
root_wiki.clear()
except:
Expand Down Expand Up @@ -1419,7 +1421,7 @@ def __get_editor(self, *args, **kwargs):
uList = []
editor_dict = {}
editor_bool = 1
#try:
# try:
for event, elem in context_wiki:
if event == "end" and 'Instance' in elem.tag:
for newch in elem:
Expand Down Expand Up @@ -1483,8 +1485,8 @@ def __get_editor(self, *args, **kwargs):
uList.append(U)
elem.clear()
root_wiki.clear()
#except:
#print('problem with file parsing: ' + f)
# except:
# print('problem with file parsing: ' + f)
if (kwargs.get('users') != None):
if kwargs.get('dir_path') != None:
f = f.replace(kwargs['dir_path'] + '/', '')
Expand Down Expand Up @@ -2959,7 +2961,7 @@ def get_local_gini_coefficient(*args, **kwargs):

l = Lock()
processDict = {}

if (fileNum < cnum):
pNum = fileNum
else:
Expand Down Expand Up @@ -3220,8 +3222,8 @@ def findAllTags(list_tags, *args, **kwargs):
'''
return tagPosts

# Graph Methods for wikipedia articles
# Graph Methods for wikipedia articles

def get_induced_graph_by_articles(self, article_names):
''' Given a list of Wikipedia article names, the function returns the adjacency list of inter-wiki links
Expand All @@ -3236,9 +3238,9 @@ def get_induced_graph_by_articles(self, article_names):
An adjacency list of inter-wiki graph
'''
adj_list = gp.get_inter_graph(article_names)

return adj_list

def get_induced_graph_by_article(self, article_name):
''' Given a Wikipedia article name, the function returns the adjacency list of inter-wiki links present in that article
Expand All @@ -3252,11 +3254,11 @@ def get_induced_graph_by_article(self, article_name):
\*\*adj_list : list
An adjacency list of inter-wiki graph
'''

adj_list = gp.get_graph_by_name(article_name)

return adj_list

def get_city_graph_by_country(self, country_name):
''' Given a country name, the function returns the adjacency list of inter-wiki links for the cities in that country
Expand All @@ -3269,7 +3271,7 @@ def get_city_graph_by_country(self, country_name):
-------
\*\*adj_list : list
An adjacency list of inter-wiki graph
'''
'''
adj_list = gp.get_cities_by_country(country_name)
return adj_list

return adj_list
32 changes: 29 additions & 3 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from kdap import analysis
import os
import shutil
import json


class TestAnalysis(unittest.TestCase):
Expand All @@ -10,18 +11,43 @@ def setUp(self):
self.ropar_filename = 'Indian_Institute_of_Technology_Ropar.knolml'
self.zinc_filename = 'Zinc.knolml'
self.k = analysis.knol()
with open('test_data.txt', 'r') as infile:
self.frames_data = json.loads(infile.read())

def get_wiki_article(self):
    """Download a single Wikipedia article and verify the knolml file appears.

    Not prefixed with ``test_`` so the unittest runner does not pick it up
    automatically; cleans up the downloaded file afterwards.
    """
    self.k.get_wiki_article(article_name='IIT Ropar', output_dir=self.test_dir)
    downloaded = self.test_dir + self.ropar_filename
    self.assertTrue(os.path.exists(downloaded))
    os.remove(downloaded)

def test_download_dataset_article_list(self):
    """Download two articles by name and check both knolml files exist,
    then run the frame comparison on the downloaded data."""
    site = 'wikipedia'
    # NOTE(review): the two download_dataset calls below look like the
    # pre- and post-change versions of the same line flattened together by
    # the diff rendering — confirm against the actual commit; only one
    # call should remain in the real source.
    self.k.download_dataset(sitename=site, article_list=['IIT Ropar', 'Zinc'], destdir=self.test_dir)
    self.k.download_dataset(sitename='wikipedia', article_list=['IIT Ropar', 'Zinc'], destdir=self.test_dir)
    self.assertTrue(os.path.exists(self.test_dir + self.ropar_filename))
    self.assertTrue(os.path.exists(self.test_dir + self.zinc_filename))
    self.frame_test()

# tests are run in alphabetical order, so download_dataset will always run
# before, ensuring this has the necessary knolml file
def frame_test(self):
    """Walk every revision of the downloaded knolml file and check that each
    extracted attribute value is present in the reference data loaded in
    setUp (``self.frames_data``).

    Called from the download test rather than run directly, since it needs
    the knolml file that the download test produces.
    """
    fields = ('id', 'title', 'bytes', 'editor', 'time', 'text', 'stats')
    collected = {field: [] for field in fields}

    for revision in self.k.frame(file_name=self.test_dir + self.ropar_filename):
        collected['id'].append(revision.instanceId)
        collected['title'].append(revision.get_title())
        collected['bytes'].append(revision.get_bytes())
        collected['editor'].append(revision.get_editor())
        collected['time'].append(revision.get_timestamp())
        collected['text'].append(revision.get_text())
        collected['stats'].append(revision.get_text_stats())

    # Every observed value must appear in the reference data for its field.
    for key in self.frames_data.keys():
        self.assertTrue(all(value in self.frames_data[key] for value in collected[key]))

def tearDown(self):
if os.path.exists(self.test_dir):
Expand Down
1 change: 1 addition & 0 deletions tests/test_data.txt

Large diffs are not rendered by default.

0 comments on commit 6ba6359

Please sign in to comment.