Skip to content

Commit

Permalink
Merge pull request #17 from AayushSabharwal/master
Browse files Browse the repository at this point in the history
Updated the test cases
  • Loading branch information
descentis committed Sep 18, 2020
2 parents 64adfa8 + 8227ac6 commit 6ba6359
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 38 deletions.
72 changes: 37 additions & 35 deletions kdap/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,33 @@
@author: descentis
"""
import xml.etree.ElementTree as ET
import math
import copy
import glob
import numpy as np
from multiprocessing import Process, Lock, Manager
from datetime import datetime
import math
import os
import re
import sqlite3
import string
import os
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime
from multiprocessing import Process, Lock, Manager
from os.path import expanduser

import mwparserfromhell
from nltk.tokenize import word_tokenize
import copy
from kdap.converter.wikiConverter import wikiConverter
import numpy as np
import wikipedia
import sqlite3
from bx.misc.seekbzip2 import SeekableBzip2File
from internetarchive import download
from pyunpack import Archive
from os.path import expanduser
from kdap.wikiextract.wikiExtract import wikiExtract
from mwviews.api import PageviewsClient
from nltk.tokenize import word_tokenize
from pyunpack import Archive

from kdap.converter.qaConverter import qaConverter
from kdap.wikiextract.knolml_wikiextractor import QueryExecutor
from kdap.converter.wikiConverter import wikiConverter
from kdap.converter.wiki_clean import getCleanText
from collections import Counter
from kdap.wiki_graph import graph_creater as gp
from kdap.wikiextract.wikiExtract import wikiExtract


class instances(object):
Expand Down Expand Up @@ -264,12 +265,12 @@ def get_text(self, *args, **kwargs):

if self.instance_attrib['Body']['Text'].get('text') is not None:
di['text'] = self.instance_attrib['Body']['Text']['text']

clean = False
if kwargs.get('clean') is not None:
clean = kwargs['clean']
if clean:
di['text'] = getCleanText(di['text'])

'''
qe = QueryExecutor()
qe.setOutputFileDirectoryName('lol')
Expand Down Expand Up @@ -778,7 +779,8 @@ def get_wiki_article_by_class(self, *args, **kwargs):
articles = self.display_data(
"select article_nm from article_desc where article_id in " + article_id + ";", conn)
else:
articles = self.download_dataset(sitename='wikipedia', category_list=['WikiProject Mathematics articles'],
articles = self.download_dataset(sitename='wikipedia',
category_list=['WikiProject Mathematics articles'],
download=False)

if kwargs.get('wiki_class') is not None:
Expand Down Expand Up @@ -1134,7 +1136,7 @@ def __countRev(self, *args, **kwargs):
else:
kwargs['revisionLength']['answers'] += 1
l.release()

total_rev += 1
for ch1 in elem:
if 'TimeStamp' in ch1.tag:
Expand All @@ -1159,8 +1161,8 @@ def __countRev(self, *args, **kwargs):
total_rev_dict[t.year] = 1
else:
total_rev_dict[t.year] += 1
#yet to include the daily edits
# yet to include the daily edits

elem.clear()
root_wiki.clear()
except:
Expand Down Expand Up @@ -1419,7 +1421,7 @@ def __get_editor(self, *args, **kwargs):
uList = []
editor_dict = {}
editor_bool = 1
#try:
# try:
for event, elem in context_wiki:
if event == "end" and 'Instance' in elem.tag:
for newch in elem:
Expand Down Expand Up @@ -1483,8 +1485,8 @@ def __get_editor(self, *args, **kwargs):
uList.append(U)
elem.clear()
root_wiki.clear()
#except:
#print('problem with file parsing: ' + f)
# except:
# print('problem with file parsing: ' + f)
if (kwargs.get('users') != None):
if kwargs.get('dir_path') != None:
f = f.replace(kwargs['dir_path'] + '/', '')
Expand Down Expand Up @@ -2959,7 +2961,7 @@ def get_local_gini_coefficient(*args, **kwargs):

l = Lock()
processDict = {}

if (fileNum < cnum):
pNum = fileNum
else:
Expand Down Expand Up @@ -3220,8 +3222,8 @@ def findAllTags(list_tags, *args, **kwargs):
'''
return tagPosts

# Graph Methods for wikipedia articles
# Graph Methods for wikipedia articles

def get_induced_graph_by_articles(self, article_names):
''' Given a list of Wikipedia article names, the function returns the adjacency list of inter-wiki links
Expand All @@ -3236,9 +3238,9 @@ def get_induced_graph_by_articles(self, article_names):
An adjacency list of inter-wiki graph
'''
adj_list = gp.get_inter_graph(article_names)

return adj_list

def get_induced_graph_by_article(self, article_name):
''' Given a Wikipedia article name, the function returns the adjacency list of inter-wiki links present in that article
Expand All @@ -3252,11 +3254,11 @@ def get_induced_graph_by_article(self, article_name):
\*\*adj_list : list
An adjacency list of inter-wiki graph
'''

adj_list = gp.get_graph_by_name(article_name)

return adj_list

def get_city_graph_by_country(self, country_name):
''' Given a country name, the function returns the adjacency list of inter-wiki links for the cities in that country
Expand All @@ -3269,7 +3271,7 @@ def get_city_graph_by_country(self, country_name):
-------
\*\*adj_list : list
An adjacency list of inter-wiki graph
'''
'''
adj_list = gp.get_cities_by_country(country_name)
return adj_list

return adj_list
32 changes: 29 additions & 3 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from kdap import analysis
import os
import shutil
import json


class TestAnalysis(unittest.TestCase):
Expand All @@ -10,18 +11,43 @@ def setUp(self):
self.ropar_filename = 'Indian_Institute_of_Technology_Ropar.knolml'
self.zinc_filename = 'Zinc.knolml'
self.k = analysis.knol()
with open('test_data.txt', 'r') as infile:
self.frames_data = json.loads(infile.read())

def get_wiki_article(self):
    """Download a single Wikipedia article and verify the knolml file appears.

    Not prefixed with ``test_`` so the unittest runner does not pick it up
    automatically; cleans up the downloaded file afterwards.
    """
    self.k.get_wiki_article(article_name='IIT Ropar', output_dir=self.test_dir)
    downloaded = self.test_dir + self.ropar_filename
    self.assertTrue(os.path.exists(downloaded))
    os.remove(downloaded)

def test_download_dataset_article_list(self):
    """Download two articles by name and check both knolml files exist,
    then run the frame comparison on the downloaded data."""
    site = 'wikipedia'
    # NOTE(review): the two download_dataset calls below look like the
    # pre- and post-change versions of the same line flattened together by
    # the diff rendering — confirm against the actual commit; only one
    # call should remain in the real source.
    self.k.download_dataset(sitename=site, article_list=['IIT Ropar', 'Zinc'], destdir=self.test_dir)
    self.k.download_dataset(sitename='wikipedia', article_list=['IIT Ropar', 'Zinc'], destdir=self.test_dir)
    self.assertTrue(os.path.exists(self.test_dir + self.ropar_filename))
    self.assertTrue(os.path.exists(self.test_dir + self.zinc_filename))
    self.frame_test()

# tests are run in alphabetical order, so download_dataset will always run
# before, ensuring this has the necessary knolml file
def frame_test(self):
    """Walk every revision of the downloaded knolml file and check that each
    extracted attribute value is present in the reference data loaded in
    setUp (``self.frames_data``).

    Called from the download test rather than run directly, since it needs
    the knolml file that the download test produces.
    """
    fields = ('id', 'title', 'bytes', 'editor', 'time', 'text', 'stats')
    collected = {field: [] for field in fields}

    for revision in self.k.frame(file_name=self.test_dir + self.ropar_filename):
        collected['id'].append(revision.instanceId)
        collected['title'].append(revision.get_title())
        collected['bytes'].append(revision.get_bytes())
        collected['editor'].append(revision.get_editor())
        collected['time'].append(revision.get_timestamp())
        collected['text'].append(revision.get_text())
        collected['stats'].append(revision.get_text_stats())

    # Every observed value must appear in the reference data for its field.
    for key in self.frames_data.keys():
        self.assertTrue(all(value in self.frames_data[key] for value in collected[key]))

def tearDown(self):
if os.path.exists(self.test_dir):
Expand Down
1 change: 1 addition & 0 deletions tests/test_data.txt

Large diffs are not rendered by default.

0 comments on commit 6ba6359

Please sign in to comment.