diff --git a/classrank/database/tables.py b/classrank/database/tables.py index 125f7c0..b9cbebe 100644 --- a/classrank/database/tables.py +++ b/classrank/database/tables.py @@ -102,5 +102,8 @@ class Rating(Base): student_id = Column(Integer, ForeignKey('student.uid'), primary_key=True) section_id = Column(Integer, ForeignKey('section.uid'), primary_key=True) rating = Column(Integer, nullable=True) + difficulty = Column(Integer, nullable=True) + workload = Column(Integer, nullable=True) + grade = Column(Integer, nullable=True) section = relationship('Section', backref='ratings') student = relationship('Student', backref='ratings') diff --git a/classrank/filters/collabfilter.py b/classrank/filters/collabfilter.py index 3d24d93..58a8282 100644 --- a/classrank/filters/collabfilter.py +++ b/classrank/filters/collabfilter.py @@ -1,25 +1,39 @@ import numpy as np from sklearn.decomposition import TruncatedSVD from scipy import sparse +from classrank.filters.datawrapper import DataWrapper class CollaborativeFilter: #This takes in a matrix - def __init__(self, data, numRecommendations): - self.dataset = data + def __init__(self, data=dict(), numRecommendations=1, db=None, metric="rating", school="gatech"): + self.dataset = DataWrapper(instances=data, db=db, school=school, metric=metric) self.updated = False self.sparsedata = None self.sparseifyData() - self.svd = TruncatedSVD() - self.model = self.svd.inverse_transform(self.svd.fit_transform(self.sparsedata)) - - def getRecommendation(self, row, column): + try: + self.svd = TruncatedSVD(n_components=numRecommendations) + self.model = self.svd.inverse_transform(self.svd.fit_transform(self.sparsedata)) + except ValueError: + self.svd = None + self.model = None + raise ValueError("Not enough ratings for predictions") + + def getRecommendation(self, instances): if(self.updated): self.sparseifyData() self.model = self.svd.inverse_transform(self.svd.fit_transform(self.sparsedata)) self.updated = False - return 
class DataWrapper:
    """Hold instance -> feature -> value data as a nested dict and a dense matrix.

    ``dataDict`` maps each instance key to a dict of feature key -> value.
    ``data`` is a list of rows (one per instance) with one column per
    feature; cells without a value hold ``None``. ``instanceLookup`` and
    ``featureLookup`` map keys to their row and column indices.
    """

    def __init__(self, instances=None, db=None, school="gatech", metric="rating"):
        """Build the lookups and the matrix from ``instances`` and/or a database.

        Args:
            instances: nested dict ``{instance: {feature: value}}``. It is
                stored by reference, not copied. Defaults to a new empty dict.
            db: database object handed to ``Query``; when truthy, ratings are
                loaded from it into the data dict via ``queryDB``.
            school: school abbreviation used to filter the database query.
            metric: name of the rating attribute read from each rating row.
        """
        self.db = db
        # A fresh dict per wrapper: a shared default dict would leak data
        # (including rows loaded by queryDB) between unrelated wrappers.
        self.dataDict = {} if instances is None else instances
        if db:
            self.school = school
            self.metric = metric
            self.queryDB()
        self.instanceLookup = {}
        self.featureLookup = {}
        self.data = []
        self.createLookups()
        self.convertData()

    def createLookups(self):
        """Give every unseen instance a row index and every unseen feature a column index.

        Existing assignments are never changed. New keys continue numbering
        from the current lookup size, so indices stay unique when this is
        called again after more data has been added.
        """
        for instance, features in self.dataDict.items():
            if instance not in self.instanceLookup:
                self.instanceLookup[instance] = len(self.instanceLookup)
            for feature in features:
                if feature not in self.featureLookup:
                    self.featureLookup[feature] = len(self.featureLookup)

    def _growData(self):
        """Pad the matrix in place with ``None`` until it matches the lookup sizes."""
        width = len(self.featureLookup)
        for row in self.data:
            row.extend([None] * (width - len(row)))
        missing_rows = len(self.instanceLookup) - len(self.data)
        self.data.extend([None] * width for _ in range(missing_rows))

    def convertData(self):
        """Copy every value of the data dict into its matrix cell.

        Raises:
            KeyError: if the data dict holds a key that has no lookup entry
                (i.e. ``createLookups`` was not run after the key was added).
        """
        # Make room first so newly registered rows/columns can be written.
        self._growData()
        for instance, features in self.dataDict.items():
            row = self.data[self.instanceLookup[instance]]
            for feature, value in features.items():
                row[self.featureLookup[feature]] = value

    def addData(self, instances):
        """Merge ``instances`` (``{instance: {feature: value}}``) into the stored data.

        Values for known instances are updated in place; unknown instances
        are added. The lookups and the matrix are then brought up to date.
        """
        for instance, features in instances.items():
            if instance in self.dataDict:
                self.dataDict[instance].update(features)
            else:
                # Copy so that later updates do not mutate the caller's dict.
                self.dataDict[instance] = dict(features)
        # Rewrites every cell; simple, but more work than strictly necessary.
        self.createLookups()
        self.convertData()

    def getData(self, *args):
        """Return one cell or the whole matrix.

        With exactly two positional arguments ``(instance, feature)`` the
        value of that cell is returned; with any other argument count the
        full matrix (list of rows) is returned.
        """
        if len(args) == 2:
            instance, feature = args
            return self.data[self.instanceLookup[instance]][self.featureLookup[feature]]
        return self.data

    def getInstanceLookup(self):
        """Return the dict mapping instance keys to row indices."""
        return self.instanceLookup

    def getFeatureLookup(self):
        """Return the dict mapping feature keys to column indices."""
        return self.featureLookup

    def getDataDict(self):
        """Return the nested ``{instance: {feature: value}}`` dict."""
        return self.dataDict

    def getRow(self, instance):
        """Return the row index of ``instance`` (``KeyError`` if unknown)."""
        return self.instanceLookup[instance]

    def getColumn(self, feature):
        """Return the column index of ``feature`` (``KeyError`` if unknown)."""
        return self.featureLookup[feature]

    def queryDB(self):
        """Load ratings from the database into the data dict.

        For each student returned by the query this stores
        ``dataDict[student.uid] = {course name: value of self.metric}``.
        """
        with Query(self.db) as query:
            # NOTE(review): this filters on school.abbreviation without an
            # explicit join between student and school -- confirm it really
            # restricts the result to students of ``self.school``.
            students = query.query(self.db.student).filter(
                self.db.school.abbreviation == self.school).all()
            for student in students:
                results = query.query(self.db.rating, self.db.section).filter(
                    self.db.rating.student_id == student.uid).filter(
                    self.db.rating.section_id == self.db.section.uid).all()
                instance = {}
                for rating_row, section in results:
                    course = query.query(self.db.course).filter(
                        self.db.course.uid == section.course_id).first()
                    # Keyed by course name, so a later section of the same
                    # course overwrites an earlier one for this student.
                    instance[course.name] = getattr(rating_row, self.metric)
                self.dataDict[student.uid] = instance
self.wrapper = DataWrapper(self.dataset) + + def test_create_lookups(self): + temp = {} + self.assertIsInstance(self.wrapper.getInstanceLookup(), type(temp)) + self.assertIsInstance(self.wrapper.getFeatureLookup(), type(temp)) + + def test_getters(self): + temp = {} + self.assertIsInstance(self.wrapper.getInstanceLookup(), type(temp)) + self.assertNotEqual(self.wrapper.getInstanceLookup(), type(temp)) + + self.assertIsInstance(self.wrapper.getFeatureLookup(), type(temp)) + self.assertNotEqual(self.wrapper.getFeatureLookup(), type(temp)) + + self.assertEqual(self.wrapper.getDataDict(), self.dataset) + + self.assertNotEqual(self.wrapper.getData(), [[None],[None],[None],[None],[None],[None],[None]]) + + def test_add_data(self): + tempData = copy.deepcopy(self.wrapper.getData()) + tempDataDict = copy.deepcopy(self.wrapper.getDataDict()) + + instance = {'Casey' : { 'Snakes on a Plane': 5.0, 'Superman Returns' : 3.4}} + instance['Lisa Rose'] = {'The Night Listener' : 5.0} + self.wrapper.addData(instance) + self.assertNotEqual(tempDataDict, self.wrapper.getDataDict()) + self.assertNotEqual(tempData, self.wrapper.getData()) + + def test_convert_data(self): + self.assertNotEqual(self.wrapper.getData(), [[None],[None],[None],[None],[None],[None],[None]]) diff --git a/test/test_filter.py b/test/test_filter.py index b6d66bd..83c1c2b 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -1,5 +1,5 @@ import unittest - +from unittest.mock import Mock, MagicMock, patch from classrank.filters.collabfilter import CollaborativeFilter import numpy as np from scipy import sparse @@ -7,25 +7,84 @@ class TestSVDFilter(unittest.TestCase): def setUp(self): - self.data = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],[3.0, 3.5, 1.5, 5.0, 3.0, 3.5],[2.5, 3.0, None, 3.5, 4.0, None],[None, 3.5, 3.0, 4.0, 4.5, 2.5], [3.5, 4.0, 2.0, 3.0, 3.0, 2.0], [3.0, 4.0, None, 5.0, 3.0, 3.5], [None, 4.5, None, 4.0, None, 1.0]] + self.data = { + 'Lisa Rose': { + 'Lady in the Water': 2.5, + 'Snakes on a Plane': 3.5, 
+ 'Just My Luck': 3.0, + 'Superman Returns': 3.5, + 'You, Me and Dupree': 2.5, + 'The Night Listener': 3.0 + }, + 'Gene Seymour': { + 'Lady in the Water': 3.0, + 'Snakes on a Plane': 3.5, + 'Just My Luck': 1.5, + 'Superman Returns': 5.0, + 'The Night Listener': 3.0, + 'You, Me and Dupree': 3.5 + }, + 'Michael Phillips': { + 'Lady in the Water': 2.5, + 'Snakes on a Plane': 3.0, + 'Superman Returns': 3.5, + 'The Night Listener': 4.0 + }, + 'Claudia Puig': { + 'Snakes on a Plane': 3.5, + 'Just My Luck': 3.0, + 'The Night Listener': 4.5, + 'Superman Returns': 4.0, + 'You, Me and Dupree': 2.5 + }, + 'Mick LaSalle': { + 'Lady in the Water': 3.0, + 'Snakes on a Plane': 4.0, + 'Just My Luck': 2.0, + 'Superman Returns': 3.0, + 'The Night Listener': 3.0, + 'You, Me and Dupree': 2.0 + }, + 'Jack Matthews': { + 'Lady in the Water': 3.0, + 'Snakes on a Plane': 4.0, + 'The Night Listener': 3.0, + 'Superman Returns': 5.0, + 'You, Me and Dupree': 3.5 + }, + 'Toby': { + 'Snakes on a Plane':4.5, + 'You, Me and Dupree':1.0, + 'Superman Returns':4.0 + } + } + self.instance = 'Gene Seymour' + self.feature = 'Snakes on a Plane' + self.testInstance = { 'Gene Seymour': { 'Snakes on a Plane': 10} } + self.recTester = { 'Gene Seymour': ['Snakes on a Plane'] } self.fltr = CollaborativeFilter(self.data, 1) + self.test2Instance = {'Gene Seymour' : {'Snakes on a Plane' : 20}} def test_update_value(self): - self.fltr.updateValue(2, 2, 10) - self.assertEqual(10,self.fltr.getData()[2][2]) + self.fltr.updateValues(self.testInstance) + self.assertEqual(10,self.fltr.getDataDict()[self.instance][self.feature]) def test_get_recommendation(self): - recom = self.fltr.getRecommendation(2,2) + recom = self.fltr.getRecommendation(self.recTester) self.assertIsNot(recom, None) def test_force_model_update(self): model = self.fltr.getModel() - self.fltr.updateValue(2, 2, 10) + self.fltr.updateValues(self.testInstance) self.fltr.forceModelUpdate() self.assertTrue(self.listNotEqual(model, self.fltr.getModel())) + 
class TestDatabaseFilter(unittest.TestCase):
    """Exercise CollaborativeFilter when its data is loaded from a database."""

    def setUp(self):
        """Populate the database with one school, two courses, three sections,
        two students and one rating per student."""
        engine_url = os.environ.get("CONNECTION", "sqlite:///:memory:")
        self.conn = Database(engine=engine_url)
        db = self.conn

        school = db.school(name="Georgia Tech", abbreviation="gatech")
        course = db.course(school=school, name="Intro Java", number="1331", subject="CS")
        course2 = db.course(school=school, name="Stuff", number="1332", subject="CS")
        section1 = db.section(course=course, semester="fall", year=2016, name="A1")
        section2 = db.section(course=course, semester="fall", year=2016, name="A2")
        self.section3 = db.section(course=course2, semester="spring", year=2015, name="A")
        account = db.account(username="test", email_address="test@test.com",
                             password_hash=b"t", password_salt=b"t")
        student = db.student(account=account, school=school)
        account2 = db.account(username="test2", email_address="test2@test.com",
                              password_hash=b"t", password_salt=b"t")
        self.student2 = db.student(account=account2, school=school)

        # Same insertion order as before; ratings are created inside the
        # query context, after the entities they refer to have been added.
        entities = (school, course, section1, section2, course2,
                    self.section3, account, student, self.student2)
        with Query(db) as q:
            for entity in entities:
                q.add(entity)
            q.add(db.rating(student=student, section=section1, rating=5))
            q.add(db.rating(student=self.student2, section=section2, rating=3))

    def test_filter_query(self):
        """Building the filter raises ValueError with only the setUp ratings
        and succeeds once one more rating has been stored."""
        with self.assertRaises(ValueError):
            CollaborativeFilter(db=self.conn)

        with Query(self.conn) as q:
            extra = self.conn.rating(student=self.student2, section=self.section3, rating=4)
            q.add(extra)

        cf = CollaborativeFilter(db=self.conn)
        self.assertIsInstance(cf.getData(), list)