### Split the dataset into separate per-day files for easier handling

Splitting reader class

In [5]:
from classeslib.input_file_reader import InputFileReader

class SplitFileReader(InputFileReader):
    """Reader that routes every data row into a CSV file dedicated to its day.

    The file for a day is created lazily inside ``days_split_catalog`` the
    first time that day is seen, and stays open until ``close()`` is called.
    """

    def __init__(self, days_split_catalog):
        """Remember where the per-day files go.

        days_split_catalog -- path prefix for the per-day files. It is joined
            to the file name by plain string concatenation, so it has to end
            with a path separator.
        """
        self.days_split_catalog = days_split_catalog
        # day value -> open file object receiving that day's rows
        self.days_file_handlers = {}

    def close(self):
        """Close every per-day file opened so far."""
        for file_handler in self.days_file_handlers.values():
            file_handler.close()

    @staticmethod
    def format_file_name(day):
        """Return the file name (without directory) used for the given day."""
        return "train." + str(day) + ".csv"

    def handle_data_row(self, user_id, day, d1_category, d2_category, d3_category):
        """Append one data row to the file of its day, creating the file on first use."""
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if day in self.days_file_handlers:
            day_file_handler = self.days_file_handlers[day]
        else:
            day_file_path = self.days_split_catalog + SplitFileReader.format_file_name(day)
            day_file_handler = open(day_file_path, 'w')
            # Register the handle before the first write so that close() still
            # reaches it if writing the header fails.
            self.days_file_handlers[day] = day_file_handler
            # self.header is not assigned in this class; presumably the base
            # reader stores it while parsing the input file -- TODO confirm.
            day_file_handler.write(self.header)
        line = InputFileReader.format_line(user_id, day, d1_category, d2_category, d3_category)
        day_file_handler.write(line)

Test file splitting

In [6]:
# arrange
test_split_filepath = "./tests_data/unittest.train.csv"
test_split_catalog = "./tests_data/unittest.train.split/"

# Fixture: a header line followed by twelve data rows whose `date` column
# takes the values 1, 3, 4 and 5.
with open(test_split_filepath, "w") as test_file:
    test_file.writelines((
        "id3,user_id,id2,date,id1\n",
        "111,1,11,1,1\n",
        "112,2,11,1,1\n",
        "121,3,12,3,1\n",
        "122,4,12,5,1\n",
        "211,1,21,3,2\n",
        "212,2,21,4,2\n",
        "221,3,22,3,2\n",
        "222,4,22,1,2\n",
        "311,1,31,3,3\n",
        "312,2,31,4,3\n",
        "321,3,32,3,3\n",
        "322,4,32,1,3\n",
    ))

# Number of fixture rows per `date` value.
expected_daily_views = {"1": 4, "3": 5, "4": 2, "5": 1}

class SplitterTest(InputFileReader):
    """Reader that checks a single per-day file: every row must carry the
    expected day, and the rows are counted."""

    def __init__(self, day):
        """day -- the only day value the file under test is allowed to contain."""
        self.day = day
        # number of data rows seen so far
        self.views = 0

    def handle_data_row(self, user_id, day, d1_category, d2_category, d3_category):
        """Fail if the row belongs to another day, otherwise count it."""
        # Message fixed: the original lacked a space and misspelled "different".
        assert day == self.day, "day " + day + " is different from " + self.day
        self.views += 1

# One checker per expected day; each re-reads its own split file further down.
testers = [SplitterTest(day) for day in expected_daily_views]

split_reader = SplitFileReader(test_split_catalog)

# act
try:
    split_reader.read_input_file(test_split_filepath)
finally:
    # Close the per-day files even when reading fails, so a failed run does
    # not leave open handles behind in the long-lived kernel.
    split_reader.close()

# assert
import os.path

for tester in testers:
    filename = SplitFileReader.format_file_name(tester.day)
    filepath = test_split_catalog + filename
    assert os.path.isfile(filepath), "file not found: " + filepath
    tester.read_input_file(filepath)
    assert tester.header == split_reader.header, "wrong file header for day " + tester.day
    assert tester.views == expected_daily_views[tester.day], "wrong amount of views on day " + tester.day

print ("Test splitting - PASSED")

Test splitting - PASSED


Split real input file

In [3]:
# Location of the full training CSV that gets split into per-day files.
train_filepath = "./train.csv"
from classeslib import persistence_files

In [9]:
split_reader = SplitFileReader(persistence_files.days_split_catalog)

# Split the real training file into per-day files.
try:
    split_reader.read_input_file(train_filepath)
finally:
    # Always close the per-day files, even if the read is interrupted, so no
    # handles stay open in the kernel.
    split_reader.close()

### Gather statistics about users' views and popular categories

Calculate the statistics and save them to a file db, dumping periodically to limit memory usage (pandas is still terra incognita, unfortunately).

In [2]:
from classeslib.statistics import StatisticsDumper, StatisticsCounter
from classeslib import train_calendar
from classeslib import persistence_files

In [3]:
# Dumper for the public-train statistics db file; handed to every per-day counter.
train_dumper = StatisticsDumper(persistence_files.public_train_statistics_db_file)

Gather statistics from days

In [None]:
# Run a statistics counter over every per-day file of the public train period.
for day in train_calendar.public_train_days:
    day_file_path = persistence_files.days_split_catalog + SplitFileReader.format_file_name(day)
    # NOTE(review): 100000 is presumably the number of rows between periodic
    # dumps -- TODO confirm against StatisticsCounter.
    counter = StatisticsCounter(100000, day_file_path, train_dumper)
    # Single-argument call form prints the same text on Python 2 and 3; the
    # two spaces reproduce the output of the former print statement.
    print("handling file  %s" % day_file_path)
    counter.calculate_statistics()

In [8]:
# Reload the three per-level statistics objects from the db file.
d1_level_statistics, d2_level_statistics, d3_level_statistics = train_dumper.restore_statistics(
    persistence_files.public_train_statistics_db_file
)

In [9]:
# Size of the d3-level categories statistics; the call form of print works on
# both Python 2 and 3 and prints the same text.
print(len(d3_level_statistics.categories_statistics))

924
