In [1]:
import numpy as np
import pandas as pd
import os
import glob
# import nltk
# nltk.download('punkt')


In [2]:
class Course:
    """A single course: a title plus a free-text description.

    ``from_file`` expects a text file whose first line is
    ``TitreCours: <title>`` and whose second line is
    ``DescriptionCours: <description>``. Any further lines are ignored.
    """

    # Labels that prefix the first and second line of a course file.
    TITLE_KEY = 'TitreCours'
    DESCRIPTION_KEY = 'DescriptionCours'

    @staticmethod
    def _field_value(line, key):
        """Return ``line`` without its leading ``'<key>: '`` label and line ending."""
        label = f'{key}: '
        # readlines() keeps the terminator; drop it so the value is clean text.
        value = line.rstrip('\r\n')
        # Strip the label only where it is a prefix: str.replace would also
        # delete any later occurrence inside the text itself.
        if value.startswith(label):
            value = value[len(label):]
        return value

    @staticmethod
    def from_file(file_name, encoding=None):
        """Build a Course from the first two lines of ``file_name``.

        Args:
            file_name: Path of the course text file.
            encoding: Text encoding passed to ``open``. ``None`` (the default)
                keeps the platform's default encoding.

        Returns:
            A ``Course`` whose title comes from line 1 and whose description
            comes from line 2, both without label and trailing newline.

        Raises:
            OSError: The file cannot be opened.
            UnicodeDecodeError: The file cannot be decoded.
            IndexError: The file has fewer than two lines.
        """
        with open(file_name, 'r', encoding=encoding) as f:
            lines = f.readlines()

        return Course(
            Course._field_value(lines[0], Course.TITLE_KEY),
            Course._field_value(lines[1], Course.DESCRIPTION_KEY),
        )

    def __init__(self, title, description):
        self.title = title
        self.description = description

    def __str__(self):
        """Render the course in the same two-line labelled layout used on disk."""
        return f'{Course.TITLE_KEY}: {self.title}\n{Course.DESCRIPTION_KEY}: {self.description}'

class University:
    """Lazily loaded collection of the courses stored in one directory.

    ``name`` doubles as the directory path: every ``*.txt`` file directly
    inside it is parsed with ``Course.from_file``.
    """

    def __init__(self, name):
        self.name = name
        # Filled on first access by get_courses().
        self.__courses = []

    def get_courses(self):
        """Return the list of courses, loading them from disk if none are cached.

        Files that cannot be read or parsed are skipped. While the cached list
        is empty, every call scans the directory again.

        Returns:
            The internal list of loaded courses (not a copy).
        """
        if not self.__courses:
            # glob returns paths in arbitrary order; sort so the load order
            # is reproducible across runs and platforms.
            file_names = sorted(glob.glob(os.path.join(self.name, '*.txt')))

            for file_name in file_names:
                try:
                    self.__courses.append(Course.from_file(file_name))
                except (OSError, ValueError, IndexError):
                    # Best effort: skip unreadable, undecodable
                    # (UnicodeDecodeError is a ValueError) or too-short files,
                    # but let KeyboardInterrupt / SystemExit propagate.
                    continue

        return self.__courses

    def __iter__(self):
        """Iterate over the courses, loading them first if needed."""
        yield from self.get_courses()

    def shuffle(self):
        """Shuffle the loaded courses in place using NumPy's global RNG."""
        self.get_courses()
        np.random.shuffle(self.__courses)

    def __len__(self):
        """Return the number of loaded courses."""
        return len(self.get_courses())

    def __str__(self):
        """Concatenate ``str(course)`` for every course, each followed by a newline."""
        return ''.join(f'{course}\n' for course in self.get_courses())


In [3]:
# Each name is also the directory that University globs for '*.txt' course files.
UNIVERSITY_NAMES = ['HEC', 'Poly', 'UdM', 'UQAM']
# Fixed seed so the shuffle below is repeatable for a given input order.
RANDOM_SEED = 0
universities = [University(name) for name in UNIVERSITY_NAMES]

# Gather every course description across all universities.
descriptions = []
for university in universities:
    descriptions.extend(course.description for course in university)

# Use a private, seeded RandomState rather than the unseeded global RNG, so
# re-running the notebook does not produce a different corpus order each time.
np.random.RandomState(RANDOM_SEED).shuffle(descriptions)
corpus = ' '.join(descriptions)


In [4]:
from nltk.tokenize import word_tokenize

In [9]:
# Tokenize the whole corpus string with NLTK's word_tokenize.
# NOTE(review): word_tokenize defaults to language='english'; the course
# file labels suggest French text - confirm whether language='french' is intended.
tokens = word_tokenize(corpus)

In [10]:
# Write the tokens to the file 'words', one token per line (no trailing newline).
# Plain 'w' is enough: the file is only written, never read back here.
# NOTE(review): uses the platform default encoding - confirm that is what
# downstream readers of 'words' expect.
with open('words', 'w') as f:
    f.write('\n'.join(tokens))