In [1]:
from rank_bm25 import BM25Okapi
import json
import pandas as pd
from IPython.display import HTML
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import pickle


def process_all_descriptions(data):
    """Normalize the ``title_des`` column of *data* for BM25 indexing.

    Each description is tokenized with NLTK, lowercased, stripped of English
    stopwords, punctuation characters, all-digit tokens and empty tokens,
    then lemmatized (WordNet) and stemmed (Porter).  The surviving tokens are
    joined back into one space-separated string.

    Args:
        data: DataFrame with a string column named ``title_des``.

    Returns:
        The same DataFrame object; its ``title_des`` column is overwritten
        in place with the normalized strings.
    """
    # A set makes the per-token stopword test O(1); the NLTK list would be
    # scanned linearly for every token in the corpus.
    stop = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    porter_stemmer = PorterStemmer()
    # Translation table that deletes every ASCII punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def normalize(text):
        # tokenize, then lowercase
        tokens = [tok.lower() for tok in word_tokenize(text)]
        # remove stop words (before punctuation stripping, so entries such
        # as "don't" in the stopword list can still match)
        tokens = [tok for tok in tokens if tok not in stop]
        # remove punctuation characters from inside each token
        tokens = [tok.translate(strip_punctuation) for tok in tokens]
        # remove all-digit tokens and tokens emptied by the previous step
        tokens = [tok for tok in tokens if tok and not tok.isdigit()]
        # word lemmatization followed by word stemming
        return ' '.join(
            porter_stemmer.stem(wordnet_lemmatizer.lemmatize(tok))
            for tok in tokens
        )

    # Column-wise apply: one pass per document instead of eight passes over
    # the column, and no row-wise DataFrame.apply.
    data['title_des'] = data['title_des'].apply(normalize).astype(str)
    return data

# Load the raw descriptions; JSON text is UTF-8, so say so explicitly rather
# than depending on the platform's default encoding.
with open('../courseId_description.json', 'r', encoding='utf-8') as f:
    description = json.load(f)

# One row per JSON entry, in the file's key order; BM25 document index i is
# therefore the i-th entry of the JSON object.
data = pd.DataFrame(description.items(), columns=['id', 'title_des'])
data = process_all_descriptions(data)

# split() without an argument yields [] for a description that normalized to
# the empty string, whereas split(" ") would index a spurious '' term.
tokenized_corpus = [doc.split() for doc in data['title_des'].tolist()]
bm25 = BM25Okapi(tokenized_corpus)


def process_query(query):
    """Turn a free-text search query into a list of normalized tokens.

    The query is lowercased, tokenized with NLTK, stripped of English
    stopwords, punctuation characters, all-digit tokens and empty tokens,
    then lemmatized (WordNet) and stemmed (Porter) -- the same steps that
    process_all_descriptions applies to the corpus.

    Args:
        query: The raw query string.

    Returns:
        A list of token strings (possibly empty).
    """
    tokens = word_tokenize(query.lower())

    # Set membership is O(1); the NLTK stopword list is a plain list.
    stop = set(stopwords.words('english'))
    tokens = [tok for tok in tokens if tok not in stop]

    # Delete punctuation characters from inside each token.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = [tok.translate(strip_punctuation) for tok in tokens]

    # Drop all-digit tokens and tokens that were pure punctuation (now '');
    # without the emptiness check a query like "math, data" would send ''
    # to BM25 as a search term.
    tokens = [tok for tok in tokens if tok and not tok.isdigit()]

    wordnet_lemmatizer = WordNetLemmatizer()
    porter_stemmer = PorterStemmer()
    return [porter_stemmer.stem(wordnet_lemmatizer.lemmatize(tok)) for tok in tokens]


[nltk_data] Downloading package punkt to /home/jenny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jenny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jenny/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Interactive search: read keywords, score every course with BM25 and show
# the 20 best matches as an HTML table.
print('Keywords for Course Search:')
query = input()
query = process_query(query)
doc_scores = bm25.get_scores(query)
# argsort is ascending, so reverse it to get best-scoring documents first.
ranking = doc_scores.argsort().tolist()[::-1]

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# this file if it comes from a trusted source.
with open('../course_id.pkl', 'rb') as f:
    course_dict = pickle.load(f)
course_id = course_dict['course_id']
id_course = course_dict['id_course']

with open('../courseId_description.json', 'r', encoding='utf-8') as f:
    course_des = json.load(f)

# Build the table in one go: DataFrame.append copied the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
# Assumes the JSON keys are the stringified document positions used by the
# BM25 index -- TODO confirm against how the JSON file is produced.
returned_courses = pd.DataFrame(
    [
        {'Subject & Number': id_course[i], 'Title & Description': course_des[str(i)]}
        for i in ranking[:20]
    ],
    columns=['Subject & Number', 'Title & Description'],
)
HTML(returned_courses.to_html())

Keywords for Course Search:
a data science course which has low requirement for math


Unnamed: 0,Subject & Number,Title & Description
0,Mathematics N10B,"Methods of Mathematics: Calculus, Statistics, and Combinatorics: The sequence Math 10A, Math 10B is intended for majors in the life sciences. Elementary combinatorics and discrete and continuous probability theory. Representation of data, statistical models and testing. Sequences and applications of linear algebra."
1,Mathematics 10B,"Methods of Mathematics: Calculus, Statistics, and Combinatorics: The sequence Math 10A, Math 10B is intended for majors in the life sciences. Elementary combinatorics and discrete and continuous probability theory. Representation of data, statistical models and testing. Sequences and applications of linear algebra."
2,Engineering 178,"Statistics and Data Science for Engineers: This course provides a foundation in data science with emphasis on the application of statistics and machine learning to engineering problems. The course combines theoretical topics in probability and statistical inference with practical methods for solving problems in code. Each topic is demonstrated with examples from engineering. These include hypothesis testing, principal component analysis, clustering, linear regression, time series analysis, classification, and deep learning. Math 53 and 54 are recommended before Engin 178, Math 53 and 54 are allowed concurrently."
3,Industrial Eng & Ops Rsch 235,"Data, Systems and Signals: This is an advanced project course in data science that offers a ""maker"" and/or ""innovation"" viewpoint. The course is focused first on developing an open-ended-real world project relating to data science. Related concepts of computer science tools and theoretical concepts are covered to support the project. These concepts include filtering, prediction, classification, LTI systems, and spectral analysis. After reviewing each concept, we explore implementing it in Python using libraries for math array functions, manipulation of tables, data architectures, natural language, and ML frameworks."
4,Industrial Eng & Ops Rsch 135,"Applied Data Science with Venture Applications: This highly-applied course surveys a variety of key of concepts and tools that are useful for designing and building applications that process data signals of information. The course introduces modern open source, computer programming tools, libraries, and code samples that can be used to implement data applications. The mathematical concepts highlighted in this course include filtering, prediction, classification, decision-making, Markov chains, LTI systems, spectral analysis, and frameworks for learning from data. Each math concept is linked to implementation using Python using libraries for math array functions (NumPy), manipulation of tables (Pandas), long term storage (SQL, JSON, CSV files), natural language (NLTK), and ML frameworks."
5,UGIS-UG Interdisc Studies 304,Supervised Teaching in Mathematics and Science for Secondary Schools: Fieldwork for Cal Teach single subject math or science teaching credential.
6,Optometry 251,"Low Vision: Epidemiology and etiology of low vision. Optical principles of low vision aids. Optometric examination and treatment of the low vision patient. Interdisciplinary rehabilitation resources, counseling, and referral."
7,Optometry 151,"Low Vision: Epidemiology and etiology of low vision. Optical principles of low vision aids. Optometric examination and treatment of the low vision patient. Interdisciplinary rehabilitation resources, counseling, and referral."
8,Optometry 158A,"Low Vision: Optical principles of low vision aids. Epidemiology, etiology, signs, and symptoms of low vision. Optometric examination and treatment of the low vision patient, interdisciplinary rehabilitation resources, counseling, and referral."
9,Mathematics 10A,"Methods of Mathematics: Calculus, Statistics, and Combinatorics: The sequence Math 10A, Math 10B is intended for majors in the life sciences. Introduction to differential and integral calculus of functions of one variable, ordinary differential equations, and matrix algebra and systems of linear equations."
