In [105]:
import json
import pandas as pd
import pickle
from pprint import pprint
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import re
import gensim
from skimage import io
import matplotlib.pyplot as plt
from gensim.test.utils import get_tmpfile

In [106]:
import nltk
import sklearn

print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The nltk version is 3.4.
The scikit-learn version is 0.20.3.


# Load Data

In [107]:
# Load description features
courses_filtered = []
with open('courses.json') as json_file:
    data = json.load(json_file)
    for course in data:
        course_field = course['fields']
        course_field['description'] = course_field['description'].replace('\n', '')
        courses_filtered.append(course_field)

courses_df = pd.DataFrame(courses_filtered)

In [108]:
# Transforming the schema
courses_df = courses_df.rename(columns={'number':'id'})
courses_df.head(10)

Unnamed: 0,course_quality,description,difficulty,name,id
0,2.5,This seminar is designed to introduce students...,3.33,Problems in Greek History: Archaic and Classical,ANCH-535
1,2.8,Topics Varies.,2.6,TOPICS: GREEK/ROMAN ART: BORDERLINES: ROMAN PR...,ARTH-525
2,3.0,Topic varies.,3.25,"TOPICS:ART OF AL-ANDALUS: TEXTILES: DESIGN, TR...",ARTH-538
3,2.3,Digital methods allow archaeologists to approa...,2.8,INT DIGITAL ARCHAEOLOGY,CLST-362
4,3.5,This course is intended to familiarize new gra...,2.4,Materials & Methods in Mediterranean Archaeology,CLST-526
5,3.0,This course exposes students to digital method...,2.75,DIGITAL EXPLORTN OF PAST,NELC-320
6,2.74,This course is an introduction to the basic co...,3.28,ACCT & FINANCIAL REPORT,ACCT-101
7,1.32,The first part of the course presents alternat...,3.17,STRATEGIC COST ANALYSIS,ACCT-102
8,2.45,This course builds on the knowledge you obtain...,3.14,FIN MEASUREMENT & DISCLO,ACCT-212
9,2.82,"In the course, students learn how to analyze f...",2.81,ACCT & BUSINESS ANALYSIS,ACCT-242


# Clean Data

In [109]:
# Functions to clean data to simplify model
# Source: https://github.com/dzungpng/niche_perfume_chatbot/blob/master/train_model.ipynb
def stem_words(text):
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

In [110]:
courses_df['description'][10]

'The objective of this course is to provide an understanding of financial accounting fundamentals for prospective consumers of corporate financial information, such as managers, stockholders, financial analysts, and creditors. The course focuses on understanding how economic events like corporate investments, financing transactions and operating activities are recorded in the three main financial statements (i.e., the income statement, balance sheet, and statement of cash flows). Along the way, students will develop the technical skills needed to analyze corporate financial statements and disclosures for use in financial analysis, and to interpret how accounting standards and managerial incentives affect the financial reporting process. This course is recommended for students who want a more in-depth overview of the financial accounting required for understanding firm performance and potential future risks through analysis of reported financial information, such as students intending t

In [111]:
courses_df['description'] = courses_df.description.apply(func=make_lower_case)

In [112]:
courses_df['description'][10]

'the objective of this course is to provide an understanding of financial accounting fundamentals for prospective consumers of corporate financial information, such as managers, stockholders, financial analysts, and creditors. the course focuses on understanding how economic events like corporate investments, financing transactions and operating activities are recorded in the three main financial statements (i.e., the income statement, balance sheet, and statement of cash flows). along the way, students will develop the technical skills needed to analyze corporate financial statements and disclosures for use in financial analysis, and to interpret how accounting standards and managerial incentives affect the financial reporting process. this course is recommended for students who want a more in-depth overview of the financial accounting required for understanding firm performance and potential future risks through analysis of reported financial information, such as students intending t

In [113]:
courses_df['description'] = courses_df.description.apply(func=remove_stop_words)

In [114]:
courses_df['description'][10]

'objective course provide understanding financial accounting fundamentals prospective consumers corporate financial information, managers, stockholders, financial analysts, creditors. course focuses understanding economic events like corporate investments, financing transactions operating activities recorded three main financial statements (i.e., income statement, balance sheet, statement cash flows). along way, students develop technical skills needed analyze corporate financial statements disclosures use financial analysis, interpret accounting standards managerial incentives affect financial reporting process. course recommended students want in-depth overview financial accounting required understanding firm performance potential future risks analysis reported financial information, students intending go security analysis investment banking.'

In [115]:
courses_df['description'] = courses_df.description.apply(func=remove_punctuation)

In [116]:
courses_df['description'][10]

'objective course provide understanding financial accounting fundamentals prospective consumers corporate financial information managers stockholders financial analysts creditors course focuses understanding economic events like corporate investments financing transactions operating activities recorded three main financial statements i e income statement balance sheet statement cash flows along way students develop technical skills needed analyze corporate financial statements disclosures use financial analysis interpret accounting standards managerial incentives affect financial reporting process course recommended students want in depth overview financial accounting required understanding firm performance potential future risks analysis reported financial information students intending go security analysis investment banking'

In [117]:
courses_df['description'] = courses_df.description.apply(func=stem_words)

In [118]:
courses_df['description'][10]

'object cours provid understand financi account fundament prospect consum corpor financi inform manag stockhold financi analyst creditor cours focus understand econom event like corpor invest financ transact oper activ record three main financi statement i e incom statement balanc sheet statement cash flow along way student develop technic skill need analyz corpor financi statement disclosur use financi analysi interpret account standard manageri incent affect financi report process cours recommend student want in depth overview financi account requir understand firm perform potenti futur risk analysi report financi inform student intend go secur analysi invest bank'

In [119]:
courses_df['name'] = courses_df.name.apply(func=make_lower_case)
courses_df['name'] = courses_df.name.apply(func=remove_stop_words)
courses_df['name'] = courses_df.name.apply(func=remove_punctuation)
courses_df['name'] = courses_df.name.apply(func=stem_words)

In [120]:
courses_df.head(10)

Unnamed: 0,course_quality,description,difficulty,name,id
0,2.5,seminar design introduc student major issu pro...,3.33,problem greek histori archaic classic,ANCH-535
1,2.8,topic vari,2.6,topic greek roman art borderlin roman provinci...,ARTH-525
2,3.0,topic vari,3.25,topic art al andalus textil design trade mean,ARTH-538
3,2.3,digit method allow archaeologist approach rese...,2.8,int digit archaeolog,CLST-362
4,3.5,cours intend familiar new graduat student coll...,2.4,materi method mediterranean archaeolog,CLST-526
5,3.0,cours expos student digit method investig past...,2.75,digit explortn past,NELC-320
6,2.74,cours introduct basic concept standard under f...,3.28,acct financi report,ACCT-101
7,1.32,first part cours present altern method prepar ...,3.17,strateg cost analysi,ACCT-102
8,2.45,cours build knowledg obtain introductori finan...,3.14,fin measur disclo,ACCT-212
9,2.82,cours student learn analyz firm financi statem...,2.81,acct busi analysi,ACCT-242


# TF-IDF Model

In [121]:
courses_df['full_text'] = courses_df['name'] + ' ' + courses_df['description']

In [133]:
#Fit TFIDF 
# Convert raw text describing courses to a matrix of TF-IDF features.
tf = TfidfVectorizer(analyzer='word', 
                     min_df=2,
                     ngram_range=(1, 2),
                     stop_words='english',
                     max_features=10000,)
tf.fit(courses_df['full_text'])

#Transform full_text to document-term matrix.
tfidf_matrix = tf.transform(courses_df['full_text'])
pickle.dump(tf, open("models/tfidf_model.pkl", "wb"))

print(tfidf_matrix.shape)

(1350, 10000)


In [135]:
# Compress with SVD
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500)
latent_matrix = svd.fit_transform(tfidf_matrix)
pickle.dump(svd, open("models/svd_model.pkl", "wb"))

print(latent_matrix.shape)

(1350, 500)


In [146]:
# Source: https://en.wikipedia.org/wiki/Latent_semantic_analysis
# LSA
n = 25 #pick components
#Use elbow and cumulative plot to pick number of components. 
#Need high ammount of variance explained. 
doc_labels = courses_df.id
svd_feature_matrix = pd.DataFrame(latent_matrix[:,0:n] ,index=doc_labels)
print(svd_feature_matrix.shape)
pickle.dump(svd_feature_matrix, open("models/lsa_embeddings.pkl", "wb"))

(1350, 25)


In [147]:
svd_feature_matrix.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANCH-535,0.17649,-0.005958,0.049331,0.014055,-0.10569,-0.091895,-0.065494,0.019608,-0.020193,0.044704,...,-0.008524,-0.099411,-0.010769,0.006509,-0.100045,-0.091037,0.05879,0.036166,0.028401,-0.003187
ARTH-525,0.075565,-0.000674,0.195844,0.033918,0.009376,-0.047781,-0.083609,-0.021715,0.039739,0.24698,...,0.018817,-0.128903,-0.03057,0.096258,-0.144751,0.038616,0.00901,0.248528,-0.005438,0.047772
ARTH-538,0.144649,0.044863,0.26684,0.051481,0.087524,-0.071968,-0.051486,-0.040071,0.010027,0.362457,...,-0.07372,-0.077759,-0.07582,0.086629,-0.099764,0.149118,-0.057479,0.322557,0.071267,0.070394
CLST-362,0.214006,-0.044021,-0.050112,-0.10518,0.114911,-0.02059,-0.044175,0.251156,-0.015373,-0.029016,...,0.082913,0.018901,-0.116201,0.06547,-0.006971,-0.091077,0.023466,0.010272,0.037646,-0.043956
CLST-526,0.186969,-0.035699,0.003199,-0.069234,0.001375,0.008379,-0.053332,0.103371,-0.067626,0.021249,...,0.044595,-0.053396,-0.013292,-0.006504,-0.02669,-0.081879,0.084164,-0.063896,0.033069,-0.030238


In [141]:
 !ls -lh models/svd_model.pkl

-rw-r--r--  1 dzungnguyen  staff    38M Jul 28 11:15 models/svd_model.pkl


In [142]:
 !ls -lh models/tfidf_model.pkl

-rw-r--r--  1 dzungnguyen  staff   1.8M Jul 28 11:13 models/tfidf_model.pkl


In [148]:
 !ls -lh models/lsa_embeddings.pkl

-rw-r--r--  1 dzungnguyen  staff   287K Jul 28 11:28 models/lsa_embeddings.pkl


# Doc2Vec Model

In [None]:
#Use reviews, descriptions, and notes for vocabulary 
reviews = courses_df.reviews.values.tolist()
descriptions = df.description.values.tolist()
#notes = df.notes.values.tolist() #not using notes because sematics and order of list is not meaningfull. 

documents = []
for i in range(len(df)):
    mystr = reviews[i]
    mystr = mystr + descriptions[i]
    documents.append(re.sub("[^\w]", " ",  mystr).split())

In [None]:
print len(df)
print len(documents)