Source: https://github.com/ashishrana160796/Online-Course-Recommendation-System

In [3]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import TruncatedSVD
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import Normalizer
import pickle

# Train - clustering

In [4]:
# 2. Ingest data from source & Preprocessing that needs to be carried out.

# read data, from source
course_df = pd.read_csv("data/courses.csv")


In [5]:
# drop rows with NaN values for any column, specifically 'Description'
course_df = course_df.dropna(how='any')


In [6]:
# Preprocess description for models
# Remove stopwards, numbers(carries information about courses don't remove them.)
# Remove 'll' ASAP. Anything ending with " 'll " has to be replaced.
# Try removing extra text, keep important phrases & nouns

# Pre-preprocessing step: remove words like we'll, you'll, they'll etc.
course_df['Description'] = course_df['Description'].replace({"'ll": " "}, regex=True)
course_df['CourseId'] = course_df['CourseId'].replace({"-": " "}, regex=True)

# Combine three columns namely: CourseId, CourseTitle, Description
# As all of them reveal some information about the course
comb_frame = course_df.CourseId.str.cat(" "+course_df.CourseTitle.str.cat(" "+course_df.Description))

# remove all characters except numbers and alphabets
comb_frame = comb_frame.replace({"[^A-Za-z0-9 ]+": ""}, regex=True)

In [7]:
course_df.head()

Unnamed: 0,CourseId,CourseTitle,DurationInSeconds,ReleaseDate,Description,AssessmentStatus,IsCourseRetired
0,abts advanced topics,BizTalk 2006 Business Process Management,22198,2008-10-25,This course covers Business Process Management...,Live,no
1,abts fundamentals,BizTalk 2006 Fundamentals,24305,2008-06-01,Despite the trend towards service-oriented arc...,Live,no
2,agile team practice fundamentals,Agile Team Practices with Scrum,13504,2010-04-15,This course is much different than most of the...,Live,no
4,aspdotnet advanced topics,ASP.NET 3.5 Advanced Topics,21611,2008-12-05,This course covers more advanced topics in ASP...,Live,no
5,aspdotnet ajax advanced topics,ASP.NET Ajax Advanced Topics,10426,2008-09-30,This course covers advanced topics in ASP.NET ...,Live,no


In [8]:
# 3. Train model with Latent semantic analysis with SVD & k-means

# Create word vectors from combined frames 
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(comb_frame)

In [9]:
# true_k, derived from elbow metho and confirmed from pluralsight's website
# true_k = 8
# Instead, k=30 with elbow is picked which is producing lower error
true_k = 30

# usig SVD for LSA
# svd = TruncatedSVD(true_k)
# lsa = make_pipeline(svd, Normalizer(copy=False))
# X = lsa.fit_transform(X)

# Running model with 15 different centroid initializations & maximum iterations are 500
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=15)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
       n_clusters=30, n_init=15, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [10]:
# 5. Preview clusters, test your model for demo and save your model for further use.

# Preview top 15 words in each cluster, and accordingly different clusters can be assigned 
# a given categories out of 8 categories on pluralsight's website.

# Create a hashmap, mapping each cluster to a given category.

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind]),

Top terms per cluster:
Cluster 0:
 course
 learn
 fundamentals
 use
 code
 using
 applications
 new
 programming
 application
 create
 java
 business
 language
 introduction
Cluster 1:
 effects
 cc
 footage
 create
 3d
 animation
 learn
 compositing
 motion
 shot
 animating
 layers
 camera
 creating
 software
Cluster 2:
 unity
 game
 character
 create
 games
 fundamentals
 learn
 player
 creating
 2d
 development
 substance
 course
 required
 software
Cluster 3:
 photoshop
 cc
 concept
 painting
 character
 create
 color
 techniques
 design
 image
 required
 tutorial
 software
 creating
 learn
Cluster 4:
 illustrator
 cc
 indesign
 adobe
 premiere
 pro
 learn
 logo
 design
 creating
 color
 required
 create
 software
 vector
Cluster 5:
 tricks
 tips
 centering
 selfcontained
 lesson
 improve
 video
 workflows
 various
 softimage
 help
 maya
 course
 modeling
 rendering
Cluster 6:
 exchange
 server
 2013
 2010
 2007
 2016
 course
 solutions
 mailboxes
 lync
 2003
 administrators
 instal

In [11]:
# For testing which cluster the following course is having, manipulate string as 'CourseId(with string replacement from '-' to a blankspace)+" "+CourseTitle+" "+Description'

# Y = vectorizer.transform(["aspdotnet data ASP.NET 3.5 Working With Data ASP.NET has established itself as one of the most productive environments for building web applications and more developers are switching over every day. The 2.0 release of ASP.NET builds on the same componentry of 1.1, improving productivity of developers even further by providing standard implementations of common Web application features like membership, persistent user profile, and Web parts, among others. The 3.5 release adds several new controls including the flexible ListView and the LinqDataSource, as well as integrated suport for ASP.NET Ajax. This course will cover the data access, caching, and state management features of ASP.NET."])
# prediction = model.predict(Y)
# print(prediction)     # A cluster category will be given as an output.


# Save machine learning model
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# Recommend

In [17]:
def cluster_predict(str_input):
    Y = vectorizer.transform(list(str_input))
    prediction = model.predict(Y)
    return prediction


def recommend_util(str_input):
    
    # Predict category of input string category
    temp_df = course_df.loc[course_df['CourseId'] == str_input]
    temp_df['InputString'] = temp_df.CourseId.str.cat(" "+temp_df.CourseTitle.str.cat(" "+temp_df['Description']))
    str_input = list(temp_df['InputString'])
    
    prediction_inp = cluster_predict(str_input)
    prediction_inp = int(prediction_inp)
    
    temp_df = course_df.loc[course_df['ClusterPrediction'] == prediction_inp]
    temp_df = temp_df.sample(10)
    
    return list(temp_df['CourseId'])

In [12]:
# 1. load model and previous preprocessing.

# load model only once
with open('finalized_model.sav', 'rb') as fid:
    model = pickle.load(fid)

In [13]:
# X = vectorizer.fit_transform(course_df['InputString'])
# This will give an error as incorrect number of features, i.e. if features from a different data-frame is used
# seperate code snippet for building vocabulary for trained model
courses_df = pd.read_csv("data/courses.csv")
courses_df = courses_df.dropna(how='any')
courses_df['Description'] = courses_df['Description'].replace({"'ll": " "}, regex=True)
courses_df['CourseId'] = courses_df['CourseId'].replace({"-": " "}, regex=True)
comb_frame = courses_df.CourseId.str.cat(" "+courses_df.CourseTitle.str.cat(" "+courses_df.Description))
comb_frame = comb_frame.replace({"[^A-Za-z0-9 ]+": ""}, regex=True)

In [14]:
# Add clustering labels to every non-retired course
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(comb_frame)

In [15]:
# 2. Current utility variable and data frame preprocessing

# Verbose only, Not getting used in the code: creating labels for clusters manually
label_dict = {
                 0: "Gaming Professionals",
                 1: "Manufacturing & Design",
                 2: "Software Development",
                 3: "Data Professionals",
                 4: "Information & Cyber Security",
                 5: "Movie Making & Animation",
                 6: "IT Ops",
                 7: "Graphic Design"
            }
    
# load the complete data in a dataframe
course_df = pd.read_csv("data/courses.csv")
# drop retired course from analysis. But, courses with no descriptions are kept.
course_df = course_df[course_df.IsCourseRetired == 'no']
    
# create new column in dataframe which is combination of (CourseId, CourseTitle, Description) in existing data-frame
course_df['InputString'] = course_df.CourseId.str.cat(" "+course_df.CourseTitle.str.cat(" "+course_df.Description))

course_df['ClusterPrediction'] = ""

In [16]:
course_df.head()

Unnamed: 0,CourseId,CourseTitle,DurationInSeconds,ReleaseDate,Description,AssessmentStatus,IsCourseRetired,InputString,ClusterPrediction
0,abts-advanced-topics,BizTalk 2006 Business Process Management,22198,2008-10-25,This course covers Business Process Management...,Live,no,abts-advanced-topics BizTalk 2006 Business Pro...,
1,abts-fundamentals,BizTalk 2006 Fundamentals,24305,2008-06-01,Despite the trend towards service-oriented arc...,Live,no,abts-fundamentals BizTalk 2006 Fundamentals De...,
2,agile-team-practice-fundamentals,Agile Team Practices with Scrum,13504,2010-04-15,This course is much different than most of the...,Live,no,agile-team-practice-fundamentals Agile Team Pr...,
4,aspdotnet-advanced-topics,ASP.NET 3.5 Advanced Topics,21611,2008-12-05,This course covers more advanced topics in ASP...,Live,no,aspdotnet-advanced-topics ASP.NET 3.5 Advanced...,
5,aspdotnet-ajax-advanced-topics,ASP.NET Ajax Advanced Topics,10426,2008-09-30,This course covers advanced topics in ASP.NET ...,Live,no,aspdotnet-ajax-advanced-topics ASP.NET Ajax Ad...,


In [18]:
# Cluster category for each live course
course_df['ClusterPrediction']=course_df.apply(lambda x: cluster_predict(course_df['InputString']), axis=0)

In [23]:
course_df.head()

Unnamed: 0,CourseId,CourseTitle,DurationInSeconds,ReleaseDate,Description,AssessmentStatus,IsCourseRetired,InputString,ClusterPrediction
0,abts-advanced-topics,BizTalk 2006 Business Process Management,22198,2008-10-25,This course covers Business Process Management...,Live,no,abts-advanced-topics BizTalk 2006 Business Pro...,0
1,abts-fundamentals,BizTalk 2006 Fundamentals,24305,2008-06-01,Despite the trend towards service-oriented arc...,Live,no,abts-fundamentals BizTalk 2006 Fundamentals De...,0
2,agile-team-practice-fundamentals,Agile Team Practices with Scrum,13504,2010-04-15,This course is much different than most of the...,Live,no,agile-team-practice-fundamentals Agile Team Pr...,28
4,aspdotnet-advanced-topics,ASP.NET 3.5 Advanced Topics,21611,2008-12-05,This course covers more advanced topics in ASP...,Live,no,aspdotnet-advanced-topics ASP.NET 3.5 Advanced...,0
5,aspdotnet-ajax-advanced-topics,ASP.NET Ajax Advanced Topics,10426,2008-09-30,This course covers advanced topics in ASP.NET ...,Live,no,aspdotnet-ajax-advanced-topics ASP.NET Ajax Ad...,0


In [21]:
 queries = ['wp7-core', 'ef41-data-access', 'nosql-big-pic', 'procedural-ice-modeling-softimage-153', \
               'beginners-guide-shading-networks-softimage-510', 'centralized-logging-elastic-stack', \
               'apache-pig-data-transformations']

for query in queries:
    res = recommend_util(query)
    print(query, res)

wp7-core ['windows-server-2012-r2-implementing-essentials', 'windows-10-configuring-plan-desktop-device-deployment-creators', 'windows-10-configuring-manage-identity-creators', 'powershell-v4-new-features', 'windows8-hands-on', 'windows-10-foundations', 'windows-server-2012-70-411-configure-active-directory', 'windows-devices-apps-70-695-desktop-images', 'windows-internals2', 'windows-server-2012-70-411-network-policy-server']
ef41-data-access ['bokeh-building-interactive-visualizations', 'd3-big-picture', 'u-sql-azure-data-lake', 'tableau-10-whats-new', 'need-for-data-literacy-executive-briefing', 'oracle-developer-data-types-essentials', 'pandas-playbook-manipulating-data', 'enterprise-data-management', 'linq-architecture', 'rapidminer-getting-started']
nosql-big-pic ['ssis-basic', 'building-enterprise-distributed-online-analytics-platform', 'combining-painted-image-based-textures-mari-1481', 'whats-new-spring-5', 'an-arch-viz-scene-lwcad-lightwave-3d-2280', 'career-plan-building-man

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of