# Import important libraries

In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

# Load the data

In [2]:
# Load the course catalogue. The path is relative, so it resolves against the
# notebook's current working directory.
df = pd.read_csv("Data/zenrays.csv")

# Data preparation

In [3]:
# Preview the first rows to confirm the `Courses` and `Course_Contents` columns loaded.
df.head()

Unnamed: 0,Courses,Course_Contents
0,AI & ML Training,Why we require AI and ML? Problem with traditi...
1,Machine Learning,Introduction to Big Data and Machine Learning ...
2,Angular 2,Introduction to Single page application Develo...
3,ReactJS Training,Introduction to ES2015 (ES6) Webpack / Babel /...
4,Mean Stack,Introduction to Single page application Develo...


In [4]:
# Table dimensions as a (rows, columns) tuple.
df.shape

(23, 2)

In [5]:
# Lookup table mapping English contractions to their expanded forms.
# Keys are lower-case and use the straight ASCII apostrophe ('), so text has to
# be lower-cased before lookup and a typographic apostrophe will not match.
# Source list: http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [6]:
def clean_text(text, remove_stopwords=True, lemmatization=True, contraction_map=None):
    """Normalize a free-text description into a string of cleaned tokens.

    Processing order: lower-case, expand contractions, remove URLs and HTML
    remnants, replace punctuation and digits with spaces, drop single-letter
    words, then optionally remove English stop words and lemmatize.

    Parameters
    ----------
    text : str
        Raw text to clean.
    remove_stopwords : bool, default True
        Drop NLTK English stop words and bare punctuation tokens.
    lemmatization : bool, default True
        Lemmatize each token with NLTK's ``WordNetLemmatizer``.
    contraction_map : dict or None, default None
        Mapping from lower-case contraction to its expansion. When ``None``
        the module-level ``contractions`` table is used.

    Returns
    -------
    str
        The cleaned text. With both options enabled (or either one), tokens
        are separated by single spaces; with both disabled, runs of spaces
        left by the substitutions are not collapsed.
    """
    if contraction_map is None:
        contraction_map = contractions

    # Lower-case first: the contraction keys and the single-letter pattern
    # below are lower-case only.
    text = text.lower()

    # Expand contractions token by token. This also collapses every run of
    # whitespace (including newlines) into a single space.
    text = " ".join(contraction_map.get(word, word) for word in text.split())

    # Remove only the URL itself. Because the text no longer contains
    # newlines, a greedy `.*` here would delete everything that follows the
    # first URL, so the match stops at whitespace, quotes and angle brackets.
    text = re.sub(r'https?://[^\s"\'<>]+', ' ', text)

    # HTML remnants. These must run before the punctuation pass, which strips
    # '<', '/' and '>' and would otherwise leave a stray "br" token behind.
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', ' ', text)
    text = re.sub(r'<br\s*/?>', ' ', text)

    # Punctuation and ASCII digits -> space. The original class wrote this as
    # the range `\--;`, which spans '-', '.', '/', '0'-'9', ':' and ';'; the
    # members are spelled out here, the set of characters is unchanged.
    text = re.sub(r'[_"\-./0-9:;%()|+&=*,!?#$@\[\]<>]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # ASCII digits are already gone; `\d` also matches other Unicode decimal
    # digits, which these two passes remove.
    text = re.sub(r'\d+\S\d+\S\d+', ' ', text)
    text = re.sub(r'\d+', ' ', text)

    # Drop single-letter words.
    text = re.sub(r"\b[a-z]\b", " ", text)

    if remove_stopwords:
        stops = set(stopwords.words("english")) | set(string.punctuation)
        text = " ".join(w for w in text.split() if w not in stops)

    if lemmatization:
        # Built only when needed so the function has no NLTK dependency
        # when lemmatization is switched off.
        lemmatizer = WordNetLemmatizer()
        text = " ".join(lemmatizer.lemmatize(w) for w in word_tokenize(text))

    return text

In [7]:
# Run every course description through the cleaner, keeping the same order as
# the rows of `df` so the results can later be attached back as a column.
clean_course_content = [
    clean_text(content, remove_stopwords=True, lemmatization=True)
    for content in df.Course_Contents
]
print("Phrase cleaning complete.")

Phrase cleaning complete.


In [8]:
# Inspect the cleaned descriptions (a list with one string per course).
clean_course_content

['require ai ml problem traditional software system opportunity ai ml need excel logical mind tool software efficiently build ml model python tensorflow popular understand actually ml model handle data preprocessing data type ml model supervised unsupervised peek reinforcement learning break data training test cross validation technique understand linear regression gradient descent actual hand understand calculation behind gradient descent brush differentiation understand math behind hand code python learn improve model overfitting one difficult aspect learn building ml model use linear regression model understand overfitting learn hand avoid overfitting bias variance tradeoff regularization ridge lasso anova test logistic regression understand classification logistic regression maximum likelihood estimation build end end model logistic regression using scikit learn hand actually build model industry code interview data science competition probability based model decision tree model un

In [9]:
# Second pass over the cleaned text: delete any remaining one-letter word
# together with one adjacent space.
single_letter_word = re.compile('(\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)')
clean_course_content1 = [
    single_letter_word.sub('', document) for document in clean_course_content
]

In [10]:
# Inspect the descriptions after the single-letter pass.
clean_course_content1

['require ai ml problem traditional software system opportunity ai ml need excel logical mind tool software efficiently build ml model python tensorflow popular understand actually ml model handle data preprocessing data type ml model supervised unsupervised peek reinforcement learning break data training test cross validation technique understand linear regression gradient descent actual hand understand calculation behind gradient descent brush differentiation understand math behind hand code python learn improve model overfitting one difficult aspect learn building ml model use linear regression model understand overfitting learn hand avoid overfitting bias variance tradeoff regularization ridge lasso anova test logistic regression understand classification logistic regression maximum likelihood estimation build end end model logistic regression using scikit learn hand actually build model industry code interview data science competition probability based model decision tree model un

In [11]:
# Vocabulary of the NLTK `words` corpus, as a set for fast membership tests.
words = set(nltk.corpus.words.words())

# Keep a token when it is not purely alphabetic, or when its lower-cased form
# is in the vocabulary; every other token is dropped.
# NOTE(review): the output of this step suggests that technical terms absent
# from the corpus are discarded too — confirm that this is intended.
clean = [
    " ".join(
        token
        for token in nltk.wordpunct_tokenize(document)
        if not token.isalpha() or token.lower() in words
    )
    for document in clean_course_content1
]
    

In [12]:
# Inspect the descriptions after the dictionary filter.
clean

['require ai problem traditional system opportunity ai need excel logical mind tool efficiently build model python popular understand actually model handle data data type model unsupervised peek reinforcement learning break data training test cross validation technique understand linear regression gradient descent actual hand understand calculation behind gradient descent brush differentiation understand math behind hand code python learn improve model one difficult aspect learn building model use linear regression model understand learn hand avoid bias variance regularization ridge lasso test logistic regression understand classification logistic regression maximum likelihood estimation build end end model logistic regression learn hand actually build model industry code interview data science competition probability based model decision tree model understand concept entropy impurity information gain detailed hand project predict possible loan defaulter large national bank apply conce

In [13]:
# Attach the filtered text as a new column. This mutates `df` in place; `clean`
# was built by iterating `df.Course_Contents`, so it lines up with the rows.
df['key_words'] = clean

In [14]:
# Preview the table again to check the new `key_words` column.
df.head()

Unnamed: 0,Courses,Course_Contents,key_words
0,AI & ML Training,Why we require AI and ML? Problem with traditi...,require ai problem traditional system opportun...
1,Machine Learning,Introduction to Big Data and Machine Learning ...,introduction big data machine learning underst...
2,Angular 2,Introduction to Single page application Develo...,introduction single page application developme...
3,ReactJS Training,Introduction to ES2015 (ES6) Webpack / Babel /...,introduction project structure first react int...
4,Mean Stack,Introduction to Single page application Develo...,introduction single page application developme...


# TF-IDF vectorization

In [15]:
# count = CountVectorizer()
# Vectorize the keyword strings with TF-IDF weighting.
tfidf = TfidfVectorizer()
# NOTE: despite its name, `count_matrix` holds TF-IDF weights, not raw counts.
# It has one row per course, in the same order as the rows of `df`.
count_matrix = tfidf.fit_transform(df['key_words'])

In [16]:
# Matrix dimensions: (number of courses, number of vocabulary terms).
count_matrix.shape

(23, 998)

# Cosine Similarity

In [17]:
# Pairwise cosine similarity of the course vectors with themselves:
# entry [i, j] compares row i with row j of `count_matrix`.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [18]:
# Inspect the similarity matrix.
cosine_sim

array([[1.        , 0.40845187, 0.0531854 , 0.0712029 , 0.0606874 ,
        0.04759254, 0.12598425, 0.09090995, 0.05766598, 0.07146837,
        0.07058736, 0.06324096, 0.04033862, 0.0768375 , 0.0278634 ,
        0.07025641, 0.0606111 , 0.04249196, 0.05737721, 0.03245669,
        0.03033289, 0.03818657, 0.05084827],
       [0.40845187, 1.        , 0.03943243, 0.03864985, 0.04423547,
        0.0371976 , 0.07247895, 0.07136068, 0.04609875, 0.05577112,
        0.0369052 , 0.03439075, 0.07027103, 0.06574449, 0.02869506,
        0.06224008, 0.04958029, 0.01876042, 0.02765832, 0.01383899,
        0.02517811, 0.02325032, 0.07709172],
       [0.0531854 , 0.03943243, 1.        , 0.19832677, 0.94981768,
        0.07004002, 0.19813141, 0.2460451 , 0.08732629, 0.1017653 ,
        0.15791367, 0.12163332, 0.0616307 , 0.09846429, 0.04961234,
        0.3037476 , 0.17272073, 0.06113105, 0.06205329, 0.05101798,
        0.15230791, 0.0639404 , 0.17278153],
       [0.0712029 , 0.03864985, 0.19832677, 1.   

# Recommendation function

In [19]:
# Series of the DataFrame's index labels. Kept as a module-level name in case
# other cells read it; `recommendations` below does not need it.
indices = pd.Series(df.index)


def recommendations(title, cosine_sim = cosine_sim, top_n = 2):
    """Return the index labels of the courses most similar to `title`.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['Courses']``. If several rows share
        the title, the first one is used.
    cosine_sim : array-like, shape (n_courses, n_courses)
        Pairwise similarity matrix whose rows are in the same order as the
        rows of ``df``. The default is bound when this cell is executed, so
        re-run this cell after recomputing the global ``cosine_sim``.
    top_n : int, default 2
        Number of recommendations to return.

    Returns
    -------
    list
        Up to `top_n` labels from ``df.index``, most similar first. The
        queried course itself is never included.

    Raises
    ------
    IndexError
        If no course has the given title.
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Row position (not label) of the first course with this title; the
    # similarity matrix is addressed by position.
    matches = np.flatnonzero((df['Courses'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No course titled {title!r} in df['Courses']")
    course_pos = int(matches[0])

    # Rank every other course by similarity. The queried course is removed
    # explicitly instead of assuming it always sorts into first place, which
    # is not guaranteed when another course has an equal similarity score.
    similarity = pd.Series(cosine_sim[course_pos])
    ranked = similarity.drop(course_pos).sort_values(ascending = False)

    # Translate row positions back into index labels.
    return [df.index[int(pos)] for pos in ranked.index[:top_n]]

# Testing

In [20]:
# The result is a list of index labels from `df`, not course names.
recommendations("AI & ML Training")

[1, 6]

In [21]:
# `recommendations` returns index labels, so the names are looked up by label
# (.loc) rather than by position (.iloc); the two only agree while `df` keeps
# its default 0..n-1 index.
r = recommendations('Software Testing')
for i in r:
    print(i, " --- ", df.loc[i, 'Courses'])

19  ---  Automation
18  ---  Manual Testing


In [22]:
# `recommendations` returns index labels, so the names are looked up by label
# (.loc) rather than by position (.iloc); the two only agree while `df` keeps
# its default 0..n-1 index.
r = recommendations('AI & ML Training')
for i in r:
    print(i, " --- ", df.loc[i, 'Courses'])

1  ---  Machine Learning
6  ---  Python & Django


In [23]:
# `recommendations` returns index labels, so the names are looked up by label
# (.loc) rather than by position (.iloc); the two only agree while `df` keeps
# its default 0..n-1 index.
r = recommendations('AngularJS Training')
for i in r:
    print(i, " --- ", df.loc[i, 'Courses'])

7  ---  Web Development
2  ---  Angular 2
