In [1]:
!pip install nltk==3.6.7
!pip install gensim==4.1.2

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import gensim
from gensim import corpora
import nltk as nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/danswk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danswk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/danswk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
course1 = 'this is an introduction data science course which introduces data science to beginners'
course2 = 'machine learning for beginners'
courses = [course1, course2]
courses

['this is an introduction data science course which introduces data science to beginners',
 'machine learning for beginners']

In [4]:
tokenized_courses = [word_tokenize(course) for course in courses]  # tokenize sample courses
tokenized_courses

[['this',
  'is',
  'an',
  'introduction',
  'data',
  'science',
  'course',
  'which',
  'introduces',
  'data',
  'science',
  'to',
  'beginners'],
 ['machine', 'learning', 'for', 'beginners']]

In [5]:
tokens_dict = gensim.corpora.Dictionary(tokenized_courses)  # create token dictionary
tokens_dict.token2id

{'an': 0,
 'beginners': 1,
 'course': 2,
 'data': 3,
 'introduces': 4,
 'introduction': 5,
 'is': 6,
 'science': 7,
 'this': 8,
 'to': 9,
 'which': 10,
 'for': 11,
 'learning': 12,
 'machine': 13}

In [6]:
courses_bow = [tokens_dict.doc2bow(course) for course in tokenized_courses]  # generate BoW features for each tokenized course
courses_bow

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 2),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1)],
 [(1, 1), (11, 1), (12, 1), (13, 1)]]

In [7]:
for course_idx, course_bow in enumerate(courses_bow):
    print(f'Bag of words for course {course_idx}\n-------------------------')
    
    # print bow value (count) for each token index (word)
    for token_index, token_bow in course_bow:
        token = tokens_dict.get(token_index)
        print(f'- token: {token}, count: {token_bow}')

    print('=========================')

Bag of words for course 0
-------------------------
- token: an, count: 1
- token: beginners, count: 1
- token: course, count: 1
- token: data, count: 2
- token: introduces, count: 1
- token: introduction, count: 1
- token: is, count: 1
- token: science, count: 2
- token: this, count: 1
- token: to, count: 1
- token: which, count: 1
Bag of words for course 1
-------------------------
- token: beginners, count: 1
- token: for, count: 1
- token: learning, count: 1
- token: machine, count: 1


In [8]:
stop_words = set(stopwords.words('english'))  # get English stop words
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [9]:
tokenized_courses[0]  # get course 1 tokens with stop words

['this',
 'is',
 'an',
 'introduction',
 'data',
 'science',
 'course',
 'which',
 'introduces',
 'data',
 'science',
 'to',
 'beginners']

In [10]:
processed_tokens = [w for w in tokenized_courses[0] if not w.lower() in stop_words]  # get course 1 tokens without stop words
processed_tokens

['introduction',
 'data',
 'science',
 'course',
 'introduces',
 'data',
 'science',
 'beginners']

In [11]:
tags = nltk.pos_tag(tokenized_courses[0])  # analyze part of speech (POS), annotate each word
tags

[('this', 'DT'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('introduction', 'NN'),
 ('data', 'NNS'),
 ('science', 'NN'),
 ('course', 'NN'),
 ('which', 'WDT'),
 ('introduces', 'VBZ'),
 ('data', 'NNS'),
 ('science', 'NN'),
 ('to', 'TO'),
 ('beginners', 'NNS')]

In [12]:
# COURSE CONTENT DATASET
course_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_processed.csv"
course_content_df = pd.read_csv(course_url)
course_content_df.iloc[0, :]

COURSE_ID                                               ML0201EN
TITLE          robots are coming  build iot apps with watson ...
DESCRIPTION    have fun with iot and learn along the way  if ...
Name: 0, dtype: object

In [13]:
course_content_df['course_texts'] = course_content_df[['TITLE', 'DESCRIPTION']].agg(' '.join, axis=1)  # merge course titles, descriptions
course_content_df = course_content_df.reset_index()
course_content_df['index'] = course_content_df.index

In [14]:
def tokenize_course(course, keep_only_nouns=True):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(course)
    word_tokens = [w for w in word_tokens if (not w.lower() in stop_words) and (not w.isnumeric())]  # remove English stop words, numbers
    
    # keep only nouns
    if keep_only_nouns:
        filter_list = ['WDT', 'WP', 'WRB', 'FW', 'IN', 'JJR', 'JJS', 'MD',
                       'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS', 'RP']
        tags = nltk.pos_tag(word_tokens)
        word_tokens = [word for word, pos in tags if pos not in filter_list]

    return word_tokens

In [15]:
course_1 = course_content_df.iloc[0, :]['course_texts']
course_1

'robots are coming  build iot apps with watson  swift  and node red have fun with iot and learn along the way  if you re a swift developer and want to learn more about iot and watson ai services in the cloud  raspberry pi   and node red  you ve found the right place  you ll build iot apps to read temperature data  take pictures with a raspcam  use ai to recognize the objects in those pictures  and program an irobot create 2 robot  '

In [16]:
tokenize_course(course_1)

['robots',
 'coming',
 'build',
 'iot',
 'apps',
 'watson',
 'swift',
 'red',
 'fun',
 'iot',
 'learn',
 'way',
 'swift',
 'developer',
 'want',
 'learn',
 'iot',
 'watson',
 'ai',
 'services',
 'cloud',
 'raspberry',
 'pi',
 'node',
 'red',
 'found',
 'place',
 'build',
 'iot',
 'apps',
 'read',
 'temperature',
 'data',
 'take',
 'pictures',
 'raspcam',
 'use',
 'ai',
 'recognize',
 'objects',
 'pictures',
 'program',
 'irobot',
 'create',
 'robot']

In [17]:
# tokenize courses in courses_df
tokenized_courses = []
for content in course_content_df.course_texts:
    tokenized_course = tokenize_course(content)
    tokenized_courses.append(tokenized_course)
    
tokenized_courses[:2]

[['robots',
  'coming',
  'build',
  'iot',
  'apps',
  'watson',
  'swift',
  'red',
  'fun',
  'iot',
  'learn',
  'way',
  'swift',
  'developer',
  'want',
  'learn',
  'iot',
  'watson',
  'ai',
  'services',
  'cloud',
  'raspberry',
  'pi',
  'node',
  'red',
  'found',
  'place',
  'build',
  'iot',
  'apps',
  'read',
  'temperature',
  'data',
  'take',
  'pictures',
  'raspcam',
  'use',
  'ai',
  'recognize',
  'objects',
  'pictures',
  'program',
  'irobot',
  'create',
  'robot'],
 ['accelerating',
  'deep',
  'learning',
  'gpu',
  'training',
  'complex',
  'deep',
  'learning',
  'models',
  'large',
  'datasets',
  'takes',
  'time',
  'course',
  'learn',
  'use',
  'accelerated',
  'gpu',
  'hardware',
  'overcome',
  'scalability',
  'problem',
  'learning',
  'use',
  'accelerated',
  'hardware',
  'google',
  'tensor',
  'processing',
  'unit',
  'tpu',
  'nvidia',
  'gpu',
  'accelerate',
  'convolutional',
  'neural',
  'network',
  'computations',
  'time',
 

In [18]:
tokens_dict = gensim.corpora.Dictionary(tokenized_courses)

# preview token dictionary
for key, value in list(tokens_dict.token2id.items())[:10]:
    print(f'{key}: {value}')

ai: 0
apps: 1
build: 2
cloud: 3
coming: 4
create: 5
data: 6
developer: 7
found: 8
fun: 9


In [19]:
courses_bow = [tokens_dict.doc2bow(course) for course in tokenized_courses]
courses_bow[0]

[(0, 2),
 (1, 2),
 (2, 2),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 4),
 (11, 1),
 (12, 2),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 2),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 2),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 2),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 2),
 (33, 1)]

In [20]:
bow_docs = [courses_bow[i] for i in range(len(tokenized_courses))]
doc_indices = []
doc_ids = []
tokens = []
bows = []

for doc_index, doc_bow in enumerate(bow_docs):
    for token_index, token_bow in doc_bow:
        doc_indices.append(course_content_df.iloc[doc_index, 0])
        doc_ids.append(course_content_df.iloc[doc_index, 1])
        tokens.append(tokens_dict.get(token_index))
        bows.append(token_bow)

    bow_dicts = {'doc_index':doc_indices,
                 'doc_id':doc_ids,
                 'token':tokens,
                 'bow':bows}

In [21]:
course_bow_df = pd.DataFrame(bow_dicts)  # create dataframe of extracted BoW features
course_bow_df.head(40)

Unnamed: 0,doc_index,doc_id,token,bow
0,0,ML0201EN,ai,2
1,0,ML0201EN,apps,2
2,0,ML0201EN,build,2
3,0,ML0201EN,cloud,1
4,0,ML0201EN,coming,1
5,0,ML0201EN,create,1
6,0,ML0201EN,data,1
7,0,ML0201EN,developer,1
8,0,ML0201EN,found,1
9,0,ML0201EN,fun,1
