Data downloaded from https://www.kaggle.com/extralime/math-lectures/version/2

# Imports

In [1]:
import numpy as np
import pandas as pd

Standardize

In [2]:
import random
import re
import string

In [3]:
# takes ~30 seconds to import
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer

In [None]:
import spacy

Vectorize

In [4]:
from sklearn.feature_extraction.text import CountVectorizer


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv_tfidf = TfidfVectorizer()

Reduce Dimensions

In [6]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

Cluster

In [7]:
from sklearn.cluster import KMeans

Visualize

In [8]:
# !pip install umap-learn

In [9]:
from sklearn import preprocessing


In [10]:
import umap
import seaborn as sns
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


Storage

In [11]:
import pickle

# Read in the data

In [12]:
df = pd.read_csv("raw_text.csv")

In [13]:
le = preprocessing.LabelEncoder()
le.fit(df['label'])

df['le'] = le.transform(df['label']) 
df.head()

Unnamed: 0,text,label,le
0,The following content is\nprovided under a Cre...,Calculus,3
1,"In this sequence of segments,\nwe review some ...",Probability,9
2,The following content is\nprovided under a Cre...,CS,2
3,The following\ncontent is provided under a Cre...,Algorithms,1
4,The following\ncontent is provided under a Cre...,Algorithms,1


In [14]:
grouped_sorted = df.groupby('label').count().sort_values(['text'], ascending=False)
grouped_sorted

Unnamed: 0_level_0,text,le
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Algebra,152,152
Probability,124,124
CS,104,104
Diff. Eq.,93,93
Algorithms,81,81
Statistics,79,79
Calculus,70,70
Data Structures,62,62
AI,48,48
Math for Eng.,28,28


In [15]:
df['le']

0      3
1      9
2      2
3      1
4      1
      ..
855    7
856    5
857    6
858    0
859    5
Name: le, Length: 860, dtype: int64

In [16]:
# df.iloc[0][0]

### investigate stopwords

In [17]:
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

# PIPELINE

FUNCTION 1: Standardize  
FUNCTION 2: Vectorize  
FUNCTION 3: Reduce Dimensions  
FUNCTION 4: Cluster/Visualize

## Function 1: Standardize

Ideas: MWE, SpaCy (lemmatization), tokenization.  
Create list of custom stop words to remove from raw data.  
- going, just, let, minus, professor, audience, plus, okay, print, mit, respect, gonna, just, 

Create Compound Terms. (from nltk.tokenize import MWETokenizer # multi-word expression)  
- Laplace Transform
- Partial Derivative
- (variable) squared
- Power Series Expansion

Stem.  
- Probability vs. Probabilities
- Matrix vs. Matrices
- event vs. events
- edge vs. edges
- transform vs. transformation

In [18]:
# Figure out the MWETokenizer function; split them up and then join them back together.
# my_text = "You all are the greatest students of all time."
# mwe_tokenizer = MWETokenizer([('You','all'), ('of', 'all', 'time')])
# mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(my_text))

# mwe_tokens

In [19]:
import spacy
tokenizer = spacy.load('en_core_web_sm')

In [20]:
def tokenize_lemma(text):
        
    text_obj = tokenizer(text, disable=['parser', 'ner'])
    
    text_lemma = ' '.join([token.lemma_ for token in text_obj if not token.is_stop])
  
    return text_lemma

test_text = "The word probability is the same as the word probabilities"
tokenize_lemma(test_text)

'word probability word probabilitie'

In [21]:
def standardize(df):
    # replace '\n' with whitespace
    newline = lambda x: re.sub('\n', ' ', x)
    
    # remove numbers
    alphanumeric = lambda x: re.sub('\w*\d\w*|&gt;', '', x)
    # remove punctuation, convert to lowercase
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x.lower())
    
    lemmatize = lambda x: tokenize_lemma(x)
    
    # replace all "taylor's" with "taylor"
    df['text'] = df['text'].str.replace("taylor's", "taylor", case = False)
    df['text'] = df['text'].str.replace("transformation", "transform", case = False)
    
    # replace all of (custom stop words list) with empty string
    custom_stop = ['going','just','let','minus','professor','audience','play','okay','print','mit','respect','gonna','ok','thats','im','right','tes']
    rem_stop = lambda x: re.sub("|".join(custom_stop), "", x)
    
    df['text'] = df.text.map(newline).map(alphanumeric).map(punc_lower)
    df['text'] = df['text'].str.replace("laplace transform", "laplace_transform", case = False)
    df['text'] = df['text'].str.replace("partial derivative", "partial_derivative", case = False)
    df['text'] = df['text'].str.replace("power series expansion", "power_series_expansion", case = False)
    df['text'] = df.text.map(rem_stop)
    df['text'] = df.text.map(lemmatize)
    
    return df

In [22]:
df

Unnamed: 0,text,label,le
0,The following content is\nprovided under a Cre...,Calculus,3
1,"In this sequence of segments,\nwe review some ...",Probability,9
2,The following content is\nprovided under a Cre...,CS,2
3,The following\ncontent is provided under a Cre...,Algorithms,1
4,The following\ncontent is provided under a Cre...,Algorithms,1
...,...,...,...
855,The following content is\nprovided under a Cre...,Math for Eng.,7
856,&gt;&gt; [MUSIC] &gt;&gt; DAVID J. MALAN: All ...,Diff. Eq.,5
857,The following content is\nprovided by MIT Open...,Linear Algebra,6
858,The following content is\nprovided under a Cre...,AI,0


In [23]:
df_s = standardize(df) # executes in about 5 minutes

In [26]:
df.iloc[0][0]

'follow content provide creative common license support help   opencourseware continue offer high quality educational resource free donation view additional material hundred   course visit   opencourseware ocwedu     jerison relax sunny london ontario today send substitute   glad agenda today say would talk power series taylor formula guess week   friday    little example application course evaluation survey ill hand   minu class handout say   end term   not pick come grab   people tend pick walk grab   s thing miss decide office hour end term   not decide check website information loe forward final exam uh   not question technical stuff   s talk power series little bit think review story power series   attention power series way write function sum integral power x    number example power series polynomial forget type power series go finite number term end high ais   perfectly good example power series special kind power series want tell today power series behave exactly like polynomia

In [None]:
# df_s.iloc[42][0]

## Function 2: Vectorize

In [27]:
def vectorize_tfidf(df):
    '''
    Creates a sparse matrix of counts with Tf-Idf Vectorizer
    '''
    # define series to go into vectorizer
    x = df['text']
    # define vectorizer
    cv_tfidf = TfidfVectorizer(min_df=2, max_df=0.5)
    # vectorize: convert to sparse matrix
    sparse_matrix = cv_tfidf.fit_transform(x)
    feature_names = cv_tfidf.get_feature_names()
    # print the matrix
    print(pd.DataFrame(sparse_matrix.toarray(), columns=feature_names))
    # return the sparse matrix and feature names
    return sparse_matrix, feature_names

In [28]:
sm_v, feature_names = vectorize_tfidf(df_s)

      aa  aab  aah  aardvark  aaron        ab   abandon  abbreviate  \
0    0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
1    0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
2    0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
3    0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
4    0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
..   ...  ...  ...       ...    ...       ...       ...         ...   
855  0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
856  0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
857  0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
858  0.0  0.0  0.0       0.0    0.0  0.000000  0.000000         0.0   
859  0.0  0.0  0.0       0.0    0.0  0.109389  0.021198         0.0   

     abbreviation  abc  ...  zoo      zoom   zp  zpk   zr        zs   zt   zx  \
0             0.0  0.0  ...  0.0  0.000000  0.0  0.0  0.0  0.00000

In [29]:
sm_v.shape

(860, 10502)

In [30]:
feature_names

['aa',
 'aab',
 'aah',
 'aardvark',
 'aaron',
 'ab',
 'abandon',
 'abbreviate',
 'abbreviation',
 'abc',
 'abcd',
 'ability',
 'ablate',
 'ablation',
 'abnormal',
 'abort',
 'abraham',
 'abrupt',
 'abs',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absorb',
 'abstract',
 'abstraction',
 'abstractly',
 'absurd',
 'abumostafa',
 'abundance',
 'abundant',
 'abuse',
 'abx',
 'ac',
 'academia',
 'academic',
 'academy',
 'accelerate',
 'acceleration',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'access',
 'accessible',
 'accident',
 'accidental',
 'accidentally',
 'accommodate',
 'accomplish',
 'accord',
 'according',
 'accordingly',
 'account',
 'accountable',
 'accountably',
 'accounting',
 'accumula',
 'accumulate',
 'accumulated',
 'accumulative',
 'accuracy',
 'accurate',
 'accurately',
 'accustomed',
 'ace',
 'ache',
 'achievable',
 'achieve',
 'achievement',
 'acid',
 'ackermann',
 'acknowledge',
 'acknowledgement',
 'acl',
 'acoustic',
 'acquaint',
 'acquire',
 'acro

## Function 3: Reduce Dimensions

In [31]:
def reduce_dim(sm):
    # define Truncated SVD
    lsa = TruncatedSVD(40)
    # do LSA on sparse matrix "sm"
    rd = normalize(lsa.fit_transform(sm))
    
    # print explained variance ratio
#     print("LSA Explained Variance Ratio: ",lsa.explained_variance_ratio_)
    # return matrix of reduced dimensions
    return rd, lsa.components_

In [32]:
rd_v, lsa_components = reduce_dim(sm_v)

In [33]:
rd_v

# identify which rows come from each subject
# get the slice of rd_v that correspond to a subject
# plot a heat map of those rows
# --> see which topics are strongly associated with that subject
# compare subjects/compute entropy per subject (i.e. core math discipline vs. interdisciplinary field e.g. NLP)

array([[ 0.29060678,  0.20256223,  0.11349155, ...,  0.13670003,
        -0.05455222,  0.10012089],
       [ 0.18613807,  0.24687721, -0.05022695, ...,  0.03007343,
         0.22704304, -0.04304533],
       [ 0.18630519,  0.14359342,  0.22163487, ...,  0.02878994,
        -0.02413465, -0.10814504],
       ...,
       [ 0.62592726, -0.01189198,  0.08850105, ..., -0.04721063,
         0.05601461, -0.04634379],
       [ 0.16501553,  0.14118581,  0.34023781, ...,  0.03010239,
         0.11953425,  0.0440007 ],
       [ 0.35590365,  0.2694001 ,  0.15576531, ..., -0.12991931,
         0.13463747, -0.01787092]])

In [34]:
lsa_components

array([[ 7.39615647e-04,  1.45045760e-04,  7.06957133e-04, ...,
         4.33000734e-04,  2.02920909e-03,  1.81086131e-04],
       [-1.91425611e-05,  2.32506343e-04, -3.68802959e-04, ...,
         2.96096316e-04, -1.28958949e-03,  2.30159599e-04],
       [ 5.63842074e-04, -1.70416514e-04, -9.42271902e-06, ...,
         7.22792970e-05, -4.91809037e-05,  1.99960048e-05],
       ...,
       [ 7.70298825e-04,  4.92233755e-04, -1.90660231e-03, ...,
         1.12619626e-03,  6.55838764e-03,  2.00357470e-04],
       [ 1.88365178e-03,  3.54845764e-04,  2.67391462e-04, ...,
        -1.39857193e-03, -8.74487026e-05, -1.42066495e-04],
       [-9.12787189e-04, -4.19588869e-04, -1.02706319e-04, ...,
        -8.03684606e-04, -6.39529421e-04, -1.20599454e-04]])

### Look at the topics

In [35]:
def display_topics(model_components, feature_names, num_top_words, topic_names=None):
    for ix, topic in enumerate(model_components):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [36]:
# x = df_s['text']
# cv_tfidf = TfidfVectorizer(stop_words='english')
# sm = cv_tfidf.fit_transform(x)
# lsa = TruncatedSVD(10)
# rd = lsa.fit_transform(sm)
display_topics(lsa_components, feature_names, 10)


Topic  0
vector, matrix, column, row, probability, equation, solution, transpose, transform, plane

Topic  1
probability, event, random, conditional, node, distribution, outcome, log, sample, tree

Topic  2
node, tree, list, log, edge, search, insert, algorithm, subtree, link

Topic  3
laplace_transform, equation, integral, derivative, solution, differential, pre, delta, sine, theta

Topic  4
vector, dot, plane, length, transform, scalar, direction, span, member, component

Topic  5
laplace_transform, node, st, probability, event, infinity, integral, vector, sine, row

Topic  6
solution, equation, differential, event, probability, derivative, slope, node, initial, pre

Topic  7
node, lambda, tree, subtree, integral, link, address, child, pdf, variance

Topic  8
edge, vertex, path, graph, weight, cycle, short, algorithm, delta, tree

Topic  9
row, integral, determinant, theta, dx, area, curve, field, plane, delta

Topic  10
eigenvalue, eigenvector, matrix, lambda, event, integral, delt

In [37]:
grouped_sorted.index

Index(['Linear Algebra', 'Probability', 'CS', 'Diff. Eq.', 'Algorithms',
       'Statistics', 'Calculus', 'Data Structures', 'AI', 'Math for Eng.',
       'NLP'],
      dtype='object', name='label')

## Pickle stuff

In [38]:
df_pickle_open = open('df.pickle', 'wb')
pickle.dump(df, df_pickle_open)
df_pickle_open.close()

In [39]:
df_pickle_read = open('df.pickle', 'rb')
df_pickle = pickle.load(df_pickle_read)
df_pickle_read.close()

In [40]:
df_pickle

Unnamed: 0,text,label,le
0,follow content provide creative common license...,Calculus,3
1,sequence segment review mathematical backgroun...,Probability,9
2,follow content provide creative common license...,CS,2
3,follow content provide creative common license...,Algorithms,1
4,follow content provide creative common license...,Algorithms,1
...,...,...,...
855,follow content provide creative common license...,Math for Eng.,7
856,music david j malan end week see cir...,Diff. Eq.,5
857,follow content provide opencourseware creati...,Linear Algebra,6
858,follow content provide creative common license...,AI,0


In [41]:
df

Unnamed: 0,text,label,le
0,follow content provide creative common license...,Calculus,3
1,sequence segment review mathematical backgroun...,Probability,9
2,follow content provide creative common license...,CS,2
3,follow content provide creative common license...,Algorithms,1
4,follow content provide creative common license...,Algorithms,1
...,...,...,...
855,follow content provide creative common license...,Math for Eng.,7
856,music david j malan end week see cir...,Diff. Eq.,5
857,follow content provide opencourseware creati...,Linear Algebra,6
858,follow content provide creative common license...,AI,0


In [42]:
with open('grouped_sorted.pickle', 'wb') as writefile:
    pickle.dump(grouped_sorted, writefile)

In [44]:
with open('grouped_sorted.pickle', 'rb') as readfile:
    gs = pickle.load(readfile)
gs

Unnamed: 0_level_0,text,le
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Algebra,152,152
Probability,124,124
CS,104,104
Diff. Eq.,93,93
Algorithms,81,81
Statistics,79,79
Calculus,70,70
Data Structures,62,62
AI,48,48
Math for Eng.,28,28


## Function 4: Cluster/Visualize (irrelevant)

In [None]:
# From KMeansClustering.ipynb: 
# helper function that allows us to display data in 2 dimensions and highlights the clusters
def display_cluster(X,km=[],num_clusters=0):
    color = 'brgcmyk'
    alpha = 0.5
    s = 20
    if num_clusters == 0:
        plt.scatter(X[:,0],X[:,1],c = color[0],alpha = alpha,s = s)
    else:
        for i in range(num_clusters):
            plt.scatter(X[km.labels_==i,0],X[km.labels_==i,1],c = color[i],alpha = alpha,s=s)
            plt.scatter(km.cluster_centers_[i][0],km.cluster_centers_[i][1],c = color[i], marker = 'x', s = 100)
            
def cluster(rd, num_clusters = 5):
    km = KMeans(n_clusters=num_clusters,random_state=10,n_init=10) # n_init, number of times the K-mean algorithm will run
    km.fit(rd)
    display_cluster(rd,km,num_clusters)

In [None]:
cluster(rd_v)

In [None]:
df.label

In [None]:

X = rd_v
sns.scatterplot(X[:,0],X[:,1],hue = df.label)


## Try UMAP (irrelevant)

In [None]:
reducer = umap.UMAP(random_state=42, n_neighbors = 50)
reducer.fit(rd_v)

In [None]:
embedding = reducer.transform(rd_v)
# Verify that the result of calling transform is
# idenitical to accessing the embedding_ attribute
assert(np.all(embedding == reducer.embedding_))
embedding.shape

In [None]:
# palette = [sns.color_palette()[x] for x in df['le']]
plt.figure(figsize = (10,8))
plt.scatter(embedding[:, 0], embedding[:, 1], c=df['le'], cmap='Spectral', s=8)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP projection of Math Subjects', fontsize=15);

## Testing some ideas (irrelevant)

In [None]:
# import these modules 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

ps = PorterStemmer() 

# choose some words to be stemmed 
words = ["matrix", "matrices", "transform", "transformation", "probability", "probabilities"] 

for w in words: 
	print(w, " : ", ps.stem(w)) 


In [None]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
words = ["matrix", "matrices", "transform", "transformation", "probability", "probabilities"] 

for w in words: 
	print(w, " : ", stemmer.stem(w)) 

In [None]:
from nltk.stem.snowball import SnowballStemmer
sb = SnowballStemmer("english")
words = ["matrix", "matrices", "transform", "transformation", "probability", "probabilities"] 

for w in words: 
	print(w, " : ", sb.stem(w)) 

In [None]:
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
wn = WordNetLemmatizer()

words = ["matrix", "matrices", "transform", "transformation", "probability", "probabilities"] 

for w in words: 
	print(w, " : ", wn.lemmatize(w)) 

In [None]:
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
wn = WordNetLemmatizer()

words = ["matrix", "matrices", "transform", "transformation", "probability", "probabilities","laplace_transformation"] 

for w in words: 
	print(w, " : ", wn.lemmatize(w)) 

In [None]:
import spacy
tokenizer = spacy.load('en_core_web_sm')

In [None]:
def tokenize_lemma(text):
        
    text_obj = tokenizer(text, disable=['parser', 'ner'])
    
    text_lemma = ' '.join([token.lemma_ for token in text_obj if not token.is_stop])
  
    return text_lemma

In [None]:
test_text = "In mathematics, the laplace_transform is an integral transform named after its inventor Pierre-Simon Laplace. It is also known as a laplace_transformation. "
tokenize_lemma(test_text)