Instructions to run:
    Ensure the dataset is saved as raw_notes.csv in the working directory of the Python environment.

In [None]:
#Import all necessary modules at the start
import pickle
import re
import string

import contextualSpellCheck
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import spacy
from sklearn.decomposition import PCA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from wordcloud import wordcloud
from wordcloud import WordCloud

Data Visualisation

In [None]:
# Set up the NLP resources (spaCy model with the contextual spell checker,
# stop words, punctuation characters) and load the labelled sentences.
en_spa = spacy.load('en_core_web_sm')
contextualSpellCheck.add_to_pipe(en_spa)
stopwords = spacy.lang.en.stop_words.STOP_WORDS
punctuations = string.punctuation

# Keep only the columns used below and drop rows missing any of them.
data = pd.read_csv('raw_notes.csv')[['Course', 'Sentence', 'Topic']].dropna()

In [None]:
# Build one long, cleaned text string from every sentence for the corpus word cloud.
# NOTE(review): this re-reads the CSV and drops rows with a missing value in ANY
# column, replacing the column-restricted `data` loaded earlier - confirm intended.
data = pd.read_csv('raw_notes.csv').dropna()
# Join the sentence values themselves: str(Series) only yields pandas' truncated
# display text (index numbers, "...", "Name: Sentence, dtype: object").
tex = ' '.join(data['Sentence'].astype(str))
tex = tex.replace("\n", " ").replace("\r", " ")  # Remove line breaks
tex = re.sub("[^a-zA-Z#]", " ", tex)  # Only letters and '#' remain; anything else becomes a space

In [None]:
tex # Display the cleaned corpus text string as the cell output

In [None]:
# Render a word cloud of the full corpus text and save it as an image file.
wordcloud = WordCloud().generate(tex) #Generate a wordcloud for the full corpus
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # Hide the axes; only the cloud image is wanted
plt.savefig('Wordcloud.png')

In [None]:
# Select the sentences belonging to each course.
# The second label is matched including its trailing newline character.
is_uda = data['Course'] == "Unstructured Data Analysis"
is_ul = data['Course'] == "Unsupervised Learning\n"
UDAtex = data.loc[is_uda, 'Sentence']
ULtex = data.loc[is_ul, 'Sentence']

In [None]:
# Flatten the Unstructured Data Analysis sentences into one cleaned text string.
# Join the sentence values themselves: str(Series) only yields pandas' truncated
# display text (index numbers, "...", "Name: Sentence, dtype: object").
UDAtex = ' '.join(UDAtex.astype(str)).replace("\n", " ").replace("\r", " ")
UDAtex = re.sub("[^a-zA-Z#]", " ", UDAtex)  # Only letters and '#' remain

In [None]:
# Flatten the Unsupervised Learning sentences into one cleaned text string.
# Join the sentence values themselves: str(Series) only yields pandas' truncated
# display text (index numbers, "...", "Name: Sentence, dtype: object").
ULtex = ' '.join(ULtex.astype(str)).replace("\n", " ").replace("\r", " ")
ULtex = re.sub("[^a-zA-Z#]", " ", ULtex)  # Only letters and '#' remain

In [None]:
# Display the cleaned Unstructured Data Analysis text as the cell output
UDAtex

In [None]:
# Display the cleaned Unsupervised Learning text as the cell output
ULtex

In [None]:
# Build one word cloud per course from the cleaned course texts.
wordcloud_UDA, wordcloud_UL = (
    WordCloud().generate(course_text) for course_text in (UDAtex, ULtex)
)

In [None]:
# Render the Unstructured Data Analysis word cloud and save it as an image file.
plt.imshow(wordcloud_UDA, interpolation='bilinear') #Save each wordcloud
plt.axis("off")  # Hide the axes; only the cloud image is wanted
plt.savefig('Wordcloud_UDA.png')

In [None]:
# Render the Unsupervised Learning word cloud and save it as an image file.
plt.imshow(wordcloud_UL, interpolation='bilinear')
plt.axis("off")  # Hide the axes; only the cloud image is wanted
plt.savefig('Wordcloud_UL.png')

In [None]:
def spacy_tokenizer(sentence):
    """Tokenize a sentence into lower-cased, spell-checked lemmas.

    The sentence is parsed with the module-level spaCy pipeline ``en_spa``.
    When the spell checker produced a corrected sentence, that corrected text
    is re-parsed and used instead of the raw parse. Proper nouns keep their
    surface form, every other token is lemmatized, and stop words, punctuation
    and empty tokens are removed.

    Args:
        sentence: Raw sentence text.

    Returns:
        A list of token strings.
    """
    docs = en_spa(sentence)
    # Use the spell-checked text when there is one. The attribute is expected
    # to be empty when nothing was corrected (verify against the installed
    # contextualSpellCheck version); in that case the raw parse is kept.
    corrected = docs._.outcome_spellCheck
    if corrected:
        docs = en_spa(corrected)
    # The proper-noun test must use the part-of-speech tag (pos_); the lemma
    # text is never the tag name "PROPN", so comparing lemma_ always lemmatized.
    tokens = [
        word.lower_ if word.pos_ == "PROPN" else word.lemma_.lower().strip()
        for word in docs
    ]
    tokens = [word for word in tokens if word not in stopwords]
    # `punctuations` is a string, so this substring test also drops empty tokens.
    tokens = [word for word in tokens if word not in punctuations]
    return tokens

In [None]:
# One vectorizer per weighting scheme and n-gram range, all sharing the spaCy tokenizer.

# TF-IDF weighted: unigram, unigram+bigram, unigram+bigram+trigram
vectorizer = TfidfVectorizer(ngram_range=(1, 1), tokenizer=spacy_tokenizer)
vectorizer2 = TfidfVectorizer(ngram_range=(1, 2), tokenizer=spacy_tokenizer)
vectorizer3 = TfidfVectorizer(ngram_range=(1, 3), tokenizer=spacy_tokenizer)

# Raw counts: unigram, unigram+bigram, unigram+bigram+trigram
count_vec = CountVectorizer(ngram_range=(1, 1), tokenizer=spacy_tokenizer)
count_vec2 = CountVectorizer(ngram_range=(1, 2), tokenizer=spacy_tokenizer)
count_vec3 = CountVectorizer(ngram_range=(1, 3), tokenizer=spacy_tokenizer)

In [None]:
# Fit each vectorizer on the sentence column and keep the resulting sparse matrix.
# TF-IDF matrices: X (1-gram), Y2 (1-2 gram), Y5 (1-3 gram)
X = vectorizer.fit_transform(data['Sentence'])
Y2 = vectorizer2.fit_transform(data['Sentence'])
Y5 = vectorizer3.fit_transform(data['Sentence'])
# Count matrices: X2 (1-gram), Y3 (1-2 gram), Y4 (1-3 gram)
X2 = count_vec.fit_transform(data['Sentence'])
Y3 = count_vec2.fit_transform(data['Sentence'])
Y4 = count_vec3.fit_transform(data['Sentence'])

In [None]:
# Due to long computation time, save the fitted vectorizers as pickles
fitted_vectorizers = {
    "vectorizer.pickle": vectorizer,
    "vectorizer2.pickle": vectorizer2,
    "vectorizer3.pickle": vectorizer3,
    "vectorizer4.pickle": count_vec,
    "vectorizer5.pickle": count_vec2,
    "vectorizer6.pickle": count_vec3,
}
for pickle_path, fitted in fitted_vectorizers.items():
    # The context manager closes each file handle, even if pickling fails;
    # a bare open() inside pickle.dump() leaves the handle open.
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(fitted, pickle_file)

In [None]:
# Exploratory SVD on the unigram TF-IDF matrix with many components, used to
# judge how many components LSA needs.
svd = TruncatedSVD(random_state=2023, n_iter=20, n_components=1000).fit(X)

In [None]:
# Plot the explained variance by dimensions
plt.plot(svd.explained_variance_ratio_)
plt.title("explained variance ratio")
plt.xlabel("Dimensions")
# Save before showing: once plt.show() has displayed the figure, a later
# savefig() can write an empty canvas.
plt.savefig('ExplainedVariance.png')
plt.show()

Unigram LSA

In [None]:
# Fit a 100-component LSA model on the unigram TF-IDF matrix, then list the five
# highest-weighted terms for each of the first eleven components (topics 0-10).
svd = TruncatedSVD(n_components=100, n_iter=20, random_state=2023)
X_ = svd.fit_transform(X)
terms = vectorizer.get_feature_names_out()
for topic_no, weights in enumerate(svd.components_[:11]):
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    print("Topic Number " + str(topic_no) + ": ")
    for term, _weight in ranked[:5]:
        print(term)
        print(" ")

Unigram + Bigram LSA

In [None]:
# Perform LSA for unigram+bigram vectorized dataset and output the topics and words within topics

# Exploratory fit with many components to inspect the explained variance.
svd = TruncatedSVD(n_components=1000, n_iter=20, random_state=2023)
svd.fit(Y2)

plt.plot(svd.explained_variance_ratio_)
plt.title("explained variance ratio TFID 2gram")
plt.xlabel("Dimensions")
plt.xlim(-10,100)  # Zoom in on the first components; must be set before saving
# Save before showing: once plt.show() has displayed the figure, a later
# savefig() can write an empty canvas.
plt.savefig('ExplainedVariance2gram.png')
plt.show()

svd = TruncatedSVD(n_components=10, n_iter=20, random_state=2023)
X_ = svd.fit_transform(Y2)
# Y2 was produced by vectorizer2, so its vocabulary is the one whose order
# matches the component columns (the unigram vocabulary would mislabel them).
terms = vectorizer2.get_feature_names_out()
for i, comp in enumerate(svd.components_):
    terms_comp = zip(terms, comp)
    # Six highest-weighted terms of this component
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:6]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])
        print(" ")

Unigram + Bigram + Trigram LSA

In [None]:
# Perform LSA for unigram+bigram+trigram vectorized dataset and output the topics and words within topics
# Y5 is the (1,3) n-gram TF-IDF matrix and vectorizer3 the vectorizer that
# produced it, so the component columns line up with that vocabulary.
svd = TruncatedSVD(n_components=1000, n_iter=20, random_state=2023)
svd.fit(Y5)

plt.plot(svd.explained_variance_ratio_)
plt.title("explained variance ratio TFID 3gram")
plt.xlabel("Dimensions")
# Save under its own file name so the unigram plot is not overwritten, and
# before plt.show(), after which savefig() can write an empty canvas.
plt.savefig('ExplainedVariance3gram.png')
plt.show()

svd = TruncatedSVD(n_components=100, n_iter=20, random_state=2023)
X_ = svd.fit_transform(Y5)
terms = vectorizer3.get_feature_names_out()
# Report the first eleven components (topics 0-10) only.
for i, comp in enumerate(svd.components_[:11]):
    terms_comp = zip(terms, comp)
    # Five highest-weighted terms of this component
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:5]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])
        print(" ")

LDA

In [None]:
# Perform LDA on the dataset to compare its topics with the LSA topics.
# LDA models word counts, so it is fit on the unigram count matrix X2 - the
# matrix produced by count_vec, whose vocabulary labels the topics below.
lda_model = LatentDirichletAllocation(n_components=10, random_state=0)
lda_model.fit(X2)

# Look the vocabulary up once instead of once per printed word.
lda_terms = count_vec.get_feature_names_out()

# print the topics learned by the model (ten highest-weighted words each)
for topic_idx, topic in enumerate(lda_model.components_):
    print("Topic %d:" % (topic_idx))
    print(" ".join(lda_terms[i] for i in topic.argsort()[:-10 - 1:-1]))



Logistic Regression (Unigram)

In [None]:
# Perform logistic regression on the unigram dataset and output the accuracy and F1 score
# 80/20 split of the unigram TF-IDF matrix against the course labels
X_train, X_test, y_train,y_test = train_test_split(X,data['Course'],random_state=2023, test_size=0.2)
clf = LogisticRegression()
clf.fit(X_train,y_train)
y_pred = (clf.predict(X_test))

# Print both metrics explicitly: a bare expression is only displayed when it is
# the last statement of a notebook cell, so the accuracy was never shown.
print(accuracy_score(y_test,y_pred))

print(f1_score(y_test,y_pred,pos_label="Unstructured Data Analysis"))

In [None]:
# Class probabilities from the logistic regression model for two unseen sentences.
new = [
    'Pixels are assigned integer values between 0 and 255',
    'NMF provides an approximate decomposition of the data matrix',
]
# Reuse the already-fitted unigram TF-IDF vectorizer (transform only, no refit).
X_New = vectorizer.transform(new)
clf.predict_proba(X_New)

SVC (Unigram)

In [None]:
# Train a support vector classifier (with probability estimates enabled) on the
# unigram split, then report accuracy and F1 on the held-out set.
clf2 = SVC(probability=True)
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)

print(
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, pos_label="Unstructured Data Analysis"),
    sep="\n\n",
)

In [None]:
# Class probabilities from the SVC model for the same unseen sentences
clf2.predict_proba(X_New)

In [None]:
# Compute ROC curves and print the AUC scores for the two binary course classifiers
positive_course = "Unstructured Data Analysis"
preds1 = clf.predict_proba(X_test)
preds2 = clf2.predict_proba(X_test)
# Look up the probability column of the positive class from classes_ rather
# than assuming it is column 0.
pos_col1 = list(clf.classes_).index(positive_course)
pos_col2 = list(clf2.classes_).index(positive_course)
fpr1, tpr1, thresh1 = roc_curve(y_test, preds1[:, pos_col1], pos_label=positive_course)
fpr2, tpr2, thresh2 = roc_curve(y_test, preds2[:, pos_col2], pos_label=positive_course)
# Baseline curve: the same constant score for every sample. pos_label has to
# be the label used in y_test; pos_label=1 is not the label these classifiers
# are scored with above.
probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, probs, pos_label=positive_course)
# roc_auc_score expects the score of the greater label for a binary string
# target, so score an explicit boolean target against the positive-class
# probability; this keeps the AUC consistent with the ROC curves above.
is_positive = (y_test == positive_course)
auc_score1 = roc_auc_score(is_positive, preds1[:, pos_col1])
auc_score2 = roc_auc_score(is_positive, preds2[:, pos_col2])

print(auc_score1, auc_score2)

In [None]:
# Draw both classifiers' ROC curves plus the baseline curve, then save and show the figure.
roc_lines = (
    (fpr1, tpr1, {'color': 'red', 'label': 'Logistic Regression'}),
    (fpr2, tpr2, {'color': 'blue', 'label': 'SVM'}),
    (p_fpr, p_tpr, {'color': 'green'}),  # baseline has no legend entry
)
for curve_fpr, curve_tpr, line_kwargs in roc_lines:
    plt.plot(curve_fpr, curve_tpr, **line_kwargs)
plt.title('ROC curve Comparison')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.savefig('ROC_AUC.png')
plt.show()

In [None]:
# Multinomial Naive Bayes classification of the topic labels on the unigram
# TF-IDF matrix; prints the held-out accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X, data['Topic'], test_size=0.2, random_state=2023
)
clf3 = MultinomialNB()
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
from sklearn.svm import LinearSVC
# Linear SVM classification of the topic labels on the unigram TF-IDF matrix,
# using the same split settings as the other classifiers; prints the held-out accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X, data['Topic'], test_size=0.2, random_state=2023
)
clf4 = LinearSVC()
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print(accuracy_score(y_test, y_pred))

K-Means

In [None]:
from sklearn.cluster import KMeans
# Cluster the unigram TF-IDF matrix into 4 groups.
kmeans = KMeans(n_clusters=4, verbose=1)
kmeans.fit(X)
clusters = kmeans.labels_
# Mean term weight per cluster; the ten largest entries name each cluster
# (approach taken from the lecture-notes example for text clustering).
df = pd.DataFrame(X.todense()).groupby(clusters).mean()
terms = vectorizer.get_feature_names_out()
for cluster_id, mean_weights in df.iterrows():
    print('\nCluster Number {}'.format(cluster_id))
    top_positions = np.argsort(mean_weights)[-10:]
    print(','.join([terms[pos] for pos in top_positions]))

# Project the documents onto two principal components for plotting.
pca = PCA(n_components=2, random_state=20)
pca_vecs = pca.fit_transform(X.toarray())
x0, x1 = pca_vecs[:, 0], pca_vecs[:, 1]

In [None]:
# Plot the 2-D PCA projection: one colour per topic, every point annotated
# with its K-Means cluster number.
d = {'Cluster':clusters.tolist(), 'x0':x0, 'x1':x1,'Topic':data['Topic']}
df3 = pd.DataFrame(data=d)
groups = df3.groupby('Topic')
fig, ax = plt.subplots()
for name, group in groups:
    # marker='o' with linestyle='' draws separate points; by default plot()
    # joins the points with a line, so the marker size (ms) is never drawn.
    ax.plot(group.x0, group.x1, marker='o', linestyle='', ms=6, label=name)

for i, txt in enumerate(clusters):
    ax.annotate(txt, (x0[i], x1[i]))
ax.legend()
plt.title('K-Means Cluster with single words')
plt.savefig('K-meansWords.png')

plt.show()

Repeat for Unigram + Bigram and Unigram + Bigram + Trigram datasets

In [None]:
from sklearn.cluster import KMeans # initialize KMeans with 4 clusters
# Cluster the unigram+bigram TF-IDF matrix.
kmeans = KMeans(verbose=1,n_clusters=4) 
kmeans.fit(Y2)
clusters = kmeans.labels_
# Output a few words for each cluster (example given in lecture notes for Text clustering)
df = pd.DataFrame(Y2.todense()).groupby(clusters).mean() 
# Y2 was produced by vectorizer2, so only its vocabulary matches the column
# positions; the unigram vocabulary would mislabel them or raise IndexError.
terms = vectorizer2.get_feature_names_out()
for i,r in df.iterrows():
    print('\nCluster Number {}'.format(i))
    # Ten terms with the highest mean weight in this cluster
    print(','.join([terms[t] for t in np.argsort(r)[-10:]]))

# Project the documents onto two principal components for plotting.
pca = PCA(n_components=2, random_state=20)
pca_vecs=pca.fit_transform(Y2.toarray())
x0 = pca_vecs[:,0]
x1 = pca_vecs[:,1]

In [None]:
# Plot the 2-D PCA projection of the unigram+bigram clustering: one colour per
# topic, every point annotated with its K-Means cluster number.
d = {'Cluster':clusters.tolist(), 'x0':x0, 'x1':x1,'Topic':data['Topic']}
df3 = pd.DataFrame(data=d)
groups = df3.groupby('Topic')
fig, ax = plt.subplots()
for name, group in groups:
    # marker='o' with linestyle='' draws separate points; by default plot()
    # joins the points with a line, so the marker size (ms) is never drawn.
    ax.plot(group.x0, group.x1, marker='o', linestyle='', ms=6, label=name)

for i, txt in enumerate(clusters):
    ax.annotate(txt, (x0[i], x1[i]))
ax.legend()
# Own title and file name, so this figure is labelled for the data it shows
# and does not overwrite the unigram plot saved as K-meansWords.png.
plt.title('K-Means Cluster with unigrams and bigrams')
plt.savefig('K-meansBigrams.png')

plt.show()

In [None]:
from sklearn.cluster import KMeans # initialize KMeans with 4 clusters
# Cluster the unigram+bigram+trigram TF-IDF matrix.
kmeans = KMeans(verbose=1,n_clusters=4) 
kmeans.fit(Y5)
clusters = kmeans.labels_
# Output a few words for each cluster (example given in lecture notes for Text clustering)
df = pd.DataFrame(Y5.todense()).groupby(clusters).mean() 
# Y5 was produced by vectorizer3, so only its vocabulary matches the column
# positions; the unigram vocabulary would mislabel them or raise IndexError.
terms = vectorizer3.get_feature_names_out()
for i,r in df.iterrows():
    print('\nCluster Number {}'.format(i))
    # Ten terms with the highest mean weight in this cluster
    print(','.join([terms[t] for t in np.argsort(r)[-10:]]))

# Project the documents onto two principal components for plotting.
pca = PCA(n_components=2, random_state=20)
pca_vecs=pca.fit_transform(Y5.toarray())
x0 = pca_vecs[:,0]
x1 = pca_vecs[:,1]

In [None]:
# Plot the 2-D PCA projection of the unigram+bigram+trigram clustering: one
# colour per topic, every point annotated with its K-Means cluster number.
d = {'Cluster':clusters.tolist(), 'x0':x0, 'x1':x1,'Topic':data['Topic']}
df3 = pd.DataFrame(data=d)
groups = df3.groupby('Topic')
fig, ax = plt.subplots()
for name, group in groups:
    # marker='o' with linestyle='' draws separate points; by default plot()
    # joins the points with a line, so the marker size (ms) is never drawn.
    ax.plot(group.x0, group.x1, marker='o', linestyle='', ms=6, label=name)

for i, txt in enumerate(clusters):
    ax.annotate(txt, (x0[i], x1[i]))
ax.legend()
# Own title and file name, so this figure is labelled for the data it shows
# and does not overwrite the unigram plot saved as K-meansWords.png.
plt.title('K-Means Cluster with unigrams, bigrams and trigrams')
plt.savefig('K-meansTrigrams.png')

plt.show()