In [1]:
import numpy as np
import pandas as pd
from collections import OrderedDict
import re
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.utils import shuffle
from wordcloud import WordCloud

ImportError: DLL load failed: The specified module could not be found.

In [None]:
# Toy corpus: five short sentences built from three fruit names.
corpus = [
    'Apple Orange Orange Apple',
    'Apple Banana Apple Banana',
    'Banana Apple Banana Banana Banana Apple',
    'Banana Orange Banana Banana Orange Banana',
    'Banana Apple Banana Banana Orange Banana',
]

In [None]:
def myCountVec(corpus):
    """Build a bag-of-words count matrix for a list of sentences.

    Sentences are split on whitespace and tokens are compared exactly, so case
    is NOT ignored ('Apple' and 'apple' are two different words).

    Parameters
    ----------
    corpus : list of str
        One sentence (document) per element.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(corpus), n_unique_words). Columns follow the
        sorted order of the unique words; entry [i, j] is the number of times
        word j occurs in sentence i.
    """
    # Tokenise once so the vocabulary and the counts come from the same tokens.
    tokenised = [line.split() for line in corpus]
    # A set comprehension de-duplicates; sorting fixes a stable column order.
    vocabulary = sorted({word for words in tokenised for word in words})
    word_index = {word: index for index, word in enumerate(vocabulary)}
    # Start from an all-zero matrix and bump a cell for every occurrence.
    vec_of_counts = np.zeros((len(tokenised), len(word_index)))
    for row, words in enumerate(tokenised):
        for word in words:
            vec_of_counts[row, word_index[word]] += 1
    return vec_of_counts

In [None]:
# Sanity check on the toy corpus: one row per sentence, one column per unique word.
myCountVec(corpus)

In [None]:
# Load the speech data ('~'-separated, latin1-encoded) and derive a binary label:
# 1 when a row has more likes than dislikes, 0 otherwise.
df = pd.read_csv('MrTrumpSpeeches.csv', sep='~', encoding='latin1')
df['sentiment'] = np.where(df['like_count'] > df['dislike_count'], 1, 0)
# Seed the shuffle so the row order, and every score, topic and plot derived
# from it, is identical on each Restart & Run All.
df = shuffle(df, random_state=0)
df.head()

In [None]:
# Let's check some characteristics of the dataframe, starting with (rows, columns).
df.shape

In [None]:
# OK, so 836 rows by 10 columns (as reported by df.shape above).
# Let's check the numeric columns for issues.
df.describe()

In [None]:
# OK, so far so good. Let's check the non-numeric columns, one summary per column.
for col in ['id','playlist','title','subtitles']:
    print(df[col].describe())

In [None]:
# Looks like we have all the data. Check each column for missing values / NaNs
# (True means the column contains at least one null).
df.isnull().any()

In [None]:
# General punctuation marks add nothing to the context, so they are removed.
# Text enclosed in [] describes the environment rather than what was said and
# does not contribute to the sentiment analysis, so it is dropped as well.
def cleaningFunc(line):
    """Normalise one subtitle string before vectorisation.

    Steps, in order:
      1. drop every balanced ``[...]`` annotation, including multi-word ones
         such as ``[cheers and applause]``;
      2. delete the punctuation characters ``/ + - , : . ' $ * % &``;
      3. lower-case the text;
      4. drop any leftover token that starts with ``[`` or ends with ``]``
         (unbalanced brackets);
      5. join the remaining tokens with single spaces.

    Parameters
    ----------
    line : str
        Raw subtitle text.

    Returns
    -------
    str
        Cleaned, lower-case text with words separated by exactly one space.
    """
    # Substitute a space (not '') so the words on either side of an annotation
    # are not glued together. '[' is excluded inside the match so a stray,
    # unclosed '[' cannot swallow the text up to a later annotation.
    line = re.sub(r'\[[^\[\]]*\]', ' ', line)
    # Raw string: the same character set as before, without invalid str escapes.
    line = re.sub(r"[/+\-,:.'$*%&]", '', line)
    line = line.lower()
    # split() already collapses runs of whitespace and trims both ends.
    newwords = [word for word in line.split()
                if not (word.startswith('[') or word.endswith(']'))]
    return " ".join(newwords)

In [None]:
# Store a cleaned copy of every subtitle next to the raw text.
df['subtitle_clean'] = df['subtitles'].apply(cleaningFunc)
df.head()

For the classification tests I will use the F1 score: this is a binary classification task, and F1 accounts for
both false positives and false negatives in a single metric.

In [None]:
# Keep only terms whose document frequency is between 5% and 95%,
# and drop English stop words. The same options are shared by all three vectorizers.
shared_options = dict(max_df=0.95, min_df=0.05, stop_words='english')
clean_subtitles = df['subtitle_clean'].values

# Raw term counts.
cvecs = CountVectorizer(**shared_options)
Xcv = cvecs.fit_transform(clean_subtitles)
print("Count matrix shape :", Xcv.shape)

# TF-IDF weights over single words.
tfvecs = TfidfVectorizer(**shared_options)
Xtf = tfvecs.fit_transform(clean_subtitles)
print("Tfidf matrix shape :", Xtf.shape)

# TF-IDF weights over 1- to 3-word n-grams.
tfvecsngram = TfidfVectorizer(ngram_range=(1, 3), **shared_options)
Xtfng = tfvecsngram.fit_transform(clean_subtitles)
print("Tfidf ngram matrix shape :", Xtfng.shape)

In [None]:
# Train and score every (classifier, feature matrix) pair and keep each F1 score
# in a list for plotting later.
tests = {1:'Logistic regression model on word count',2:'Logistic regression model on TFIDF',\
         3:'Logistic regression model on TFIDF + ngram',4:'Support Vector Machine model on word count',\
         5:'Support Vector Machine model on TFIDF',6:'Support Vector Machine model on TFIDF + ngram'}
results = []
# Tests 1-3 use LogisticRegression and tests 4-6 use LinearSVC, each over the
# same three feature matrices, so one loop body serves all six tests.
classifiers = [LogisticRegression] * 3 + [LinearSVC] * 3
feature_sets = [Xcv, Xtf, Xtfng] * 2
for test_id, (classifier, dataset) in enumerate(zip(classifiers, feature_sets), start=1):
    # Fixed random_state: every test is evaluated on the same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(dataset, df['sentiment'], test_size=0.2, random_state=0)
    model = classifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # F1 as a percentage, truncated (not rounded) to two decimals.
    score = int(f1_score(y_test, y_pred) * 10000) / 100.0
    print("F1 score for test {} is {}%".format(tests[test_id], score))
    results.append((test_id, score))
print("Tests complete")
###########################################################################################
# Bar plot of the F1 score obtained by each test.
plotting_df = pd.DataFrame(list(tests.items()),columns=['index','Test'])
# Match scores to tests by test id instead of relying on `results` being in dict order.
plotting_df['score'] = plotting_df['index'].map(dict(results))
fig,ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(15)
sns.barplot(x='index',y='score',hue='Test',data=plotting_df,ax=ax)
ax.set_xticks([])
ax.set_xlabel('') # remove xlabels
# The plotted values are F1 scores (in percent), not accuracy.
ax.set_ylabel('F1 score (%)',fontsize=15)
ax.set_title("F1 scores for Logistic Regression vs LinearSVC",fontsize=15)
ax.legend(loc=5)
for bar in ax.patches:
    # Widen every bar to 0.5 while keeping it centred where seaborn drew it.
    centre = bar.get_x() + bar.get_width() / 2.
    bar.set_x(centre - 0.5/2.)
    bar.set_width(0.5)
    height = bar.get_height()
    if not np.isfinite(height):
        # Placeholder patches (no value for this x/hue pair) have nothing to label.
        continue
    # Print the score just above the top of the bar.
    ax.annotate("%.2f" % height, (centre, height),
             ha='center', va='center', rotation=0, xytext=(0, 10), textcoords='offset points')

## Topic Modeling (20 marks)
1. Using the TFIDF and Count Vectorizer models imported from sklearn, perform topic modelling using the following topic modelling algorithms:
    1. NMF
    2. LDA
    3. SVD

2. When choosing the number of topics give a brief explanation of why that number was chosen.
3. Based on the top 10 words that each algorithm chooses for each topic cluster, discuss what category each topic falls under.

In [None]:
documents = list(df['subtitle_clean'])[:100] # sample: the first 100 rows of the frame


def _feature_names(vectorizer):
    """Return the fitted vocabulary as a list, in column order.

    scikit-learn 1.2 removed ``get_feature_names()`` in favour of
    ``get_feature_names_out()``; use whichever the installed version offers.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english',ngram_range=(1,5))
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _feature_names(tf_vectorizer)

n_topics = 10          # number of topics each model extracts
no_top_words = 10      # words shown per topic
no_top_documents = 4   # documents shown per topic

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print each topic's strongest words followed by its strongest documents.

    Parameters
    ----------
    H : 2-D array
        One row of word weights per topic.
    W : 2-D array
        One column of document weights per topic.
    feature_names : sequence of str
        Word for each column of ``H``.
    documents : sequence of str
        Text for each row of ``W``.
    no_top_words : int
        Number of highest-weighted words printed per topic.
    no_top_documents : int
        Number of highest-weighted documents printed per topic; each one is
        followed by a blank line.
    """
    for topic_number, word_weights in enumerate(H):
        # Indices of the largest weights, biggest first.
        best_word_ids = word_weights.argsort()[::-1][:no_top_words]
        print("Topic {}:".format(topic_number))
        print(" ".join(feature_names[word_id] for word_id in best_word_ids))
        best_doc_ids = np.argsort(W[:, topic_number])[::-1][:no_top_documents]
        for doc_id in best_doc_ids:
            print(documents[doc_id])
            print("")

In [None]:
# Fit NMF on the TF-IDF matrix; keep the per-document weights (W) and the
# per-topic word weights (H).
nmf_model = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf_model.fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_

# List the highest-weighted words of every NMF topic.
for topic_number, word_weights in enumerate(nmf_H):
    best_word_ids = word_weights.argsort()[:-no_top_words - 1:-1]
    print("Topic {}:".format(topic_number))
    print(" ".join([tfidf_feature_names[word_id] for word_id in best_word_ids]))
    print("")

In [None]:
# Fit LDA on the raw term counts, then print the top words and documents per topic.
lda_model = LatentDirichletAllocation(n_components=n_topics, max_iter=5, learning_method='online',
                                      learning_offset=50., random_state=0)
lda_model.fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_
display_topics(H=lda_H, W=lda_W, feature_names=tf_feature_names, documents=documents,
               no_top_words=no_top_words, no_top_documents=no_top_documents)

In [None]:
# Fit truncated SVD (LSI) on the TF-IDF matrix, then print the top words and documents per topic.
lsi_model = TruncatedSVD(n_components=n_topics, n_iter=12, random_state=1)
lsi_model.fit(tfidf)
lsi_W = lsi_model.transform(tfidf)
lsi_H = lsi_model.components_
display_topics(H=lsi_H, W=lsi_W, feature_names=tfidf_feature_names, documents=documents,
               no_top_words=no_top_words, no_top_documents=no_top_documents)

## Visualization (10 marks)
Choose the clusters obtained from one of the topic model algorithms above and plot a word cloud for each of the clusters.
For example, if the number of topics chosen was 10 and the topics were obtained from the SVD algorithm, 10 word clouds should be plotted.

In [None]:
# For every LSI topic, gather the text of its highest-scoring documents
# (this becomes the input of that topic's word cloud).
full_doc = {}
for topic_idx in range(len(lsi_H)):
    top_doc_indices = np.argsort(lsi_W[:, topic_idx])[::-1][:no_top_documents]
    # Rows of lsi_W are positions in `documents`. df was shuffled, so its index
    # labels do not match those positions; look the text up in `documents`.
    # Join with a space so the last word of one document and the first word of
    # the next are not fused into a single bogus token.
    full_doc[topic_idx] = " ".join(documents[doc_index] for doc_index in top_doc_indices)

In [None]:
# Draw one word cloud per topic, stacked vertically, to visualise the results.
plt.rcParams.update({
    'font.size': 12,
    'savefig.dpi': 100,
    'figure.subplot.bottom': .1,
})
f, axes = plt.subplots(len(full_doc), 1, figsize=(25, 25))
for topic, text in full_doc.items():
    # Fixed random_state keeps the word layout the same between runs.
    wc = WordCloud(width=1000, height=500, background_color='white', max_words=20,
                   random_state=1)
    wc.generate(text)
    panel = axes[topic]
    panel.imshow(wc)
    panel.set_title("Topic number = {}".format(topic))
    # Tick marks are meaningless on an image.
    panel.set_xticks([])
    panel.set_yticks([])

In [None]:
# Re-check the frame dimensions after the 'sentiment' and 'subtitle_clean' columns were added.
df.shape

In [None]:
# Scratch check: any run of two or more whitespace characters (newlines
# included) collapses to a single space.
s = "a     \n bbbbbb      c"
# Raw string so that \s reaches the regex engine instead of being an invalid str escape.
s = re.sub(r'\s{2,}', ' ', s)
s

In [None]:
# Scratch cell: a small list of strings, echoed back below.
# NOTE(review): not used by any cell visible in this notebook; candidate for removal.
s = ['abc','ab','abcd','acd']

s