In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import contractions

from tqdm import tqdm
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from scipy.spatial import distance
from gensim.models import Word2Vec

# KMeans

Below is an implemented KMeans algorithm on Word2Vec embeddings of the WikiHow article data. KMeans is an unsupervised learning algorithm, meaning we do not care about the "Summary" columns during training. The pipeline is summarized as follows:
- Preprocess Text (Remove stopwords, short words, punctuation, etc.)
- Tokenize Data
- Embed data into Word2Vec
- Pass embedded vectors into KMeans

We perform this process for each article and average the rouge score to evaluate our metric.

In [26]:
""" Here we set text to lower case, remove plurals, 
    expand contractions, remove punctuation, remove stopwords, and remove short words 
    (could also remove parentheticals)"""

stop = set(stopwords.words('english'))
def clean_text(text):
    ret = text.lower()
    ret = contractions.fix(text)
    ret = re.sub(r'\([^)]*\)', '', ret)
    ret = re.sub('"','', ret)
    ret = re.sub(r"'s\b","", ret)
    ret = re.sub("[^a-zA-Z]", " ", ret) 
    
    #Remove any words shorter than 2 letters
    tokens = [w for w in ret.split() if not w in stop]
    long_words=[]
    for i in tokens:
        if len(i)>=3:                 
            long_words.append(i)   
    return (" ".join(long_words)).strip()

In [27]:
#Loading Data
data = pd.read_csv('data/cleaned_data.csv')
data = data[['Summary', 'Text']]

#Create corpus for TF-IDF of summaries
#corpus = data["Text"]
#corpus = corpus.apply(clean_text)

#Create Sample Text Doc
test = data['Text'][0]
text = test.replace('\\', '').replace('/', '').replace('.,', '.').replace('.;,', '.')

In [22]:
test = data['Text'][0]
label = data['Summary'][0]
text = test.replace('\\', '').replace('/', '').replace('.,', '.').replace('.;,', '.')
lbl = label.replace('\\', '').replace('/', '').replace('.,', '. ').replace('.;,', '. ')

In [9]:
sentence = sent_tokenize(text)
clean = []
for sen in sentence:
    clean.append(clean_text(sen))

In [34]:
from rouge_score import rouge_scorer
class KMeans_Summ():
    def __init__(self, max_iters=300, n_init=10):
        self.sentence = None
        self.max_iters = max_iters
        self.n_init = n_init
    
    def embed_article(self, min_count, vector_size, article):
        clean=[]
        
        sentence = sent_tokenize(article)
        self.sentence = sentence
        
        for sen in sentence:
            clean.append(clean_text(sen))
        all_words = [i.split() for i in clean]
        model = Word2Vec(all_words, min_count=min_count, vector_size=vector_size)
        
        sent_vector=[]
        for i in clean:
            plus=0
            for j in i.split():
                plus+=model.wv[j]
            if len(i.split()) != 0:
                plus = plus/len(i.split())
            sent_vector.append(plus)
        return sent_vector
    
    def summarize_article(self, n_clusters, vector):
        kmeans = KMeans(n_clusters, init='k-means++', random_state=42, max_iter = self.max_iters, n_init = self.n_init) 
        y_kmeans = kmeans.fit_predict(vector)

        my_list=[]
        #print("Summarizing Article...")
        for i in range(n_clusters):
            my_dict={}

            for j in range(len(y_kmeans)):
                if y_kmeans[j]==i:
                    my_dict[j] = distance.euclidean(kmeans.cluster_centers_[i],vector[j])
            min_distance = min(my_dict.values())
            my_list.append(min(my_dict, key=my_dict.get))

        result = ""
        for i in sorted(my_list):
            result += self.sentence[i] + " "
        #print("Finished Summarizing!")
        return result                  

In [36]:
from rouge_score import rouge_scorer

#Parameter values for search
m_iters = [200, 300, 400]
n_init = [5, 10, 15]

#Loop to store all result text and summaries
tot_results = []
rouge1 = []
grid = {}
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
for init in n_init:
    for miter in m_iters:
        print(init, miter)
        scores = []
        for i, row in enumerate(data["Text"][:50000]):
            summary = data["Summary"][i].replace('\\', '').replace('/', '').replace('.,', '.').replace('.;,', '.')
            text = row.replace('\\', '').replace('/', '').replace('.,', '.').replace('.;,', '.')
            #print(text)

            if len(text) >= 1000:
                try:
                    summ = KMeans_Summ(miter, init)
                    vec = summ.embed_article(1, 300, text)
                    result = summ.summarize_article(3, vec)
                    tot_results.append(result)
                    #print(scorer.score(result, summary))
                    scores.append(scorer.score(result, summary))
                    
                except:
                    scores.append(0)
        
        for score in scores:
            #print("scores")
            if score != 0:
                rouge1.append(score['rouge1'].fmeasure)
        
        grid['{init}-{miter}'.format(init = init, miter = miter)] = np.mean(rouge1)

                       
print(len(tot_results))

5 200
5 300
5 400
10 200
10 300
10 400
15 200
15 300
15 400
338040


In [38]:
print(grid)
print(max(grid))

{'5-200': 0.24654490908978527, '5-300': 0.24654490908978527, '5-400': 0.24654490908978527, '10-200': 0.24629058450438096, '10-300': 0.2461379897531384, '10-400': 0.24603625991897665, '15-200': 0.24590532336630355, '15-300': 0.24580712095179866, '15-400': 0.24573074129607264}
5-400


In [6]:
rouge1 = []

for score in scores:
    if score != 0:
        rouge1.append(score['rouge1'].fmeasure)

print(len(scores))
print(len(rouge1))
print(np.mean(rouge1))

146016
140383
0.25337091953441254


# Keyword Extraction

While not essential to the KMeans above, a simple keyword extraction tool using TF-IDF is implemented below for users to tag their text as well. 

In [23]:
class TF_IDF():
    def __init__(self, corpus):
        self.text = corpus
        self.stopwords = set(stopwords.words("english"))
        self.cv = CountVectorizer(max_df=0.85, stop_words=self.stopwords)
        self.wordcount = self.cv.fit_transform(corpus)
    
        self.transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        self.transformer.fit(self.wordcount)
    
    def sort_vals(self, matrix):
        tuples = zip(matrix.col, matrix.data)
        return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
    
    def extract_top_k(self, feature_names, items, k=10):
        items = items[:k]

        scores = []
        features = []

        for idx, score in items:
            scores.append(round(score, 3))
            features.append(feature_names[idx])
        
        results = {}
        for idx in range(len(features)):
            results[features[idx]] = scores[idx]
        
        return results

    def extract_keywords(self, doc, k=10):
        feature_names = self.cv.get_feature_names()
        tf_idf_vector = self.transformer.transform(self.cv.transform([doc]))

        sort_items = self.sort_vals(tf_idf_vector.tocoo())
        keywords = self.extract_top_k(feature_names, sort_items, k)

        print("\nDocument")
        print(doc)
        print("\nKeywords")
        for k in keywords:
            print(k, keywords[k])
        
        return keywords