In [16]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import numpy.core.defchararray as npd
import spacy
from spacy.lang.en import English
# Load the spaCy pipeline named 'en' once; it is bound to the module-level name `nlp`.
nlp = spacy.load('en')
import re
import nltk

## Step 1: Read Data

In [8]:
# Read the papers CSV into a DataFrame; the path is relative to the notebook's working directory.
df=pd.read_csv("../input/Papers_sub.csv")

In [9]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(10, 7)

In [12]:
# Keep only the listed columns. This rebinds `df` to the column subset,
# so the frame as originally read is no longer reachable through `df`.
df=df[['Id', 'Title', 'EventType', 'PdfName', 'Abstract',
       'PaperText']]

In [13]:
# Preview the first four rows.
df.head(4)

Unnamed: 0,Id,Title,EventType,PdfName,Abstract,PaperText
0,5677,Double or Nothing: Multiplicative Incentive Me...,Poster,5677-double-or-nothing-multiplicative-incentiv...,Crowdsourcing has gained immense popularity in...,Double or Nothing: Multiplicative\nIncentive M...
1,5941,Learning with Symmetric Label Noise: The Impor...,Spotlight,5941-learning-with-symmetric-label-noise-the-i...,Convex potential minimisation is the de facto ...,Learning with Symmetric Label Noise: The\nImpo...
2,6019,Algorithmic Stability and Uniform Generalization,Poster,6019-algorithmic-stability-and-uniform-general...,One of the central questions in statistical le...,Algorithmic Stability and Uniform Generalizati...
3,6035,Adaptive Low-Complexity Sequential Inference f...,Poster,6035-adaptive-low-complexity-sequential-infere...,We develop a sequential low-complexity inferen...,Adaptive Low-Complexity Sequential Inference f...


## Task 1: Generate FastText word embeddings
1. Support a user-defined number of embedding dimensions.
2. Allow at least one text-preprocessing step to be switched on or off.

### Step 1: Preprocessing data
- Input data must be in `str` format for Python 3.
- fastText splits tokens on the following ASCII characters:
  - space
  - tab
  - vertical tab
  - carriage return
  - formfeed
  - the null character

#### Text preprocessing guidelines
https://www.kdnuggets.com/2018/08/practitioners-guide-processing-understanding-text-2.html

In [21]:
#import en_core_web_sm
#nlp = en_core_web_sm.load()
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
#from pycontractions import Contractions
import unicodedata

import spacy
# Shared NLP resources read as module-level names by the helper functions in this notebook.
nlp = spacy.load('en')  # spaCy pipeline called by lemmatize_text
tokenizer = ToktokTokenizer()  # tokenizer called by remove_stopwords
stopword_list = nltk.corpus.stopwords.words('english')  # NLTK English stopwords checked by remove_stopwords

In [23]:
def strip_html_tags(text):
    """Return the text content of `text` with markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of `get_text()` on the parsed tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [24]:
def remove_accented_chars(text):
    """Return `text` reduced to plain ASCII.

    The string is NFKD-normalized so accented letters decompose into a base
    letter plus combining marks; encoding to ASCII with errors ignored then
    drops the marks and every other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8', 'ignore')

In [22]:
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: the string to clean.
        remove_digits: when True, digits are removed as well.

    Returns:
        The cleaned string.
    """
    # The upper-case range must be A-Z. The previous A-z range spans ASCII
    # 65-122 and therefore also kept the characters [ \ ] ^ _ and `.
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [25]:
def simple_stemmer(text):
    """Return `text` with each whitespace-separated word replaced by its Porter stem.

    Words are obtained with `str.split()` and the stems are joined back with
    single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [26]:
def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma, joined by single spaces.

    The text is processed by the module-level `nlp` pipeline. Tokens whose
    lemma is the placeholder '-PRON-' keep their original surface text.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

In [27]:
def remove_stopwords(text, is_lower_case=False):
    """Return `text` without the tokens found in the module-level `stopword_list`.

    Args:
        text: the string to filter; it is split with the module-level `tokenizer`.
        is_lower_case: when True, tokens are compared to the stopword list
            as-is; otherwise each token is lower-cased for the comparison
            only (kept tokens retain their original casing).

    Returns:
        The surviving tokens joined by single spaces.
    """
    if is_lower_case:
        comparison_form = lambda tok: tok
    else:
        comparison_form = lambda tok: tok.lower()

    kept = []
    for raw_token in tokenizer.tokenize(text):
        candidate = raw_token.strip()
        if comparison_form(candidate) not in stopword_list:
            kept.append(candidate)
    return ' '.join(kept)

In [28]:
def normalize_corpus(doc, html_stripping=False, accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Normalize a single document string through a configurable pipeline.

    The steps run in this fixed order; each flag switches one step on or off:

    1. html_stripping        -> strip_html_tags
    2. accented_char_removal -> remove_accented_chars
    3. text_lower_case       -> str.lower
    4. (always) runs of CR/LF are replaced by one space
    5. text_lemmatization    -> lemmatize_text
    6. special_char_removal  -> isolate punctuation, then remove_special_characters
                                (remove_digits is forwarded to that helper)
    7. (always) runs of spaces are collapsed to one space
    8. stopword_removal      -> remove_stopwords

    Args:
        doc: the document text.
        html_stripping, accented_char_removal, text_lower_case,
        text_lemmatization, special_char_removal, stopword_removal:
            booleans enabling the corresponding step above.
        remove_digits: only used when special_char_removal is True.

    Returns:
        The normalized document string.
    """
    if html_stripping:
        doc = strip_html_tags(doc)

    if accented_char_removal:
        doc = remove_accented_chars(doc)

    if text_lower_case:
        doc = doc.lower()

    # Replace line breaks with a space. Inside a character class '|' is a
    # literal, so the previous pattern [\r|\n|\r\n] also erased pipe characters.
    doc = re.sub(r'[\r\n]+', ' ', doc)

    if text_lemmatization:
        doc = lemmatize_text(doc)

    if special_char_removal:
        # Surround punctuation with spaces so that removing it does not glue
        # the neighbouring words together. The hyphen is escaped: written as
        # "(-)" inside a class it is the range '(' to ')' and never matches '-'.
        special_char_pattern = re.compile(r'([{.()\-!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        doc = remove_special_characters(doc, remove_digits=remove_digits)

    # Collapse runs of spaces left behind by the previous steps.
    doc = re.sub(' +', ' ', doc)

    # Stopword background: http://www.cs.cornell.edu/~xanda/stopwords2017.pdf
    if stopword_removal:
        doc = remove_stopwords(doc, is_lower_case=text_lower_case)

    return doc

## Task 3: Generate document clustering

In [33]:
# Load the pre-cleaned papers; the clustering class below reads the `Title` and `PaperTextClean` columns.
df=pd.read_csv("../input/cleaned_data.csv")

#### LDA

In [34]:
from gensim import corpora, models, similarities
from itertools import chain
class document_clustering:
    """Group documents into clusters using the topics of a gensim LDA model.

    `data` must expose `Title` and `PaperTextClean` attributes that can be
    iterated in parallel, one title and one text string per document
    (presumably a pandas DataFrame - confirm against the caller).

    Attributes:
        data: the object passed to the constructor.
        lda_model: the fitted LDA model, or None before get_document_cluster runs.
        documents_cluster: {topic_id: [titles]} produced by get_document_cluster.
        cluster_word_map: {topic_id: show_topic(topic_id, topn=10)} for every topic.
    """

    def __init__(self,data):
        self.data=data
        self.lda_model=None
        self.documents_cluster=dict()
        self.cluster_word_map=dict()

    def get_document_cluster(self, topic_count=5, random_state=None):
        """Fit an LDA model on the documents and assign each one to its strong topics.

        Results are stored on the instance (`lda_model`, `documents_cluster`,
        `cluster_word_map`); nothing is returned. Each call rebuilds
        `documents_cluster` from scratch, so calling the method again does not
        append duplicate titles to earlier results.

        Args:
            topic_count: number of LDA topics (clusters) to fit.
            random_state: optional seed forwarded to the LDA model so that a
                run can be reproduced; None keeps the library default.
        """
        documents = list(zip(self.data.Title, self.data.PaperTextClean))

        # Keep only words longer than 3 characters. Cleaning can leave short
        # fragments behind (e.g. "pg13" -> "pg") that carry no meaning.
        tokenized = [[word for word in text.lower().split() if len(word) > 3]
                     for _, text in documents]

        # Drop words that occur exactly once in the whole corpus. A single
        # Counter pass replaces the quadratic sum(lists, []) + list.count scan.
        corpus_counts = Counter(word for tokens in tokenized for word in tokens)
        texts = [[word for word in tokens if corpus_counts[word] > 1]
                 for tokens in tokenized]

        # Dictionary and bag-of-words representation of the corpus.
        id2word = corpora.Dictionary(texts)
        bag_of_words = [id2word.doc2bow(text) for text in texts]

        # Train the LDA model.
        self.lda_model = models.ldamodel.LdaModel(
            corpus=bag_of_words, id2word=id2word, num_topics=topic_count,
            update_every=1, chunksize=10000, passes=1,
            random_state=random_state)

        # Infer the topic distribution of every document exactly once, so the
        # threshold and the assignments below are based on the same numbers
        # (indexing the transformed corpus again would redo the inference).
        doc_topics = [list(topics) for topics in self.lda_model[bag_of_words]]

        # A document joins a cluster when its score for that topic is above
        # the mean of all (document, topic) scores reported by the model.
        scores = [score for topics in doc_topics for _, score in topics]

        # Document cluster - {cluster_id: [list of document titles]}
        self.documents_cluster = dict()
        if scores:  # nothing to assign (and no mean to compute) without scores
            threshold = sum(scores) / len(scores)
            for (title, _), topics in zip(documents, doc_topics):
                for topic_id, score in topics:
                    if score > threshold:
                        self.documents_cluster.setdefault(topic_id, []).append(title)

        # Words describing each cluster - {cluster_id: [(word, significance of word)]}
        self.cluster_word_map = {key: self.lda_model.show_topic(key, topn = 10)
                                 for key in range(self.lda_model.num_topics)}

In [35]:
# Cluster at most the first 100 rows of the cleaned frame.
df1=df[:100]
obj=document_clustering(df1)
# Fit the model with the default topic_count; results are stored on `obj`.
obj.get_document_cluster()
# Display the {topic_id: [titles]} mapping.
obj.documents_cluster
#obj.cluster_word_map

{3: ['Double or Nothing: Multiplicative Incentive Mechanisms for Crowdsourcing'],
 4: ['Learning with Symmetric Label Noise: The Importance of Being Unhinged',
  'Planar Ultrametrics for Image Segmentation'],
 2: ['Adaptive Low-Complexity Sequential Inference for Dirichlet Process Mixture Models',
  'Robust Portfolio Optimization',
  'Expressing an Image Stream with a Sequence of Natural Sentences'],
 1: ['Covariance-Controlled Adaptive Langevin Thermostat for Large-Scale Bayesian Sampling',
  'Logarithmic Time Online Multiclass prediction'],
 0: ['Parallel Correlation Clustering on Big Graphs']}