In [6]:
import pandas as pd
import glob
import csv
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import collocations
from nltk import pos_tag as pos
from nltk.tokenize import word_tokenize
from gensim.utils import simple_preprocess
from gensim.corpora.dictionary import Dictionary
from gensim.corpora import MmCorpus
from gensim.models import LdaModel, LdaMulticore, CoherenceModel
import matplotlib.pyplot as plt

In [2]:
# Import all CSV files from all Wiki articles and save them to one list
files = glob.glob("articles/*.csv")

all_articles = []

for file in files:
    read_handle = open(file, "r")
    text = list(csv.reader(read_handle, delimiter=","))        
    for article in text[1:]:
        all_articles.append(article[1])     

# How many articles has been read
print('Number of articles:', len(all_articles))

Number of articles: 488


In [4]:
all_tokens = []

# import stopwords
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

for article in all_articles:
    # Removing section headers and new line breaks
    text = re.sub("==.*==",'', article)
    text = re.sub("\n",'', text)
    
    # Convert a document into a list of tokens 
    # This lowercases, tokenizes, removes numerical values
    tokens = simple_preprocess(text)
    
    doc_out = []
    for word in tokens:    
        if word not in stop_words:  # to remove stopwords
            Lemmatized_Word = wnl.lemmatize(word)  # lemmatize
            doc_out.append(Lemmatized_Word)
    
    all_tokens.append(doc_out)

# Print out infromation about articles and number of tokens for top 15
print('Tokens groups:', len(all_tokens),'\n')
print("{0:7}{1:10}".format("-No-","--Tokens--"))
for x, tokens in enumerate(all_tokens[:15]):
    print("{0:3}{1:10}".format(x + 1, len(tokens)))

Tokens groups: 488 

-No-   --Tokens--
  1       324
  2       737
  3       164
  4       238
  5       276
  6       162
  7       547
  8       172
  9       546
 10      1168
 11       324
 12       396
 13       386
 14        95
 15       298


In [5]:
# create dictionary - a map of unique tokens
dictionary = Dictionary(all_tokens)
dictionary.filter_extremes(no_below = 10, no_above = 0.8)
print('Dictionary length:', len(dictionary.keys()))

# 100 tokens by frequency for cleaned up dictionary
new_t_most_freq = dictionary.most_common(100)
print('Top 20 tokens by frequency\n')

num = 1
for t, f in new_t_most_freq[:20]:
    print(str(num) + '.', t, '-', f)
    num = num + 1


# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(token) for token in all_tokens]
print('\nCorpus length:', len(corpus))

Dictionary length: 2299
Top 20 tokens by frequency

1. woman - 1486
2. company - 1318
3. president - 1271
4. new - 1225
5. business - 1113
6. board - 1048
7. school - 1040
8. year - 997
9. ceo - 979
10. first - 964
11. also - 942
12. executive - 778
13. state - 775
14. director - 747
15. one - 747
16. award - 736
17. york - 683
18. time - 673
19. served - 651
20. national - 619

Corpus length: 488


## Bigrams & Trigrams
https://nicharuc.github.io/topic_modeling/

### Bigrams

In [43]:
bigram_measures = collocations.BigramAssocMeasures()

finder = collocations.BigramCollocationFinder.from_documents(all_tokens)

# Filter only those that occur at least N times
finder.apply_freq_filter(20)
bigram_scores = finder.score_ngrams(bigram_measures.pmi)

# Create a dataframe with bigram PMI scores -  Pointwise Mutual Information
bigram_pmi = pd.DataFrame(bigram_scores)
bigram_pmi.columns = ['bigram', 'pmi']
bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
print('Number of bigrams:',len(bigram_pmi.index))
bigram_pmi.head(25)

Number of bigrams: 277


Unnamed: 0,bigram,pmi
0,"(hong, kong)",12.838263
1,"(simon, schuster)",12.671039
2,"(planned, parenthood)",12.548756
3,"(cum, laude)",12.101297
4,"(magna, cum)",12.101297
5,"(leon, guerrero)",11.730871
6,"(silicon, valley)",11.568174
7,"(covid, pandemic)",11.441557
8,"(joe, biden)",11.401351
9,"(chamber, commerce)",11.330116


In [8]:
# Filter for bigrams with only noun-type structures
def bigram_filter(bigram):
    tag = pos(bigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['NN']:
        return False
    if bigram[0] in stop_words or bigram[1] in stop_words:
        return False
    if 'n' in bigram or 't' in bigram:
        return False
    if 'PRON' in bigram:
        return False
    return True

In [44]:
filtered_bigram = bigram_pmi[bigram_pmi.apply(lambda bigram:\
                                              bigram_filter(bigram['bigram'])\
                                              and bigram.pmi > 5, axis = 1)][:500]
print('Number of filtered bigrams:',len(filtered_bigram.index))
filtered_bigram.head()

Number of filtered bigrams: 180


Unnamed: 0,bigram,pmi
0,"(hong, kong)",12.838263
1,"(simon, schuster)",12.671039
2,"(planned, parenthood)",12.548756
4,"(magna, cum)",12.101297
5,"(leon, guerrero)",11.730871


In [12]:
# Joining bigrams
bigrams = [' '.join(x) for x in filtered_bigram.bigram.values if len(x[0]) > 2 or len(x[1]) > 2]
bigrams[:10]

['hewlett packard',
 'alma mater',
 'desmond hellmann',
 'hong kong',
 'ben ishay',
 'simon schuster',
 'von tobel',
 'planned parenthood',
 'phi beta',
 'douglas elliman']

### Trigrams

In [15]:
trigram_measures = collocations.TrigramAssocMeasures()
finder = collocations.TrigramCollocationFinder.from_documents(all_tokens)
# Filter only those that occur at least N times
finder.apply_freq_filter(10)
trigram_scores = finder.score_ngrams(trigram_measures.pmi)

# Trigram dataframe
trigram_pmi = pd.DataFrame(trigram_scores)
trigram_pmi.columns = ['trigram', 'pmi']
trigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
print('Number of trigrams:',len(trigram_pmi.index))
trigram_pmi.head()

Number of trigrams: 140


Unnamed: 0,trigram,pmi
0,"(buena, salud, guide)",25.723594
1,"(phi, beta, kappa)",25.454133
2,"(magna, cum, laude)",24.202595
3,"(robert, wood, johnson)",21.49669
4,"(graduated, magna, cum)",21.043672


In [20]:
# Filter for trigrams with only noun-type structures
def trigram_filter(trigram):
    tag = pos(trigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['JJ','NN']:
        return False
    if trigram[0] in stop_words or trigram[-1] in stop_words or trigram[1] in stop_words:
        return False
    if 'n' in trigram or 't' in trigram:
         return False
    if 'PRON' in trigram:
        return False
    return True 

In [21]:
filtered_trigram = trigram_pmi[trigram_pmi.apply(lambda trigram: \
                                                 trigram_filter(trigram['trigram'])\
                                                 and trigram.pmi > 5, axis = 1)]

print('Number of filtered trigrams:',len(filtered_trigram.index))
filtered_trigram.head()

Number of filtered trigrams: 136


Unnamed: 0,trigram,pmi
0,"(buena, salud, guide)",25.723594
1,"(phi, beta, kappa)",25.454133
2,"(magna, cum, laude)",24.202595
3,"(robert, wood, johnson)",21.49669
4,"(graduated, magna, cum)",21.043672


In [22]:
# Joining trigrams
trigrams = [' '.join(x) for x in filtered_trigram.trigram.values if len(x[0]) > 2 or len(x[1]) > 2 and len(x[2]) > 2]
trigrams[:10]

['buena salud guide',
 'phi beta kappa',
 'magna cum laude',
 'robert wood johnson',
 'graduated magna cum',
 'doctor humane letter',
 'current dollar term',
 'wall street journal',
 'boy girl club',
 'carnegie mellon university']

### Concatenate n-grams

In [23]:
# Concatenate n-grams
def replace_ngram(x):
    for gram in trigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    for gram in bigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    return x

In [30]:
# Combine all tokens into one string
clean_articles = [' '.join(x) for x in all_tokens]

# Remplace ngrams with underscore (_) versions
ngram_articles = [replace_ngram(x) for x in clean_articles]

In [33]:
ngram_articles[0]

'dr anabel jensen american educator author best known work curriculum utilizing emotional intelligence former director nueva learning center became president six second ceo synapse school currently professor notre dame de namur university anabel lee jensen born two u army officer danish descent began attending brigham young university graduated ba psychology master education received ph university_california_berkeley majored child development minored statistic executive director nueva learning center california helped develop self science curriculum featured daniel goleman book emotional intelligence matter iq helped bring eq mainstream former nueva school administrator teacher jensen karen mccown joshua freedman marsha rideout left school found six second eq network non_profit focused education eq founding president helped write training program psychometric assessment organization including six second emotional intelligence assessment sei youth version sei yv co_founded elementary mi

In [36]:
# tokenize reviews + remove stop words + remove names + remove words with less than 2 characters
articles_w_ngrams = [word_tokenize(x) for x in ngram_articles]

In [37]:
articles_w_ngrams[2]

['jessica',
 'mah',
 'born',
 'may',
 'westchester',
 'county',
 'new_york',
 'american',
 'entrepreneur',
 'mah',
 'founded',
 'several',
 'company',
 'including',
 'indinero',
 'mahway',
 'mah',
 'born',
 'westchester',
 'county',
 'new_york',
 'parent',
 'entrepreneur',
 'clothing',
 'business',
 'immigrant',
 'hong_kong',
 'relocated',
 'united_state',
 'mah',
 'finished',
 'high_school',
 'age',
 'joined',
 'bard',
 'college',
 'simon',
 'rock',
 'age',
 'mah',
 'joined',
 'university_california_berkeley',
 'computer_science',
 'program',
 'graduated',
 'age',
 'mah',
 'began',
 'first',
 'business',
 'purchasing',
 'server',
 'space',
 'bulk',
 'selling',
 'fraction',
 'space',
 'cheaper',
 'price',
 'age',
 'started',
 'first',
 'internet',
 'company',
 'selling',
 'computer',
 'part',
 'ebay',
 'university_california_berkeley',
 'mah',
 'classmate',
 'andy',
 'su',
 'co_founded',
 'indinero',
 'fintech',
 'company',
 'providing',
 'accounting',
 'financial',
 'software',
 'busi