In [1]:
# Importing our required libraries
import os
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import random
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize
import nltk
import spacy
import re

# scikit-learn library will generate our document-term matrix
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

# topic modelling
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
from pprint import pprint

import random
import warnings

# Hide DeprecationWarning messages for the rest of the session so that
# library deprecation notices do not clutter the cell outputs below.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Fix the seed of Python's built-in `random` module for repeatable runs.
# NOTE(review): only the stdlib generator is seeded here; whether other
# libraries used later need their own seed cannot be told from this cell.
random.seed(929)

In [3]:
# Read the full WSJ article table from the data directory.
wsj = pd.read_csv('..//data/wsj_full.csv')

# Topic numbers treated as relevant; rows with any other `Topic_Num`
# are excluded from everything that follows.
tp_li = [0, 2, 6, 7, 8, 9, 10, 14, 17, 19]
wsj_selected = wsj.loc[lambda frame: frame['Topic_Num'].isin(tp_li)]

In [4]:
# Build the document-term matrix with scikit-learn's CountVectorizer.
# A float `min_df` is a document-frequency proportion, so terms appearing in
# fewer than 3% of the documents are dropped; scikit-learn's built-in English
# stop-word list is removed as well.
vectorizer = CountVectorizer(min_df=.03, stop_words='english')

# Fit on the text of every selected article; the result is a sparse matrix
# with one row per article and one column per retained term.
document_term_matrix = vectorizer.fit_transform(wsj_selected['Text'].tolist())

# Vocabulary terms in column order. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out()`
# and fall back only on older installations. Kept as a plain list so that
# `features` has the same type regardless of the scikit-learn version.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# Densify into a DataFrame. `.toarray()` yields a regular ndarray rather than
# the legacy `np.matrix` returned by `.todense()`; the values are identical.
df_document_term = pd.DataFrame(document_term_matrix.toarray(), columns=features)
df_document_term

Unnamed: 0,000,10,100,11,12,13,14,15,150,16,...,wouldn,wrote,wsj,year,years,yield,yields,york,young,zero
0,0,2,0,0,0,0,0,0,0,0,...,1,0,0,16,1,2,2,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,2,0,0,0,0,0,0,0,0,...,0,0,1,5,2,1,1,1,0,0
3,0,1,0,0,1,1,1,0,0,0,...,0,0,2,4,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,0,1,...,1,0,0,2,0,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19311,0,0,1,0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,1,0,0
19312,0,2,0,0,0,0,0,0,0,0,...,0,0,0,5,0,1,1,2,0,0
19313,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
19314,4,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,1,0,0


In [5]:
# Label each row of the document-term matrix with its article ID.
# The IDs are taken positionally from `wsj_selected` (raw values, not the
# pandas index), so they line up with the row order the vectorizer was
# fitted on. The index is named 'file name'.
df_document_term.index = pd.Index(
    wsj_selected['Article ID'].values.tolist(),
    name='file name',
)
df_document_term

Unnamed: 0_level_0,000,10,100,11,12,13,14,15,150,16,...,wouldn,wrote,wsj,year,years,yield,yields,york,young,zero
file name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2424324530.xml,0,2,0,0,0,0,0,0,0,0,...,1,0,0,16,1,2,2,0,0,0
2249646792.xml,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2352540052.xml,0,2,0,0,0,0,0,0,0,0,...,0,0,1,5,2,1,1,1,0,0
2452848249.xml,0,1,0,0,1,1,1,0,0,0,...,0,0,2,4,0,0,0,1,0,0
2437040210.xml,0,1,0,0,2,0,0,0,0,1,...,1,0,0,2,0,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2382382107.xml,0,0,1,0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,1,0,0
2236060833.xml,0,2,0,0,0,0,0,0,0,0,...,0,0,0,5,0,1,1,2,0,0
2187814592.xml,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
2325697439.xml,4,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,1,0,0
