In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
import gensim
import pyLDAvis.gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from pprint import pprint

import numpy as np
import pandas as pd
import seaborn as sb
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
from IPython.core.display import HTML
np.random.seed(2020)

In [None]:
# Load the news dataset from the input directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../input/news_dataset.csv")
df.head(n=5)

In [None]:
# Visualization
# Top words used across all one million news titles
# `datafile` is the 'news_title' column of `df`; the cells below read the
# raw title strings from it.
datafile = df['news_title']

# Defining the helper function
def top_words(top_n_words, count_vectorizer, text_data):
    """Return the most frequent terms in ``text_data`` and their counts.

    Parameters
    ----------
    top_n_words : int
        Maximum number of terms to return. If the fitted vocabulary holds
        fewer terms, all of them are returned.
    count_vectorizer : object
        Vectorizer that is fitted here through ``fit_transform`` and that
        afterwards exposes ``vocabulary_`` (a mapping of term -> column
        index), e.g. scikit-learn's ``CountVectorizer``.
    text_data : object
        Collection of raw documents; ``text_data.values`` is what gets passed
        to ``fit_transform``.

    Returns
    -------
    tuple
        ``(words, counts)``: two equally long lists ordered from the most to
        the least frequent term, where ``counts[i]`` is the total number of
        occurrences of ``words[i]`` across all documents.

    Raises
    ------
    ValueError
        If ``top_n_words`` is negative.
    """
    if top_n_words < 0:
        raise ValueError("top_n_words must be non-negative")

    vectorized_titles = count_vectorizer.fit_transform(text_data.values)

    # Column sums are the corpus-wide count of each term. The sum may come
    # back as a 1 x V np.matrix or as a plain array depending on the sparse
    # container, so flatten it to a 1-D ndarray before indexing.
    term_totals = np.asarray(vectorized_titles.sum(axis=0)).ravel()

    # Ascending argsort reversed -> most frequent first; slicing (rather than
    # indexing position by position) tolerates top_n_words > vocabulary size.
    top_indices = np.argsort(term_totals)[::-1][:top_n_words]

    # Look terms up by column index directly. The terms are already text, so
    # no ASCII round-trip is needed (it raised UnicodeEncodeError on terms
    # such as "café").
    index_to_term = {index: term for term, index in count_vectorizer.vocabulary_.items()}
    words = [index_to_term[int(i)] for i in top_indices]

    return (words, term_totals[top_indices].tolist())

In [None]:
# Omitting stop words
count_vectorizer = CountVectorizer(stop_words='english')
words, word_values = top_words(top_n_words=15, count_vectorizer=count_vectorizer, text_data=datafile)

# Bar chart of the returned terms (x axis) against their total counts (y axis).
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(range(len(words)), word_values)
ax.set_xticks(range(len(words)))
ax.set_xticklabels(words, rotation='vertical')
ax.set_title('Top Words in News Titles Dataset (Excluding Stop Words)')
ax.set_xlabel('Word')
ax.set_ylabel('Number of Occurrences')  # label spelling corrected
plt.show()

In [None]:
# Generating a histogram of news title word lengths
# Step 1: run every news title through TextBlob and keep its `pos_tags`
# result (the cells below treat each entry as a sequence of (word, tag) pairs).
# Iterating over the Series values avoids label-based `datafile[i]` lookups,
# which only work when the index happens to be 0..n-1.
tagged_titles = [TextBlob(title).pos_tags for title in datafile]

In [None]:
tagged_df = pd.DataFrame({'tags': tagged_titles})

# Number of tagged tokens in each title, in title order.
word_counts = [len(tags) for tags in tagged_df['tags']]

# How often each tag label (second element of every pair) occurs overall.
pos_counts = {}
for tags in tagged_df['tags']:
    for pair in tags:
        label = pair[1]
        pos_counts[label] = pos_counts.get(label, 0) + 1

print('Total Number of Words: ', np.sum(word_counts))
print('Mean Number of Words per News Title: ', np.mean(word_counts))

In [None]:
# Normal density with the sample mean / standard deviation of the word
# counts, evaluated on 50 evenly spaced points in [0, 50].
grid = np.linspace(0, 50, 50)
y = stats.norm.pdf(grid, loc=np.mean(word_counts), scale=np.std(word_counts))

# Density-normalised histogram of words per title with the fitted curve on top.
fig, ax = plt.subplots(figsize=(18, 8))
ax.hist(word_counts, bins=range(1, 50), density=True)
ax.plot(grid, y, 'r--', linewidth=1)
ax.set_title('News Title Word Lengths')
ax.set_xticks(range(1, 50))
ax.set_xlabel('Number of Words')
plt.show()

In [None]:
# Work on an explicit copy of the title column: adding the helper 'index'
# column to a bare slice of `df` triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous whether `df` itself is affected.
text_data = df[['news_title']].copy()
text_data['index'] = text_data.index
# `documents` is the name the rest of the notebook uses for this frame.
documents = text_data

In [None]:
# Number of rows (news titles) in `documents`.
documents.shape[0]

In [None]:
# Preview the first ten rows.
documents.head(10)

In [None]:
# Data Preprocessing
# Download NLTK's 'wordnet' resource; the WordNetLemmatizer used by
# stem_lemmatize() looks words up in it.
nltk.download('wordnet')

In [None]:
# Shared English Snowball stemmer used by stem_lemmatize().
stemmer = SnowballStemmer(language='english')

In [None]:
# Build the lemmatizer once instead of once per token: stem_lemmatize() is
# called for every token of every title.
_lemmatizer = WordNetLemmatizer()


def stem_lemmatize(text):
    """Lemmatize ``text`` as a verb, then stem the result.

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    str
        Output of the module-level ``stemmer`` applied to the verb lemma
        (``pos='v'``) of ``text``.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

In [None]:
def preprocessing(text):
    """Tokenize ``text``, drop stop words, and normalise what is left.

    The text is split with gensim's ``simple_preprocess``; tokens found in
    gensim's ``STOPWORDS`` are skipped and every remaining token is passed
    through ``stem_lemmatize``. Returns the resulting list of tokens in
    their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [stem_lemmatize(tok) for tok in tokens if tok not in stop_words]

In [None]:
# Use the title whose 'index' value is 9860 as a worked example.
sample_news = documents.loc[documents['index'] == 9860, 'news_title'].iloc[0]

print('The Original Document: ')
# Naive split on single spaces, to contrast with the preprocessed tokens.
words = sample_news.split(' ')
print(words)
print('\n\n Lemmatized & Tokenized Document: ')
print(preprocessing(sample_news))

In [None]:
# Run the preprocessing pipeline on every title; each entry is a token list.
records_processed = documents['news_title'].apply(preprocessing)

In [None]:
# Preview the first twenty preprocessed titles.
records_processed.head(20)

In [None]:
# Bag of Words on the news_title
# Build the gensim Dictionary (token <-> integer id) from the token lists.
dictionary = corpora.Dictionary(documents=records_processed)

In [None]:
# Convert every token list to its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, records_processed))
corpus[9860]

In [None]:
doc_9860 = corpus[9860]

# Each bag-of-words entry is indexed as [0] -> token id, [1] -> count.
for entry in doc_9860:
    token_id = entry[0]
    occurrences = entry[1]
    print("Word {} (\"{}\") shows up {} time.".format(token_id, dictionary[token_id], occurrences))

In [None]:
# LDA using Bag of Words
# 15 passes over the corpus (chosen so that the topics make more sense)
model_bow = gensim.models.LdaMulticore(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    workers=2,
)
# Persist the trained model next to the notebook.
model_bow.save('model5.gensim')

# Print every topic with its word/weight description.
for topic_id, topic_terms in model_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

In [None]:
# Collect the entries returned by print_topics() into a list, echoing each one.
topics = model_bow.print_topics()
topic_list = []
for entry in topics:
    print(entry)
    topic_list += [entry]

In [None]:
# Show all collected topic entries at once, as a single list.
print(topic_list)

In [None]:
# Optional interactive topic visualisation (disabled).
#pyLDAvis.enable_notebook()
#titles = pyLDAvis.gensim.prepare(model_bow, corpus, dictionary, mds = 'tsne')
#pyLDAvis.save_html(titles, 'lda.html')
#titles

In [None]:
# Topics Classification
# Evaluating the performance - LDA Bag of Words Model - Classification of Sample Document
# Preprocessed tokens of the sample document (row label 9860).
records_processed.loc[9860]

In [None]:
# Topic scores of the sample document, highest first (ties keep model order).
ranked_topics = sorted(model_bow[corpus[9860]], key=lambda pair: pair[1], reverse=True)
for topic_id, score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, model_bow.print_topic(topic_id, 10)))