In [50]:
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.stem import *
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.corpora import Dictionary
from gensim.models import LdaModel


## Task 1:

In [51]:
news_categorizer = pd.read_csv('C:/Users/felix/Documents/TU Dortmund/Text as Data/Sheet4/NewsCategorizer.csv')

In [52]:
data = news_categorizer[['category','short_description']].copy()

## Task 2:

In [53]:
def preprocessing(listoftexts):
    """
    This function performs for a list of texts all preprocessing steps
    parameter: list of texts
    output: list of lists containing where each contained lists contains the preprocessed tokens
    """
    
    # preprosessing of the stopwords:
    lemmatizer = WordNetLemmatizer()
    
    stop_words = list(set(stopwords.words('english')))
    
    preprocessed_stop_words = []
    
    for k in np.arange(len(stop_words)):
        text= ""
        for i in np.arange(len(stop_words[k])):
            if stop_words[k][i].isalpha() or stop_words[k][i].isspace():
                text += stop_words[k][i].lower()
        text = lemmatizer.lemmatize(text)
        preprocessed_stop_words.append(text)
    
    # preprosessing of the actual text
    
    # list to save the proprocessed books
    preprocessed_books = []
    
    # list containing the regexes to clean the texts for not wanted pattern

    #for loop iterating over all texts while performing the proprocessing steps
    for k in np.arange(len(listoftexts)):
        
        # list to save the preprocessed text
        preprocessed_book = []
                
        # remove non-alphabetical chars except space and split the text into a list of tokens
        text= ""
        for i in np.arange(len(listoftexts[k])):
            if listoftexts[k][i].isalpha() or listoftexts[k][i].isspace():
                text += listoftexts[k][i].lower()
        text_split = text.split()
        
        # lemmatize each token and remove it if it is part of the stopwords
        for i in text_split:
            i = lemmatizer.lemmatize(i)
            if i not in preprocessed_stop_words:
                preprocessed_book.append(i)
        
        # add the list of preprocessed tokens of the text to the list of preprocessed texts
        preprocessed_books.append(preprocessed_book)
        
    return preprocessed_books

In [54]:
short_description = data['short_description'].to_list()


In [55]:
preprocessed_short_description = preprocessing(short_description)

## Task 3:

In [56]:
dictionary = Dictionary(preprocessed_short_description)

In [57]:
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [58]:
corpus = [dictionary.doc2bow(description) for description in preprocessed_short_description]

In [60]:
corpus

[[(0, 1),
  (1, 1),
  (2, 2),
  (3, 2),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 3),
  (9, 2),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1)],
 [(22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1)],
 [(28, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1)],
 [(44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 2)],
 [(53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1)],
 [(66, 1), (67, 2), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1)],
 [(73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1)],
 [(1, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1)],
 [(52, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91

In [61]:
print('Number of unique tokens: %d' % len(dictionary))

Number of unique tokens: 4516


In [62]:
print('Number of documents: %d' % len(corpus))

Number of documents: 50000


In [63]:
num_topics = 10
passes = 15
iterations = 15

In [64]:
temp = dictionary[0]
id2word = dictionary.id2token

In [65]:
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    alpha='auto',
    eta='auto',
    num_topics=num_topics,
    passes=passes
)

In [66]:
top_topics = model.top_topics(corpus)

In [87]:
print(top_topics)

[([(0.018083455, 'one'), (0.015956607, 'year'), (0.014132701, 'time'), (0.01176855, 'u'), (0.010602658, 'people'), (0.009917231, 'first'), (0.009829322, 'world'), (0.009065936, 'get'), (0.009053767, 'make'), (0.007827169, 'way'), (0.007815186, 'many'), (0.007564923, 'say'), (0.007524305, 'team'), (0.0073450897, 'two'), (0.007084856, 'week'), (0.00698462, 'woman'), (0.006907083, 'thing'), (0.0068391985, 'would'), (0.006642261, 'need'), (0.0063841008, 'even')], -3.349851781966112), ([(0.0368946, 'like'), (0.03409599, 'day'), (0.017081302, 'look'), (0.016434181, 'back'), (0.013333863, 'kid'), (0.012793657, 'season'), (0.0124761965, 'love'), (0.011100588, 'let'), (0.010573908, 'long'), (0.009324968, 'thats'), (0.008968384, 'keep'), (0.008661861, 'give'), (0.008025624, 'real'), (0.007642249, 'getting'), (0.007481933, 'head'), (0.0074294265, 'away'), (0.007077231, 'far'), (0.006938188, 'nothing'), (0.0068906397, 'set'), (0.006878804, 'left')], -4.297623928217506), ([(0.03934868, 'state'), (0

## Task 4:

In [67]:
topics_per_doc = model.print_topics()

In [79]:
best_topics = []

for doc in corpus:
    topics = model.get_document_topics(doc)
    max_topic = max(topics, key=lambda x:x[1])[0]
    best_topics.append(max_topic)
    

50000 [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 

In [82]:
data['topics'] = best_topics

In [83]:
data['topics'].value_counts()

6    49901
2       80
9       18
0        1
Name: topics, dtype: int64

In [84]:
results = data.groupby(['category','topics']).count()

In [86]:
print(results)

                       short_description
category       topics                   
BUSINESS       6                    5000
ENTERTAINMENT  6                    5000
FOOD & DRINK   6                    4995
               9                       5
PARENTING      6                    4996
               9                       4
POLITICS       6                    5000
SPORTS         6                    5000
STYLE & BEAUTY 2                      80
               6                    4919
               9                       1
TRAVEL         6                    4997
               9                       3
WELLNESS       0                       1
               6                    4994
               9                       5
WORLD NEWS     6                    5000


While K-means assigned the clusters more or less eually distributed among the categories, we see now that LDA mainly assigns just one topic. We should have in mind that our LDA model uses just 15 passes and 15 iterations. An increase in the number of passes and iterations could improve our performance.