In [4]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Load the cleaned Reddit world-news sentiment dataset from the working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV — confirm whether it should be kept.
data = pd.read_csv(filepath_or_buffer="reddit_worldnews_sentiments_clean.csv")

# Preview the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,Title,Num_Comments,Upvotes,Downvotes,Upvote_Ratio,Date_Posted,Flair,Post_Category,Top_Comment_Score,Sentiment_Label
0,An anti-gay Hungarian politician has resigned ...,849,204547,0,0.93,01/12/20 18:15,,Top,7555.0,negative
1,Trump Impeached for Abuse of Power,879,202909,0,0.88,19/12/19 1:23,Trump,Top,5150.0,negative
2,Vladimir Putin's black belt revoked by interna...,798,200149,0,0.89,28/02/22 20:45,,Top,2907.0,neutral
3,"Two weeks before his inauguration, Donald J. T...",914,189352,0,0.84,19/07/18 2:06,,Top,249.0,positive
4,"Queen Elizabeth II has died, Buckingham Palace...",905,189025,0,0.79,08/09/22 17:32,,Top,1.0,negative


In [8]:
# Pull the post titles out of the frame as a Series; this is the text corpus
# used by the cells below.
title = data.loc[:, "Title"]

# Preview the first five titles as the cell output.
title.head(n=5)

0    An anti-gay Hungarian politician has resigned ...
1                   Trump Impeached for Abuse of Power
2    Vladimir Putin's black belt revoked by interna...
3    Two weeks before his inauguration, Donald J. T...
4    Queen Elizabeth II has died, Buckingham Palace...
Name: Title, dtype: object

In [9]:
# Normalise the titles: lower-case them, then delete every character that is
# neither a word character (\w) nor whitespace (\s), i.e. strip punctuation.
# The pattern is a raw string so the backslashes reach the regex engine
# untouched; a plain '...' literal with \w / \s is an invalid escape sequence
# and triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
title = title.str.lower().str.replace(r'[^\w\s]', '', regex=True)

In [11]:
# Notebook-wide pandas display settings:
# - show every column instead of eliding the middle ones with "..."
# - keep wide frames on a single line rather than wrapping them across blocks
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

In [14]:
# Build a padded token table from the cleaned titles: one row per title, one
# column per token position, each cell holding the word found at that position
# (or 0 where the title is shorter than the longest one).
# Note: despite the variable names this is a padded *sequence* table, not a
# term-count matrix — columns are positions within a title, not vocabulary terms.
tokenizer = Tokenizer()

# Learn the vocabulary (word -> integer index) from the titles
tokenizer.fit_on_texts(title)

# Encode every title as a list of integer word indices
sequences = tokenizer.texts_to_sequences(title)

# Vocabulary size: number of known words plus one slot for index 0 (padding)
vocab_size = len(tokenizer.word_index) + 1

# Right-pad every sequence with zeros up to the length of the longest title
maxlen = max(len(seq) for seq in sequences)
term_doc_matrix = pad_sequences(sequences, padding='post', maxlen=maxlen)

# Reverse the tokenizer's mapping: integer index -> word
reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}

# Translate indices back to words with a per-column dictionary lookup.
# Series.map does one hash lookup per cell, whereas DataFrame.replace with a
# dict of the entire vocabulary performs a separate masking pass for every
# word. The explicit 0 -> 0 entry keeps padding cells as 0 instead of NaN.
index_to_term = {0: 0, **reverse_word_index}
tdm_df = pd.DataFrame(term_doc_matrix).apply(lambda column: column.map(index_to_term))

# Display the table with words in place of indices
print(tdm_df)

               0           1          2           3             4           5              6          7             8           9          10          11            12          13    14         15        16      17     18  19      20   21          22       23       24            25  26    27   28    29        30        31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
0              an     antigay  hungarian  politician           has    resigned          after      being        caught          by     police     fleeing             a       25man  orgy    through         a  window      0   0       0    0           0        0        0             0   0     0    0     0         0         0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
1           trump   impeached        for       abuse            of       power              0          0             0           0          0           0             0           0     0     

In [15]:
# Persist the word table to disk; the positional row index is left out of the file.
tdm_df.to_csv(path_or_buf="Term Document Index.csv", index=False)

In [20]:
import numpy as np

In [23]:
# Build a count matrix: one row per title, one column per vocabulary index,
# each cell holding how many times that word occurs in that title.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(title)

# texts_to_matrix(mode="count") encodes all titles in a single call and returns
# a float array of shape (len(title), len(tokenizer.word_index) + 1); column 0
# is the reserved padding index and therefore stays all zeros. This replaces
# the manual loop that re-invoked texts_to_sequences once per title and
# incremented the cells one at a time.
term_document_matrix = tokenizer.texts_to_matrix(title, mode="count")

# Display the term-document matrix
print("Term-Document Matrix:")
print(term_document_matrix)


Term-Document Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 1. 1.]]


In [26]:
# Inspect the fitted vocabulary: a dict mapping each word to its integer index
# (indices start at 1; the matrices above reserve 0 for padding).
tokenizer.word_index

{'to': 1,
 'in': 2,
 'of': 3,
 'the': 4,
 'a': 5,
 'and': 6,
 'for': 7,
 'on': 8,
 'us': 9,
 'says': 10,
 'ukraine': 11,
 'as': 12,
 'from': 13,
 'is': 14,
 'with': 15,
 'by': 16,
 'has': 17,
 'trump': 18,
 'that': 19,
 'after': 20,
 'russia': 21,
 'russian': 22,
 'at': 23,
 'be': 24,
 'new': 25,
 'china': 26,
 'over': 27,
 'not': 28,
 'who': 29,
 'will': 30,
 'it': 31,
 'have': 32,
 'its': 33,
 'president': 34,
 'putin': 35,
 'was': 36,
 'coronavirus': 37,
 'war': 38,
 'world': 39,
 'he': 40,
 'people': 41,
 'uk': 42,
 'minister': 43,
 'against': 44,
 'into': 45,
 'an': 46,
 'are': 47,
 'his': 48,
 'more': 49,
 'police': 50,
 'their': 51,
 'government': 52,
 'up': 53,
 'than': 54,
 'eu': 55,
 'navalny': 56,
 'out': 57,
 'military': 58,
 'chinese': 59,
 'climate': 60,
 'hong': 61,
 'kong': 62,
 'first': 63,
 'gaza': 64,
 'they': 65,
 'years': 66,
 'un': 67,
 'report': 68,
 'say': 69,
 'all': 70,
 'russias': 71,
 'calls': 72,
 'ban': 73,
 'ukrainian': 74,
 'security': 75,
 'canada': 76,