# Clase 23

In [1]:
import numpy as np
import warnings
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition  import TruncatedSVD
warnings.filterwarnings('ignore')

### 1. Latent Semantic Analysis (LSA):

In [2]:
# The data.
my_docs = ["The economic slowdown is becoming more severe",
           "The movie was simply awesome",
           "I like cooking my own food",
           "Samsung is announcing a new technology",
           "Machine Learning is an example of awesome technology",
           "All of us were excited at the movie",
           "We have to do more to reverse the economic slowdown"]

#### 1.1. Create a TF IDF representation:
TfidfVectorizer() arguments: <br>
- *max_features* : maximum number of features (distict words). <br>
- *min_df* : The minimum DF. Integer value means count and real number (0~1) means proportion. <br>
- *max_df* : The maximum DF. Integer value means count and real number (0~1) means proportion. Helps to filter out the stop words. <br>

In [3]:
my_docs = [x.lower() for x in my_docs]

In [4]:
my_docs

['the economic slowdown is becoming more severe',
 'the movie was simply awesome',
 'i like cooking my own food',
 'samsung is announcing a new technology',
 'machine learning is an example of awesome technology',
 'all of us were excited at the movie',
 'we have to do more to reverse the economic slowdown']

In [5]:
my_stop_words = ['us', 'like']

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
vectorizer = TfidfVectorizer(max_features = 15, min_df = 1, max_df = 3, stop_words = stopwords.words('english') + my_stop_words)
X = vectorizer.fit_transform(my_docs).toarray()

In [8]:
# Size of X (=m x n). m = number of documents = 7 & n = number of features.
X.shape

(7, 15)

In [9]:
# View the features.
features = vectorizer.get_feature_names_out()
print(features)

['announcing' 'awesome' 'cooking' 'economic' 'example' 'excited' 'food'
 'movie' 'new' 'reverse' 'samsung' 'severe' 'simply' 'slowdown'
 'technology']


#### 1.2. Apply the truncated SVD:

In [10]:
n_topics = 4
svd = TruncatedSVD(n_components=n_topics, n_iter=100)
svd.fit(X)

In [11]:
# get the V^t matrix.
vt = svd.components_
vtabs = np.abs(vt)

In [12]:
# Check for the size of V^t.
vt.shape

(4, 15)

#### 1.3. From each topic, extract the top features:

In [13]:
n_top = 3
for i in range(n_topics):
    topic_features = [features[idx] for idx in np.argsort(-vtabs[i,:])]   # argsort() shows the sorted index.
    topic_features_top = topic_features[0:n_top]
    if i == 0:
        topic_matrix = [topic_features_top]                    # list의 list 만들 준비! (¡La lista está lista!)
    else:
        topic_matrix.append(topic_features_top)

In [14]:
# Show the top features for each topic.
topic_matrix

[['economic', 'slowdown', 'reverse'],
 ['movie', 'awesome', 'simply'],
 ['technology', 'movie', 'excited'],
 ['food', 'cooking', 'new']]

In [15]:
# In view of the top features, we can name the topics.
topic_names = ['Economy', 'Movie','Technology', 'Cuisine']

#### 1.4. Label each document with the most predominant topic:

In [16]:
n_docs = len(my_docs)
for i in range(n_docs):
    score_pick = 0
    topic_pick = 0
    tokennized_doc = nltk.word_tokenize(my_docs[i])
    for j in range(n_topics):
        found = [ x in topic_matrix[j] for x in tokennized_doc ]
        score = np.sum(found)
        if (score > score_pick):
            score_pick = score
            topic_pick = j
    print("Document " + str(i+1) + " = " + topic_names[topic_pick])

Document 1 = Economy
Document 2 = Movie
Document 3 = Cuisine
Document 4 = Technology
Document 5 = Movie
Document 6 = Technology
Document 7 = Economy


**NOTE**: We can notice some inaccuracies.

In [23]:
import numpy as np
import warnings
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition  import LatentDirichletAllocation
warnings.filterwarnings('ignore')

## 1. Latent Dirichlet Allocation (LDA):



In [24]:
# The data.
my_docs = ["The economic slowdown is becoming more severe",
           "The movie was simply awesome",
           "I like cooking my own food",
           "Samsung is announcing a new technology",
           "Machine Learning is an example of awesome technology",
           "All of us were excited at the movie",
           "We have to do more to reverse the economic slowdown"]

In [25]:
my_docs

['The economic slowdown is becoming more severe',
 'The movie was simply awesome',
 'I like cooking my own food',
 'Samsung is announcing a new technology',
 'Machine Learning is an example of awesome technology',
 'All of us were excited at the movie',
 'We have to do more to reverse the economic slowdown']

In [26]:
my_docs = [x.lower() for x in my_docs]

### 1.1. Create a DTM representation

In [27]:
# Instancia de nuestra función
vectorizer = CountVectorizer(max_features = 15, min_df = 1, max_df = 3, stop_words = list(ENGLISH_STOP_WORDS))
X = vectorizer.fit_transform(my_docs).toarray()

In [28]:
X.shape

(7, 15)

In [29]:
X

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]])

In [30]:
# View the features.
features = vectorizer.get_feature_names_out()
print(features)

['announcing' 'awesome' 'economic' 'example' 'excited' 'food' 'learning'
 'movie' 'new' 'reverse' 'samsung' 'severe' 'simply' 'slowdown'
 'technology']


### 1.2. Apply the LDA:


In [31]:
# Get the topics.
n_topics = 4
lda = LatentDirichletAllocation(n_components=n_topics, random_state=123)
my_docs_topic = lda.fit_transform(X)

In [32]:
# row = document, column = topic.
my_docs_topic

array([[0.80515032, 0.06273478, 0.06255696, 0.06955794],
       [0.06455228, 0.06274657, 0.81007434, 0.06262681],
       [0.12522103, 0.12571681, 0.62369815, 0.12536402],
       [0.05117543, 0.05023664, 0.84846627, 0.05012166],
       [0.84833626, 0.05018186, 0.05138822, 0.05009367],
       [0.08342836, 0.08363634, 0.74944622, 0.08348908],
       [0.06434067, 0.06264053, 0.06253428, 0.81048452]])

In [33]:
# Sum along the row has to give 1.
my_docs_topic.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1.])

### 1.3. From each topic, extract the top features:


In [34]:
topic_composition = lda.components_
topic_composition.shape     # row = topic, column = feature (word).

(4, 15)

In [35]:
n_top = 3
for i in range(n_topics):
    topic_features = [features[idx] for idx in np.argsort(-topic_composition[i,:])]   # argsort() shows the sorted index.
    topic_features_top = topic_features[0:n_top]
    if i == 0:
        topic_matrix = [topic_features_top]                    # list의 list 만들 준비!
    else:
        topic_matrix.append(topic_features_top)

In [36]:
# Show the top features for each topic.
topic_matrix

[['awesome', 'technology', 'example'],
 ['food', 'awesome', 'excited'],
 ['movie', 'announcing', 'new'],
 ['economic', 'slowdown', 'reverse']]

In [37]:
# In view of the top features, we can name the topics.
topic_names = ['Technology', 'Cuisine', 'Movie','Economy']

### 1.4. Label each document with the most predominant topic:


In [38]:
# The most probable topic is given directly by the LDA output.
n_docs = len(my_docs)
for i in range(n_docs):
    topic_pick = np.argmax(my_docs_topic[i,:])
    print("Document " + str(i+1) + " = " + topic_names[topic_pick])

Document 1 = Technology
Document 2 = Movie
Document 3 = Movie
Document 4 = Movie
Document 5 = Technology
Document 6 = Movie
Document 7 = Economy


In [56]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets.reuters import load_data, get_word_index       # Reuters news data.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
warnings.filterwarnings('ignore')                  # Turn the warnings off.
%matplotlib inline

#### Ejercicio de [link](https://github.com/davidlealo/sic_ai_2024/blob/main/005_pln/chapter_7/61.%20ex_0514.ipynb)

In [57]:
n_words = 1000                                        # Size of the vocabulary.
(X_train, y_train), (X_test, y_test) = load_data(num_words = n_words, test_split = 0.3)
n_train_size = X_train.shape[0]

In [58]:
# Check for the shapes.
print("-"*50)
print("Training data X shape: {}".format(X_train.shape))
print("Training data y shape: {}".format(y_train.shape))
print("-"*50)
print("Test data X shape: {}".format(X_test.shape))
print("Test data y shape: {}".format(y_test.shape))
print("-"*50)

--------------------------------------------------
Training data X shape: (7859,)
Training data y shape: (7859,)
--------------------------------------------------
Test data X shape: (3369,)
Test data y shape: (3369,)
--------------------------------------------------


In [59]:
# Number of unique values of y = Number of categories of the newswires.
n_cat = pd.Series(y_train).nunique()
n_cat

46

In [60]:
# Print out an observation (document) contained in X.
# It is encoded as integers (indices).
print(X_train[0])

[1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 2, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 2, 2, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 2, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]


In [61]:
# Let's check for length of the first 100 documents.
# We notice that the length is not uniform.
print([len(a) for a in X_train[0:100]])

[87, 56, 139, 224, 101, 116, 100, 100, 82, 106, 31, 59, 65, 316, 527, 76, 114, 17, 91, 77, 231, 108, 83, 29, 95, 110, 23, 373, 114, 354, 133, 222, 571, 155, 83, 208, 170, 269, 74, 19, 23, 78, 21, 377, 104, 299, 89, 56, 94, 139, 118, 36, 137, 107, 83, 66, 70, 112, 88, 51, 83, 123, 17, 185, 84, 52, 102, 73, 106, 486, 107, 82, 263, 172, 491, 190, 143, 62, 26, 88, 114, 38, 85, 112, 793, 104, 25, 21, 101, 28, 25, 81, 135, 73, 62, 18, 90, 266, 91, 64]


In [62]:
# Download the dictionary to translate the indices.
my_dict = get_word_index(path='reuters_word_index.json')

In [63]:
my_dict

{'mdbl': 10996,
 'fawc': 16260,
 'degussa': 12089,
 'woods': 8803,
 'hanging': 13796,
 'localized': 20672,
 'sation': 20673,
 'chanthaburi': 20675,
 'refunding': 10997,
 'hermann': 8804,
 'passsengers': 20676,
 'stipulate': 20677,
 'heublein': 8352,
 'screaming': 20713,
 'tcby': 16261,
 'four': 185,
 'grains': 1642,
 'broiler': 20680,
 'wooden': 12090,
 'wednesday': 1220,
 'highveld': 13797,
 'duffour': 7593,
 '0053': 20681,
 'elections': 3914,
 '270': 2563,
 '271': 3551,
 '272': 5113,
 '273': 3552,
 '274': 3400,
 'rudman': 7975,
 '276': 3401,
 '277': 3478,
 '278': 3632,
 '279': 4309,
 'dormancy': 9381,
 'errors': 7247,
 'deferred': 3086,
 'sptnd': 20683,
 'cooking': 8805,
 'stratabit': 20684,
 'designing': 16262,
 'metalurgicos': 20685,
 'databank': 13798,
 '300er': 20686,
 'shocks': 20687,
 'nawg': 7972,
 'tnta': 20688,
 'perforations': 20689,
 'affiliates': 2891,
 '27p': 20690,
 'ching': 16263,
 'china': 595,
 'wagyu': 16264,
 'affiliated': 3189,
 'chino': 16265,
 'chinh': 16266,
 '

In [64]:
# Exchange the 'key' and 'value'.
my_dict_inv = {v:k for k,v in my_dict.items()}

In [65]:
my_dict_inv

{10996: 'mdbl',
 16260: 'fawc',
 12089: 'degussa',
 8803: 'woods',
 13796: 'hanging',
 20672: 'localized',
 20673: 'sation',
 20675: 'chanthaburi',
 10997: 'refunding',
 8804: 'hermann',
 20676: 'passsengers',
 20677: 'stipulate',
 8352: 'heublein',
 20713: 'screaming',
 16261: 'tcby',
 185: 'four',
 1642: 'grains',
 20680: 'broiler',
 12090: 'wooden',
 1220: 'wednesday',
 13797: 'highveld',
 7593: 'duffour',
 20681: '0053',
 3914: 'elections',
 2563: '270',
 3551: '271',
 5113: '272',
 3552: '273',
 3400: '274',
 7975: 'rudman',
 3401: '276',
 3478: '277',
 3632: '278',
 4309: '279',
 9381: 'dormancy',
 7247: 'errors',
 3086: 'deferred',
 20683: 'sptnd',
 8805: 'cooking',
 20684: 'stratabit',
 16262: 'designing',
 20685: 'metalurgicos',
 13798: 'databank',
 20686: '300er',
 20687: 'shocks',
 7972: 'nawg',
 20688: 'tnta',
 20689: 'perforations',
 2891: 'affiliates',
 20690: '27p',
 16263: 'ching',
 595: 'china',
 16264: 'wagyu',
 3189: 'affiliated',
 16265: 'chino',
 16266: 'chinh',
 2

In [66]:
# Translate each document.
i_news = 10                                        # Document number that can be changed at will.
news = list(pd.Series(X_train[i_news]).apply(lambda x: my_dict_inv[x]))
print(' '.join(news))

the federal gain only growth lt they meeting year reuter company did year an they of of reuter company of of 1987 had of of reuter had profits of pct dlrs


### Data preprocessing

In [67]:
# Padding: newswire lengths are uniformly matched to maxlen.
# Cut away if longer than maxlen and fill with 0s if shorter than maxlen.
X_train = sequence.pad_sequences(X_train, maxlen = 100)
X_test = sequence.pad_sequences(X_test, maxlen = 100)

In [68]:
# Apply one-hot-encoding to the y variable.
y = np.concatenate([y_train,y_test],axis=0)
y = to_categorical(y,46)
y_train = y[:n_train_size,:]
y_test = y[n_train_size:,:]

### 1.4. Define the model:


In [69]:
n_neurons = 100                   # Neurons within each memory cell.
n_input = 100                     # Dimension of the embeding space.

In [70]:
# LSTM network model.
my_model = Sequential()
my_model.add(Embedding(n_words, n_input))           # n_words = vocabulary size, n_input = dimension of the embedding space.
my_model.add(LSTM(units=n_neurons, return_sequences=False, input_shape=(None, n_input), activation='tanh'))
my_model.add(Dense(n_cat, activation='softmax'))

In [71]:
# View the summary.
my_model.summary()

### 1.5. Define the optimizer and compile:
