In [1]:
import numpy as np
import re

In [2]:

data = "Information technology (IT) encompasses the use of computers, networks, storage, and other physical devices to create, process, store, secure, and exchange electronic data. It is essential for the efficient and effective operation of modern organizations, spanning various sectors such as finance, healthcare, education, and entertainment. IT includes disciplines like software development, data management, hardware maintenance, and network administration. Innovations such as cloud computing, artificial intelligence, big data analytics, and the Internet of Things (IoT) continue to transform interactions with technology, enhancing productivity, connectivity, and growth opportunities."
print(data)

Information technology (IT) encompasses the use of computers, networks, storage, and other physical devices to create, process, store, secure, and exchange electronic data. It is essential for the efficient and effective operation of modern organizations, spanning various sectors such as finance, healthcare, education, and entertainment. IT includes disciplines like software development, data management, hardware maintenance, and network administration. Innovations such as cloud computing, artificial intelligence, big data analytics, and the Internet of Things (IoT) continue to transform interactions with technology, enhancing productivity, connectivity, and growth opportunities.


In [3]:
sentences = data.split('.')
sentences

['Information technology (IT) encompasses the use of computers, networks, storage, and other physical devices to create, process, store, secure, and exchange electronic data',
 ' It is essential for the efficient and effective operation of modern organizations, spanning various sectors such as finance, healthcare, education, and entertainment',
 ' IT includes disciplines like software development, data management, hardware maintenance, and network administration',
 ' Innovations such as cloud computing, artificial intelligence, big data analytics, and the Internet of Things (IoT) continue to transform interactions with technology, enhancing productivity, connectivity, and growth opportunities',
 '']

In [4]:
#This code snippet performs text cleaning on a list of sentences
clean_sent=[] # inititalized an empty list: clean_sent=[] will hold the cleaned sentences.
for sentence in sentences:
    if sentence=="":
        continue
    sentence = re.sub('[^A-Za-z0-9]+', ' ', (sentence)) # uses a regular expression to replace all non-alphanumeric characters (anything other than letters and numbers) with a space.
    sentence = re.sub(r'(?:^| )\w (?:$| )', ' ', (sentence)).strip() # removes any single characters that are surrounded by spaces 
    sentence = sentence.lower()  #converts to lower case
    clean_sent.append(sentence) #adds the cleaned sentence to the clean_sent list.

print(clean_sent)

['information technology it encompasses the use of computers networks storage and other physical devices to create process store secure and exchange electronic data', 'it is essential for the efficient and effective operation of modern organizations spanning various sectors such as finance healthcare education and entertainment', 'it includes disciplines like software development data management hardware maintenance and network administration', 'innovations such as cloud computing artificial intelligence big data analytics and the internet of things iot continue to transform interactions with technology enhancing productivity connectivity and growth opportunities']


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_sent)
sequences = tokenizer.texts_to_sequences(clean_sent)
print(sequences)

#This code uses the Tokenizer class from TensorFlow's Keras module to convert a list of cleaned sentences into sequences of integers. 
#Each unique word in clean_sent is assigned an integer based on its frequency (more common words generally receive lower integers). 



[[10, 6, 2, 11, 3, 12, 4, 13, 14, 15, 1, 16, 17, 18, 7, 19, 20, 21, 22, 1, 23, 24, 5], [2, 25, 26, 27, 3, 28, 1, 29, 30, 4, 31, 32, 33, 34, 35, 8, 9, 36, 37, 38, 1, 39], [2, 40, 41, 42, 43, 44, 5, 45, 46, 47, 1, 48, 49], [50, 8, 9, 51, 52, 53, 54, 55, 5, 56, 1, 3, 57, 4, 58, 59, 60, 7, 61, 62, 63, 6, 64, 65, 66, 1, 67, 68]]


In [6]:
index_to_word = {}
word_to_index = {}

for i, sequence in enumerate(sequences):
#print(sequence)
    word_in_sentence = clean_sent[i].split()
#print(word_in_sentence)
    
    for j, value in enumerate(sequence):
        index_to_word[value] = word_in_sentence[j]
        word_to_index[word_in_sentence[j]] = value

print(index_to_word, "\n")
print(word_to_index)

{10: 'information', 6: 'technology', 2: 'it', 11: 'encompasses', 3: 'the', 12: 'use', 4: 'of', 13: 'computers', 14: 'networks', 15: 'storage', 1: 'and', 16: 'other', 17: 'physical', 18: 'devices', 7: 'to', 19: 'create', 20: 'process', 21: 'store', 22: 'secure', 23: 'exchange', 24: 'electronic', 5: 'data', 25: 'is', 26: 'essential', 27: 'for', 28: 'efficient', 29: 'effective', 30: 'operation', 31: 'modern', 32: 'organizations', 33: 'spanning', 34: 'various', 35: 'sectors', 8: 'such', 9: 'as', 36: 'finance', 37: 'healthcare', 38: 'education', 39: 'entertainment', 40: 'includes', 41: 'disciplines', 42: 'like', 43: 'software', 44: 'development', 45: 'management', 46: 'hardware', 47: 'maintenance', 48: 'network', 49: 'administration', 50: 'innovations', 51: 'cloud', 52: 'computing', 53: 'artificial', 54: 'intelligence', 55: 'big', 56: 'analytics', 57: 'internet', 58: 'things', 59: 'iot', 60: 'continue', 61: 'transform', 62: 'interactions', 63: 'with', 64: 'enhancing', 65: 'productivity', 66

In [8]:
#this code is preparing data for a word embedding model that learns word representations based on their context


vocab_size = len(tokenizer.word_index) + 1 #defines vocabulary size along with +1 due to padding reasons

emb_size = 10 #embedding soze which represents the number of dimensions in tbhe word embeddings
context_size = 2 #the size of the context window, indicating how many words before and after the target word to include in the context

contexts = [] #This list will store the context words for each target word.
targets = [] #This list will store each target word that the model aims to predict based on the context.


for sequence in sequences:  #Iterates over each sentence in sequences
    for i in range(context_size, len(sequence) - context_size): 
        target = sequence[i]
        context = [sequence[i - 2], sequence[i - 1], sequence[i + 1], sequence[i + 2]]
#         print(context)
        contexts.append(context)
        targets.append(target)
#Loops through each word in the sentence, starting from the position context_size and stopping context_size words before the end. 
#This ensures that there are enough words on both sides of the target word for context.

print(contexts, "\n")
print(targets)

[[10, 6, 11, 3], [6, 2, 3, 12], [2, 11, 12, 4], [11, 3, 4, 13], [3, 12, 13, 14], [12, 4, 14, 15], [4, 13, 15, 1], [13, 14, 1, 16], [14, 15, 16, 17], [15, 1, 17, 18], [1, 16, 18, 7], [16, 17, 7, 19], [17, 18, 19, 20], [18, 7, 20, 21], [7, 19, 21, 22], [19, 20, 22, 1], [20, 21, 1, 23], [21, 22, 23, 24], [22, 1, 24, 5], [2, 25, 27, 3], [25, 26, 3, 28], [26, 27, 28, 1], [27, 3, 1, 29], [3, 28, 29, 30], [28, 1, 30, 4], [1, 29, 4, 31], [29, 30, 31, 32], [30, 4, 32, 33], [4, 31, 33, 34], [31, 32, 34, 35], [32, 33, 35, 8], [33, 34, 8, 9], [34, 35, 9, 36], [35, 8, 36, 37], [8, 9, 37, 38], [9, 36, 38, 1], [36, 37, 1, 39], [2, 40, 42, 43], [40, 41, 43, 44], [41, 42, 44, 5], [42, 43, 5, 45], [43, 44, 45, 46], [44, 5, 46, 47], [5, 45, 47, 1], [45, 46, 1, 48], [46, 47, 48, 49], [50, 8, 51, 52], [8, 9, 52, 53], [9, 51, 53, 54], [51, 52, 54, 55], [52, 53, 55, 5], [53, 54, 5, 56], [54, 55, 56, 1], [55, 5, 1, 3], [5, 56, 3, 57], [56, 1, 57, 4], [1, 3, 4, 58], [3, 57, 58, 59], [57, 4, 59, 60], [4, 58, 60

In [9]:
#printing features with target
for i in range(5):
    words = []
    target = index_to_word.get(targets[i])
    for j in contexts[i]:
        words.append(index_to_word.get(j))
    print(words," -> ", target)

['information', 'technology', 'encompasses', 'the']  ->  it
['technology', 'it', 'the', 'use']  ->  encompasses
['it', 'encompasses', 'use', 'of']  ->  the
['encompasses', 'the', 'of', 'computers']  ->  use
['the', 'use', 'computers', 'networks']  ->  of


In [10]:
# Convert the contexts and targets to numpy arrays
X = np.array(contexts) #numpy error are easy to operate in python as it is inbuild library
Y = np.array(targets)

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda 

In [12]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=2*context_size),
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    Dense(vocab_size, activation='softmax')
])
#sequential model is a model that trains the neural network layer by layer



In [13]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#optimizer 

In [14]:
history = model.fit(X, Y, epochs=150)

Epoch 1/150

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0071 - loss: 4.2347   
Epoch 2/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0741 - loss: 4.2276 
Epoch 3/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0975 - loss: 4.2203 
Epoch 4/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0741 - loss: 4.2148 
Epoch 5/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0975 - loss: 4.2050 
Epoch 6/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0897 - loss: 4.1912 
Epoch 7/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0780 - loss: 4.1764 
Epoch 8/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0702 - loss: 4.1550 
Epoch 9/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [15]:
from sklearn.decomposition import PCA

embeddings = model.get_weights()[0]

pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)
#PCA is principle component analysis
#to reduce the problem of overfitting


In [18]:
# test model: select some sentences from above paragraph
test_sentenses = [
    "information technology encompasses the",
    "it includes disciplines like software",
    "innovations such as cloud computing"
]

In [19]:
import numpy as np
for sent in test_sentenses:
    test_words = sent.split(" ")
    x_test =[]
    for i in test_words:
        x_test.append(word_to_index.get(i))
    x_test = np.array([x_test])
    
    pred = model.predict(x_test)
    pred = np.argmax(pred[0])
    print("making a prediction for these words ", test_words, "\nresult = ", index_to_word.get(pred),"\n")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
making a prediction for these words  ['information', 'technology', 'encompasses', 'the'] 
result =  it 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
making a prediction for these words  ['it', 'includes', 'disciplines', 'like', 'software'] 
result =  disciplines 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
making a prediction for these words  ['innovations', 'such', 'as', 'cloud', 'computing'] 
result =  as 

