<a href="https://colab.research.google.com/github/fsndzomga/Deep-Learning-With-Python/blob/main/model_predicting_the_next_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:

import string
import numpy as np
import re
import random
from sklearn.linear_model import SGDRegressor
import pickle
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import json
#import tensorflow as tf

def preprocessing_text(text_file_path):
    """Read a text file and normalise it to lowercase words separated by single spaces.

    The text is lowercased, digits and ASCII punctuation
    (``string.punctuation``) are removed, and every run of whitespace
    (spaces, tabs, line breaks) is collapsed into one space. Leading and
    trailing whitespace is dropped so that splitting the result on " " does
    not yield empty tokens at either end.

    Parameters
    ----------
    text_file_path : str
        Path of the text file to read (opened in text mode with the
        platform default encoding).

    Returns
    -------
    str
        The cleaned text; an empty string if the file holds no words.
    """
    # The context manager guarantees the handle is closed after reading.
    with open(text_file_path, "r") as source:
        text = source.read().lower()

    # Remove numbers using isdigit()
    text = ''.join(c for c in text if not c.isdigit())

    # Remove punctuation using str.translate() and string.punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Collapse whitespace as the LAST step: removing digits/punctuation and
    # turning line breaks into spaces can each create new runs of spaces, so
    # collapsing earlier would leave double spaces behind.
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def encoding(text, encoding_path="encoding.json"):
    """Translate a space-separated text into a list of word ids.

    The word -> id table is read from a JSON file and every token obtained
    by splitting ``text`` on a single space is looked up in it.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.
    encoding_path : str, optional
        JSON file holding the word -> id mapping. Defaults to
        "encoding.json" in the current working directory.

    Returns
    -------
    list
        One id per token, in the order the tokens appear in ``text``. Only
        this list is returned; the mapping itself is not.

    Raises
    ------
    KeyError
        If a token of ``text`` is missing from the mapping.
    """
    # Parse straight from the file object instead of read() + loads().
    with open(encoding_path, "r") as file:
        text_encoding = json.load(file)

    # use the text encoding to map the original corpus
    encoded_text = [text_encoding[word] for word in text.split(" ")]

    return encoded_text


def add_zeros_to_array(array, m):
    """Left-pad the list ``array`` with zeros so that it holds ``m`` items.

    When ``array`` already has ``m`` or more items no zeros are added and
    the result has the same contents as ``array``.
    """
    missing = m - len(array)
    padding = [0] * missing  # a non-positive count yields an empty list
    return padding + array

def features_targets(encoded_text, num_words):
    """Build (zero-padded prefix, next id) training pairs from an encoded corpus.

    A window of up to ``num_words + 1`` ids is taken at each start position.
    For every cut point inside the window, the ids before the cut (left-padded
    with zeros to ``num_words`` entries) form one feature row and the id at
    the cut is its target.

    Returns a two-item list ``[features, targets]``.
    """
    features, targets = [], []

    window_count = len(encoded_text) - num_words + 1
    for start in range(window_count):
        window = encoded_text[start:start + num_words + 1]
        # `cut` is both the length of the prefix and the index of its target.
        for cut, next_id in enumerate(window[1:], start=1):
            prefix = window[:cut]
            features.append([0] * (num_words - len(prefix)) + prefix)
            targets.append(next_id)

    return [features, targets]


def training_scikit_learn_model(features, targets):
    """Fit an SGDRegressor on the samples, report test metrics and pickle it.

    The data is split 80/20 into train and test sets. The mean squared
    error, mean absolute error and R2 score on the test set are printed, and
    the fitted model is written to 'model.pkl'.

    Parameters
    ----------
    features : list of list
        One feature row per sample.
    targets : list
        One target value per sample.

    Returns
    -------
    SGDRegressor
        The fitted model (the same object that is pickled), so callers can
        use it directly instead of reloading 'model.pkl'.
    """
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    # Train the model. The estimator is seeded like the split above so that
    # repeated runs report the same metrics.
    model = SGDRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the accuracy of the model using mean squared error
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    # Evaluate the accuracy of the model using mean absolute error
    mae = mean_absolute_error(y_test, y_pred)
    print("Mean Absolute Error:", mae)

    # Evaluate the accuracy of the model using R2 score
    r2 = r2_score(y_test, y_pred)
    print("R2 Score:", r2)

    # Save the trained model to a file
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)

    return model

def decoding(text_encoding, encoded_text):
    """Return the word whose id equals ``encoded_text``.

    Parameters
    ----------
    text_encoding : dict
        Mapping of word -> id.
    encoded_text : int
        The id to look up.

    Returns
    -------
    str
        The first word (in dictionary order) stored with that id, or the
        string "0" when no word has it — including when the mapping is empty.
    """
    for word, code in text_encoding.items():
        if encoded_text == code:
            return word
    # Reached only when nothing matched; also covers an empty mapping, which
    # previously left the result variable unassigned.
    return "0"

def run(file_path, num_words):
    """Train on a corpus, then predict the word following a typed prompt.

    Steps: preprocess the corpus, encode it, build the training samples,
    train and pickle the model, read a prompt from the user, encode it the
    same way, and print the decoded prediction.

    Parameters
    ----------
    file_path : str
        Path of the corpus text file.
    num_words : int
        Number of ids in each feature row.

    Notes
    -----
    Requires "encoding.json" (the word -> id table) to exist already. Writes
    "user_text.txt" and, through the training step, "model.pkl".
    """
    # preprocessing
    text = preprocessing_text(file_path)

    # encoding() returns only the list of ids, so the word -> id table is
    # loaded separately from the same JSON file that encoding() reads.
    encoded_text = encoding(text)
    with open("encoding.json", "r") as f:
        text_encoding = json.load(f)

    # generating features and targets
    features, targets = features_targets(encoded_text, num_words)

    # training the scikit learn model
    training_scikit_learn_model(features, targets)

    #using the model
    user_text = input("Enter your text:")

    # Write what the user actually typed so it goes through the same
    # preprocessing as the corpus.
    with open("user_text.txt", "w") as f:
        f.write(user_text)

    user_text = preprocessing_text("user_text.txt")

    # Words missing from the vocabulary are mapped to 0, the padding value,
    # rather than aborting with a KeyError.
    encoded_user_text = [text_encoding.get(word, 0) for word in user_text.split()]

    # Keep only the most recent num_words ids so the row always has exactly
    # num_words entries, however long the prompt is.
    encoded_user_text = encoded_user_text[max(0, len(encoded_user_text) - num_words):]
    user_features = add_zeros_to_array(encoded_user_text, num_words)

    # Load the saved model (a pickle this function has just written; do not
    # point this at untrusted files).
    with open('model.pkl', 'rb') as f:
        loaded_model = pickle.load(f)

    # predict() expects a 2-D input: one row per sample.
    predictions = loaded_model.predict([user_features])

    # A single row was passed, so take the single prediction explicitly.
    print(decoding(text_encoding, int(predictions[0])))

#run("bible.txt", 20)


In [8]:
import json

def fixed_encoding(text):
    """Assign every distinct word of ``text`` its own random id.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.

    Returns
    -------
    dict
        Mapping of word -> id. The ids are a random permutation of
        1..N (N = number of distinct tokens), so no two words share an id.
    """
    # Sorting makes the word order independent of set iteration order, so a
    # seeded `random` reproduces the same table on every run.
    unique_words = sorted(set(text.split(" ")))

    # Get the number of distinct words
    num_elements = len(unique_words)

    # Draw the ids without replacement: independent randint() draws could
    # give several words the same id, which makes decoding ambiguous.
    codes = random.sample(range(1, num_elements + 1), num_elements)

    text_encoding = dict(zip(unique_words, codes))

    return text_encoding

# preprocessing
text = preprocessing_text("bible.txt")

# Build the word -> id table. It is stored as `text_encoding` rather than
# `encoding` so that the encoding() function defined earlier in this notebook
# is not overwritten by a dict.
text_encoding = fixed_encoding(text)

json_encoding = json.dumps(text_encoding, indent=4)

# Persist the table so that it can be reloaded from "encoding.json".
with open("encoding.json", "w") as file:
    file.write(json_encoding)

In [2]:
# preprocessing
text = preprocessing_text("bible.txt")
num_words = 5

#encoding
# encoding() returns only the list of word ids, so it is called once and its
# whole result is the encoded corpus.
encoded_text = encoding(text)

# The word -> id table is read back from the JSON file that encoding() uses.
with open("encoding.json", "r") as file:
    text_encoding = json.load(file)

# generating features and targets
features, targets = features_targets(encoded_text, num_words)

# training the scikit learn model
#training_scikit_learn_model(features, targets)

In [None]:
# Preview the first few feature rows only; displaying the whole list can
# flood the notebook output.
features[:5]

In [6]:
#using the model
user_text = input("Enter your text:")

# Write the text to a file so it goes through the same preprocessing as the corpus
with open("user_text.txt", "w") as f:
  f.write(user_text)

user_text = preprocessing_text("user_text.txt")

# Words missing from the vocabulary are mapped to 0, the padding value,
# rather than aborting with a KeyError.
encoded_user_text = [text_encoding.get(word, 0) for word in user_text.split()]

# Keep only the most recent num_words ids so the feature row always has
# exactly num_words entries, however long the prompt is.
encoded_user_text = encoded_user_text[max(0, len(encoded_user_text) - num_words):]

user_features= add_zeros_to_array(encoded_user_text, num_words)

# Load the saved model (only unpickle files you created yourself)
with open('model.pkl', 'rb') as f:
  loaded_model = pickle.load(f)

# Use the loaded model to make predictions (one row -> one prediction)
predictions = loaded_model.predict(np.array(user_features).reshape(1, -1))

# Take the single prediction explicitly: int() on a whole array is deprecated
# in recent NumPy releases.
print(decoding(text_encoding, int(predictions[0])))

Enter your text:Jesus is the
0
