In [1]:
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np


sys.path.append('../utils/')
sys.path.append('..')
from preprocessing import load_dataframes

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
(df_train, df_val, df_test) = load_dataframes()

In [3]:
print(df_train["user_input"])

2663                          Show me how to use Markdown
668     what are low-level and high-level computer vis...
4074    How does function pointer differs from std::fu...
2107           Make a presentation on sports shoes brands
4992    Hi!  Can you help reserarch whether developing...
                              ...                        
4426                                    Hello who are you
466     Write me  positive review of the movie Cocaine...
3092    could you describe the concept of "the directi...
3772    can you parse this address and place comma whe...
860     what is the difference between 2003 and 2022 e...
Name: user_input, Length: 4065, dtype: object


In [50]:
# Assuming df_train, df_val, df_test contain a column 'text' with the sentences.
all_texts = pd.concat([df_train['user_input'], df_val['user_input'], df_test['user_input']])
sentences = all_texts.tolist()
len(sentences)

10165

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b', ngram_range=(2, 2))
X = vectorizer.fit_transform(sentences)


In [6]:

# Count the frequency of each bigram
bigram_frequency = np.asarray(X.sum(axis=0)).ravel()

# Map each bigram to its frequency
bigram_to_freq = dict(zip(vectorizer.get_feature_names_out(), bigram_frequency))

# Sort the bigram_to_freq dictionary by frequency in descending order and print the top 10 bigrams
#for bigram, freq in sorted(bigram_to_freq.items(), key=lambda item: item[1], reverse=True)[:10]:
#    print(f"Bigram: '{bigram}', Frequency: {freq}")

# Function to predict the next word
def predict_next_word(previous_word):
    candidates = {bigram: freq for bigram, freq in bigram_to_freq.items() if bigram.startswith(previous_word + ' ')}
    
    sorted_candidates = {k: v for k, v in sorted(candidates.items(), key=lambda item: item[1], reverse=True)}

    
    #for bigram, freq in sorted_candidates.items():
        #print(f"Bigram: '{bigram}', Frequency: {freq}")

    if not candidates:
        return "No prediction available"
    return max(candidates, key=candidates.get).split()[1]

# Example
print(predict_next_word("you"))


are


In [7]:
vectorizer_3gram = CountVectorizer(token_pattern=r'\b\w+\b', ngram_range=(3, 3))
X_3gram = vectorizer_3gram.fit_transform(sentences)

In [10]:
# Count the frequency of each bigram
trigram_frequency = np.asarray(X_3gram.sum(axis=0)).ravel()

# Map each bigram to its frequency
trigram_to_freq = dict(zip(vectorizer_3gram.get_feature_names_out(), trigram_frequency))

# Sort the bigram_to_frq dictionary by frequency in descending order and print the top 10 bigrams
#for trigram, freq in sorted(trigram_to_freq.items(), key=lambda item: item[1], reverse=True)[:10]:
    #print(f"Bigram: '{trigram}', Frequency: {freq}")

# Function to predict the next word based on the two previous words
def predict_next_word_trigram(previous_words):
    previous_words = previous_words.lower()
    candidates = {trigram: freq for trigram, freq in trigram_to_freq.items() if trigram.startswith(previous_words + ' ')}

    sorted_candidates = {k: v for k, v in sorted(candidates.items(), key=lambda item: item[1], reverse=True)}

    # Now, print the sorted candidates
    i = 0
    for trigram, freq in sorted_candidates.items():
        print(f"Trgram: '{trigram}', Frequency: {freq}")
        i += 1
        if (i == 10):
            break

    if not candidates:
        return "No prediction available"
    # Extracting the last word of the most frequent trigram following the previous_words
    return max(candidates, key=candidates.get).split()[2]

print(predict_next_word_trigram("upon a"))

Trgram: 'upon a mysterious', Frequency: 2
Trgram: 'upon a time', Frequency: 1
mysterious


In [11]:
def complete_the_phrase(starting_phrase, max_length=10):
    current_phrase = starting_phrase.strip()
    words = current_phrase.split()
    
    # Continue until reaching the maximum length
    for _ in range(max_length - len(words)):
        last_word = words[-1]
        # Find candidates that start with the last word of the current phrase
        candidates = {bigram: freq for bigram, freq in bigram_to_freq.items() if bigram.startswith(last_word + ' ')}
        if not candidates:
            break  # No candidates found, stop the loop
        # Pick the most frequent continuation (the second word in the bigram)
        next_word = sorted(candidates.items(), key=lambda item: item[1], reverse=True)[0][0].split()[1]
        words.append(next_word)
    
    return ' '.join(words)

# Example usage:
print(complete_the_phrase("Do you like playing"))

Do you like playing the following text to the following


In [14]:
def complete_the_phrase_trigram(starting_phrase, max_length=15):
    current_phrase = starting_phrase.strip()
    words = current_phrase.split()
    
    # Ensure there are enough words for trigram prediction
    if len(words) < 2:
        return "Please provide at least two words for the initial phrase."
    
    # Continue until reaching the maximum length
    for _ in range(max_length - len(words)):
        # Use the last two words for the trigram prediction
        last_two_words = ' '.join(words[-2:])
        # Find candidates that start with the last two words of the current phrase
        candidates = {trigram: freq for trigram, freq in trigram_to_freq.items() if trigram.startswith(last_two_words + ' ')}
        if not candidates:
            break  # No candidates found, stop the loop
        # Pick the most frequent continuation (the third word in the trigram)
        next_word = sorted(candidates.items(), key=lambda item: item[1], reverse=True)[0][0].split()[2]
        words.append(next_word)
    
    return ' '.join(words)

# Example usage:
print(complete_the_phrase_trigram("once upon"))


once upon a mysterious beach party while walking along the right person if not apologize


In [16]:
class NGramPredictor:
    def __init__(self, n=2):
        self.n = n
        self.vectorizer = CountVectorizer(token_pattern=r'\b\w+\b', ngram_range=(n, n))
        self.ngram_to_freq = {}

    def fit(self, sentences):
        X_ngram = self.vectorizer.fit_transform(sentences)
        ngram_frequency = np.asarray(X_ngram.sum(axis=0)).ravel()
        self.ngram_to_freq = dict(zip(self.vectorizer.get_feature_names_out(), ngram_frequency))

    def predict_next_word(self, previous_words):
        previous_words = ' '.join(previous_words.split()[-(self.n-1):]).lower()
        candidates = {ngram: freq for ngram, freq in self.ngram_to_freq.items() if ngram.startswith(previous_words + ' ')}
        
        if not candidates:
            return None  # Changed to return None for easier checking
        
        return max(candidates, key=candidates.get).split()[-1]

    def complete_the_phrase(self, starting_phrase, max_length=10):
        current_phrase = starting_phrase.strip()
        words = current_phrase.split()
        
        # Adjust for n-gram model
        for _ in range(max_length - len(words)):
            if len(words) < self.n - 1:
                return "Please provide more words for the initial phrase."
            
            # For n-grams, use the last n-1 words as context
            previous_words = ' '.join(words[-(self.n-1):])
            next_word = self.predict_next_word(previous_words)
            
            if not next_word:
                break  # No candidates found, stop the loop
            
            words.append(next_word)
        
        return ' '.join(words)
    
    def print_top_ngrams(self, top=10):
        for ngram, freq in sorted(self.ngram_to_freq.items(), key=lambda item: item[1], reverse=True)[:top]:
            print(f"N-Gram: '{ngram}', Frequency: {freq}")


In [19]:
# Example usage:
ngram_predictor = NGramPredictor(n=4)  # For trigrams
ngram_predictor.fit(sentences)  # Assuming 'sentences' is a list of sentence strings

In [22]:
ngram_predictor.print_top_ngrams()
print(ngram_predictor.complete_the_phrase("once upon a", max_length=10))

N-Gram: 'i want you to', Frequency: 148
N-Gram: 'can you tell me', Frequency: 70
N-Gram: 'can do anything now', Frequency: 64
N-Gram: 'write a story about', Frequency: 59
N-Gram: 'in the style of', Frequency: 56
N-Gram: 'none of your responses', Frequency: 55
N-Gram: 'of your responses should', Frequency: 55
N-Gram: 'can you help me', Frequency: 54
N-Gram: 'a story about a', Frequency: 53
N-Gram: 'you are going to', Frequency: 49
once upon a time in a long time and she


In [51]:
def evaluate_model(model, test_sentences):
    print(len(test_sentences))
    correct_predictions = 0
    total_predictions = 0
    
    counter = 0
    for sentence in test_sentences:
        counter += 1
        words = sentence.split()
        # Ensure the sentence has more words than n-1 to make a prediction and have a target
        if len(words) >= model.n:
            for i in range(model.n - 1, len(words)):
                context = ' '.join(words[max(0, i - model.n + 1):i])
                actual_next_word = words[i]
                predicted_next_word = model.predict_next_word(context)
                
                if predicted_next_word is not None and predicted_next_word == actual_next_word:
                    correct_predictions += 1
                total_predictions += 1

                if (predicted_next_word is None):
                    print ("None ")
                else:
                    print("Context: " + context + " - Expected: " + words[i] + " Predicted: " + predicted_next_word + " " + str(counter))
        if (counter >= 100):
            break
    
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

# Assuming you have an instance of NGramPredictor called 'ngram_predictor' and it's already fit
# And assuming 'sentences' is your list of test sentences:
test_sentences = df_test['user_input'].to_list()
#print(sentences)

# Example usage:
tested_predictor = NGramPredictor(n=4)  # For trigrams
tested_predictor.fit(sentences)  # Assuming 'sentences' is a list of sentence strings
accuracy = evaluate_model(tested_predictor, test_sentences)
print(f"Accuracy: {accuracy:.4f}")

5083
Context: what topics should - Expected: i Predicted: i 1
Context: topics should i - Expected: start Predicted: start 1
Context: should i start - Expected: creating Predicted: with 1
Context: i start creating - Expected: on Predicted: on 1
Context: start creating on - Expected: youtube Predicted: youtube 1
Context: creating on youtube - Expected: to Predicted: to 1
Context: on youtube to - Expected: grow Predicted: grow 1
Context: youtube to grow - Expected: a Predicted: a 1
Context: to grow a - Expected: following Predicted: following 1
Context: grow a following - Expected: and Predicted: and 1
Context: a following and - Expected: does Predicted: does 1
Context: following and does - Expected: not Predicted: not 1
Context: and does not - Expected: take Predicted: have 1
Context: does not take - Expected: much Predicted: much 1
Context: not take much - Expected: effort Predicted: effort 1
Context: Create 20 paraphrases - Expected: of Predicted: of 2
Context: 20 paraphrases of - Expe