In [37]:
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np


sys.path.append('../utils/')
sys.path.append('..')
from preprocessing import load_dataframes

In [38]:
# Load the three dataset splits returned by the project-local helper.
df_train, df_val, df_test = load_dataframes()

In [39]:
# Quick look at the raw prompts in the training split.
print(df_train.loc[:, "user_input"])

2663                          Show me how to use Markdown
668     what are low-level and high-level computer vis...
4074    How does function pointer differs from std::fu...
2107           Make a presentation on sports shoes brands
4992    Hi!  Can you help reserarch whether developing...
                              ...                        
4426                                    Hello who are you
466     Write me  positive review of the movie Cocaine...
3092    could you describe the concept of "the directi...
3772    can you parse this address and place comma whe...
860     what is the difference between 2003 and 2022 e...
Name: user_input, Length: 4065, dtype: object


In [40]:
# Pool the 'user_input' column of the train, validation and test splits
# into one flat Python list of sentences.
# NOTE(review): the n-gram counts built from `sentences` therefore include
# validation/test text — confirm this is intended before evaluating on those splits.
all_texts = pd.concat(
    [split['user_input'] for split in (df_train, df_val, df_test)]
)
sentences = all_texts.to_list()
len(sentences)

10165

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram counter. The custom token pattern keeps every run of word characters
# as a token, including single-character ones such as "a" or "i".
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    token_pattern=r'\b\w+\b',
)
X = vectorizer.fit_transform(sentences)


In [42]:

# Total occurrences of each bigram over all sentences (column sums of the count matrix).
bigram_frequency = np.asarray(X.sum(axis=0)).reshape(-1)

# Lookup table: bigram text -> total count.
bigram_to_freq = {
    bigram: count
    for bigram, count in zip(vectorizer.get_feature_names_out(), bigram_frequency)
}

# To inspect the most frequent bigrams:
#   sorted(bigram_to_freq.items(), key=lambda item: item[1], reverse=True)[:10]

# Function to predict the next word
def predict_next_word(previous_word, freq_table=None):
    """Predict the word that most frequently follows ``previous_word``.

    Parameters
    ----------
    previous_word : str
        The single context word. It is lowercased and surrounding whitespace
        is ignored, so "You" and " you " behave like "you". The bigram
        vocabulary is lowercase because CountVectorizer lowercases by default.
    freq_table : dict[str, int], optional
        Mapping from space-separated bigram to its count. Defaults to the
        notebook-level ``bigram_to_freq``.

    Returns
    -------
    str
        The second word of the most frequent bigram starting with
        ``previous_word``. On ties the first matching bigram in the table's
        iteration order wins. Returns the sentinel string
        "No prediction available" when the context is not exactly one word
        or no bigram starts with it.
    """
    table = bigram_to_freq if freq_table is None else freq_table

    # Normalise the context the same way the vocabulary was built (lowercase,
    # single spaces); a bigram model can only condition on exactly one word.
    words = previous_word.lower().split()
    if len(words) != 1:
        return "No prediction available"

    # The trailing space prevents "you" from matching "your ...".
    prefix = words[0] + ' '
    candidates = {bigram: freq for bigram, freq in table.items() if bigram.startswith(prefix)}

    if not candidates:
        return "No prediction available"
    return max(candidates, key=candidates.get).split()[1]

# Example: the most frequent word following "you".
print(predict_next_word(previous_word="you"))


Bigram: 'you are', Frequency: 824
Bigram: 'you will', Frequency: 449
Bigram: 'you can', Frequency: 274
Bigram: 'you to', Frequency: 263
Bigram: 'you know', Frequency: 257
Bigram: 'you have', Frequency: 187
Bigram: 'you write', Frequency: 134
Bigram: 'you do', Frequency: 121
Bigram: 'you should', Frequency: 110
Bigram: 'you tell', Frequency: 89
Bigram: 'you understand', Frequency: 89
Bigram: 'you re', Frequency: 84
Bigram: 'you give', Frequency: 75
Bigram: 'you help', Frequency: 75
Bigram: 'you think', Frequency: 73
Bigram: 'you a', Frequency: 70
Bigram: 'you must', Frequency: 59
Bigram: 'you explain', Frequency: 55
Bigram: 'you speak', Frequency: 50
Bigram: 'you please', Frequency: 49
Bigram: 'you want', Frequency: 47
Bigram: 'you need', Frequency: 42
Bigram: 'you don', Frequency: 38
Bigram: 'you and', Frequency: 37
Bigram: 'you were', Frequency: 37
Bigram: 'you today', Frequency: 34
Bigram: 'you doing', Frequency: 33
Bigram: 'you make', Frequency: 33
Bigram: 'you provide', Frequency: 

In [57]:
# Trigram counter; same tokenisation as the bigram vectorizer (every run of
# word characters is a token), but extracting 3-grams only.
vectorizer_3gram = CountVectorizer(
    ngram_range=(3, 3),
    token_pattern=r'\b\w+\b',
)
X_3gram = vectorizer_3gram.fit_transform(sentences)

In [72]:
# Total occurrences of each trigram over all sentences (column sums of the count matrix).
trigram_frequency = np.asarray(X_3gram.sum(axis=0)).reshape(-1)

# Lookup table: trigram text -> total count.
trigram_to_freq = {
    trigram: count
    for trigram, count in zip(vectorizer_3gram.get_feature_names_out(), trigram_frequency)
}

# To inspect the most frequent trigrams:
#   sorted(trigram_to_freq.items(), key=lambda item: item[1], reverse=True)[:10]

# Function to predict the next word based on the two previous words
def predict_next_word_trigram(previous_words, freq_table=None):
    """Predict the word that most frequently follows a two-word context.

    Prints the (up to) ten most frequent matching trigrams, then returns the
    prediction.

    Parameters
    ----------
    previous_words : str
        The two context words separated by whitespace. The text is lowercased
        and runs of whitespace are collapsed, so "I  Hate " behaves like
        "i hate".
    freq_table : dict[str, int], optional
        Mapping from space-separated trigram to its count. Defaults to the
        notebook-level ``trigram_to_freq``.

    Returns
    -------
    str
        The third word of the most frequent trigram starting with the
        context. On ties the first matching trigram in the table's iteration
        order wins. Returns the sentinel string "No prediction available"
        when the context is not exactly two words or no trigram starts with it.
    """
    table = trigram_to_freq if freq_table is None else freq_table

    # A trigram model conditions on exactly two words. Without this guard a
    # one-word context such as "i" would match "i hate you" and return "you",
    # which is the word after next rather than the next word.
    words = previous_words.lower().split()
    if len(words) != 2:
        return "No prediction available"

    # The trailing space prevents "i hate" from matching "i hated ...".
    prefix = ' '.join(words) + ' '
    matches = [(trigram, freq) for trigram, freq in table.items() if trigram.startswith(prefix)]

    # sorted() is stable, so among equal counts the table's own order is kept
    # and ranked[0] is the same entry max() would pick.
    ranked = sorted(matches, key=lambda item: item[1], reverse=True)

    for trigram, freq in ranked[:10]:
        print(f"Trigram: '{trigram}', Frequency: {freq}")

    if not ranked:
        return "No prediction available"
    # The prediction is the last word of the top-ranked trigram.
    return ranked[0][0].split()[2]

# Example: the most frequent continuation of the two-word context "i hate".
print(predict_next_word_trigram(previous_words="i hate"))

Bigram: 'i hate dealing', Frequency: 1
Bigram: 'i hate it', Frequency: 1
Bigram: 'i hate jews', Frequency: 1
Bigram: 'i hate this', Frequency: 1
Bigram: 'i hate you', Frequency: 1
dealing
