In [1]:
#%pip install pandas
#%pip install nltk
import os
import kagglehub
import pandas as pd
from nltk.probability import FreqDist
from nltk import ngrams

path = kagglehub.dataset_download("mustfkeskin/turkish-movie-sentiment-analysis-dataset")
dataset_name = os.listdir(path)[0]
print("Path to dataset files:", path)
print("Dataset name:", dataset_name)

Path to dataset files: /home/cilem/.cache/kagglehub/datasets/mustfkeskin/turkish-movie-sentiment-analysis-dataset/versions/1
Dataset name: turkish_movie_sentiment_dataset.csv


In [2]:
df = pd.read_csv(os.path.join(path, dataset_name))
print(df["comment"][4])
df.head()


                      Jean Reno..
bu adam kusursuz biri..
ve oyunculugu müthiş..
film ise başyapıt..
10/10
        
            


Unnamed: 0,comment,film_name,point
0,\n Jean Reno denince zate...,Sevginin Gücü,50
1,\n Ekşın falan izlemek is...,Sevginin Gücü,50
2,\n Bu yapım hakkında öyle...,Sevginin Gücü,50
3,\n finali yeter... (sting...,Sevginin Gücü,50
4,\n Jean Reno..\r\nbu adam...,Sevginin Gücü,50


In [3]:
text_column = "comment" 

print("GİRDİ:", df[text_column][4])

df[text_column] = df[text_column].str.lower()

print("ÇIKTI:", df[text_column][4])

GİRDİ: 
                      Jean Reno..
bu adam kusursuz biri..
ve oyunculugu müthiş..
film ise başyapıt..
10/10
        
            
ÇIKTI: 
                      jean reno..
bu adam kusursuz biri..
ve oyunculugu müthiş..
film ise başyapıt..
10/10
        
            


In [4]:
unigrams = []
for sent in df[text_column]:
    word = sent.split(" ")
    unigrams.extend(word)

unigram_freq = FreqDist(unigrams)

unigram_freq.most_common(10)

[('', 3313153),
 ('\n', 165451),
 ('bir', 136714),
 ('ve', 72910),
 ('bu', 56740),
 ('film', 49167),
 ('çok', 48471),
 ('de', 28795),
 ('ama', 27887),
 ('filmi', 25606)]

In [5]:
bigrams = list(ngrams(unigrams, 2))

bigram_freqs = FreqDist(bigrams)
bigram_freqs.most_common(10)

[(('', ''), 3049118),
 (('\n', ''), 164982),
 (('', '\n'), 164768),
 (('bir', 'film'), 16707),
 (('güzel', 'bir'), 5310),
 (('bu', 'kadar'), 5156),
 (('bu', 'film'), 4993),
 (('bir', 'film.'), 4860),
 (('en', 'iyi'), 4838),
 (('bu', 'filmi'), 4752)]

In [6]:
trigrams = list(ngrams(unigrams, 3))

trigram_freqs = FreqDist(trigrams)
trigram_freqs.most_common(10)

[(('', '', ''), 2801130),
 (('\n', '', ''), 164935),
 (('', '\n', ''), 164592),
 (('', '', '\n'), 164521),
 (('', '', 'film'), 3828),
 (('', '', 'bu'), 3701),
 (('', '', 'çok'), 3417),
 (('', '', 'bence'), 2058),
 (('film.\n', '', ''), 1930),
 (('', '', 'filmi'), 1908)]

In [7]:
import random

def unigram_predict(last_word, unigram_freq):

    most_common = unigram_freq.most_common()
    predictions = [word for word, _ in most_common] 
    return random.choice(predictions)


def bigram_predict(last_word, bigram_freqs):
    possible_bigrams = [bigram for bigram in bigram_freqs if bigram[0] == last_word]
    if possible_bigrams:
        next_words = [bigram[1] for bigram in possible_bigrams]
        return random.choice(next_words)
    else:
        return None 
    

def trigram_predict(last_two_words, trigram_freqs):
    
    possible_trigrams = [trigram for trigram in trigram_freqs if trigram[0] == last_two_words[0] and trigram[1] == last_two_words[1]]
    if possible_trigrams:
        next_words = [trigram[2] for trigram in possible_trigrams]
        return random.choice(next_words)
    else:
        return None


In [8]:
input_word = "güzel" 

unigram_prediction = unigram_predict(input_word, unigram_freq)
print(f"Unigram Tahmini: {unigram_prediction}")

bigram_prediction = bigram_predict(input_word, bigram_freqs)
print(f"Bigram Tahmini: {bigram_prediction}")

input_words = ["film", "güzel"]  
trigram_prediction = trigram_predict(input_words, trigram_freqs)
print(f"Trigram Tahmini: {trigram_prediction}")


Unigram Tahmini: arananlar
Bigram Tahmini: olduğu
Trigram Tahmini: oluyor.
