In [145]:
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.probability import FreqDist
from nltk.classify import accuracy,NaiveBayesClassifier 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
import pandas as pd
import pickle
import random
import string#punctuation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
#13

In [131]:
def load_data():
    """Read the movie review dataset from ``movie-review.csv``.

    The path is relative to the current working directory.

    Returns:
        pandas.DataFrame: the parsed CSV contents.
    """
    return pd.read_csv("movie-review.csv")

In [132]:
def get_pos(tag):
    """Map a POS tag to the single-letter code used by the lemmatizer.

    Tags starting with 'J', 'R', 'N' or 'V' map to 'a', 'r', 'n' and 'v'
    respectively; every other tag falls back to 'n'.

    Args:
        tag: POS tag string (e.g. 'JJ', 'VBD').

    Returns:
        str: one of 'a', 'r', 'n', 'v'.
    """
    prefix_to_pos = (('J', 'a'), ('R', 'r'), ('N', 'n'), ('V', 'v'))
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Anything unrecognised is treated as a noun.
    return 'n'

In [133]:
def preprocessing(words):
    """Clean a list of tokens: filter, stem, POS-tag and lemmatize.

    Steps, in order:
      1. keep only purely alphabetic tokens that are not English stop words
         (stop-word check is case-insensitive),
      2. stem each token with PorterStemmer,
      3. POS-tag the stemmed tokens,
      4. lemmatize each token using the POS derived from its tag.

    Args:
        words: iterable of string tokens.

    Returns:
        list[str]: the processed tokens.
    """
    # lemmatize() is an instance method, so the class must be instantiated.
    wnl = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Load the stop-word list once; rebuilding it per token is very slow.
    stop_words = set(stopwords.words('english'))

    # isalpha() already rejects punctuation and empty strings, so no separate
    # punctuation filter is needed. Stop words are stored lower-case, hence
    # the lower() so that e.g. "The" is removed as well.
    words = [
        word for word in words
        if word.isalpha() and word.lower() not in stop_words
    ]
    words = [stemmer.stem(word) for word in words]

    # pos_tag yields (word, tag) pairs -- in that order.
    tagging = pos_tag(words)
    words = [wnl.lemmatize(word, get_pos(tag)) for word, tag in tagging]

    return words

In [153]:
def train_model(sample_size=3000, feature_count=300, train_ratio=0.8):
    """Train a Naive Bayes sentiment classifier and save it to ``model.pickle``.

    Each review is tokenised and preprocessed, the most frequent processed
    words become boolean "word present" features, and the labelled feature
    sets are shuffled and split into a training and a test part. The test
    accuracy is printed.

    Args:
        sample_size: maximum number of rows sampled from the dataset.
        feature_count: number of most frequent words used as features.
        train_ratio: fraction of the feature sets used for training.

    Returns:
        The trained ``NaiveBayesClassifier``.
    """
    # 1. Load. DataFrame.sample raises ValueError when asked for more rows
    #    than exist, so cap the request at the dataset size.
    df = load_data()
    df = df.sample(min(sample_size, len(df)))

    # 2. Feature selection
    reviews = [str(review) for review in df['review'].to_list()]
    sentiments = [str(sentiment) for sentiment in df['sentimentScore'].to_list()]

    # 3-4. Tokenise and preprocess every review exactly once; the result is
    #      reused for both the vocabulary and the per-review features.
    processed_reviews = [preprocessing(word_tokenize(review)) for review in reviews]

    # 5. Frequency distribution over all processed words. It must be built
    #    from the words; an empty FreqDist yields no features at all.
    fd = FreqDist(word for words in processed_reviews for word in words)
    word_features = [word for word, _ in fd.most_common(feature_count)]

    # 6. Pair each review's word-presence features with its sentiment label.
    features_set = []
    for words, sentiment in zip(processed_reviews, sentiments):
        present = set(words)  # set membership instead of scanning a list
        features = {word: (word in present) for word in word_features}
        features_set.append((features, sentiment))

    # 7. Splitting
    random.shuffle(features_set)
    train_count = int(len(features_set) * train_ratio)
    train_set = features_set[:train_count]
    test_set = features_set[train_count:]

    # 8. Train
    classifier = NaiveBayesClassifier.train(train_set)
    print(f'Accuracy score:{accuracy(classifier,test_set)}')

    # 9. Write file; the context manager closes it even if dumping fails.
    with open("model.pickle", "wb") as file:
        pickle.dump(classifier, file)

    # 10.
    return classifier




In [135]:
# Module-level session state shared by the menu functions:
# the user's latest review text, its predicted category, and the classifier.
review, category, classifier = "", "", None

In [146]:

# Reuse a previously saved model when possible; otherwise train a new one.
# NOTE(review): pickle.load can execute arbitrary code -- only keep a
# model.pickle that this notebook itself produced.
try:
    # The context manager closes the file even if unpickling fails.
    with open("model.pickle", "rb") as file:
        classifier = pickle.load(file)
except Exception as error:
    # Missing or unreadable model file: fall back to training. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print(f'Could not load model.pickle ({error!r}); training a new model')
    classifier = train_model()

In [147]:
def menu1():
    """Ask for a review of at least five words and classify its sentiment.

    Stores the review text in the global ``review`` and the predicted label
    in the global ``category``.
    """
    # 1.
    global review
    global category
    global classifier

    # 2. Keep asking until at least five words are entered. split() without
    #    an argument ignores repeated/leading/trailing whitespace, so blanks
    #    are not counted as words.
    while True:
        inputrev = input("Input your review(at least contain 5 words):")
        if len(inputrev.split()) >= 5:
            break
        print(f'Input must have at least 5 words')

    # 3. Build the same kind of feature set used for training: a boolean
    #    "word present" flag per feature word, computed on the preprocessed
    #    tokens. (FreqDist over the raw string would count characters.)
    review_words = set(preprocessing(word_tokenize(inputrev)))
    # most_informative_features returns (feature_name, value) pairs; a large
    # n is passed so that every feature name the model knows is returned.
    known_words = {
        name for name, _ in classifier.most_informative_features(n=10**6)
    }
    features = {word: (word in review_words) for word in known_words}
    predicted = classifier.classify(features)

    # Update the shared state only once classification has succeeded.
    review = inputrev
    category = predicted

In [148]:
def menu2(top_n=2):
    """Print the titles whose reviews are most similar to the user's review.

    Similarity is the cosine similarity between TF-IDF vectors of the stored
    reviews and of the global ``review``.

    Args:
        top_n: maximum number of distinct titles to print.
    """
    global review
    # 1. Validation
    if review == "":
        print(f"Please input your review first")
        return

    # 2. Load
    df = load_data()
    # 3. Feature selection
    reviews = [str(reviewd) for reviewd in df['review'].to_list()]
    titles = [str(title) for title in df['title'].to_list()]
    # 4. TF-IDF
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(reviews)
    query_matrix = vectorizer.transform([review])
    # 5. Cosine similarity
    cosine_similarities = cosine_similarity(query_matrix, matrix).flatten()

    # 6. Walk the indices from most to least similar, collecting distinct
    #    titles. A similarity of zero means no shared vocabulary, so such
    #    rows are not meaningful recommendations.
    recommended = []
    for idx in cosine_similarities.argsort()[::-1]:
        if cosine_similarities[idx] <= 0:
            break
        if titles[idx] not in recommended:
            recommended.append(titles[idx])
        if len(recommended) >= top_n:
            break

    # 7. Display
    if not recommended:
        print(f"No similar movies found for your review")
        return
    for i, title in enumerate(recommended):
        print(f'{i+1}: {title}')
        

In [149]:
def menu3():
    """Print LANGUAGE and LOC named entities found in a sample of reviews.

    Runs the spaCy ``en_core_web_sm`` pipeline over up to 3000 sampled
    reviews and prints the collected entity texts grouped by label, or
    "Entities not found" when there are none.
    """
    # 1. Load data; cap the sample size so small datasets do not raise.
    df = load_data()
    df = df.sample(min(3000, len(df)))

    # 2. Feature selection: keep the reviews as separate strings.
    #    Series.to_string() would mix index numbers and padding into the text
    #    and produce one huge document.
    reviews = [str(review) for review in df['review'].to_list()]

    # 3. Load model
    spacy_nlp = spacy.load("en_core_web_sm")

    # 4. Collect entity texts per label, keeping only LANGUAGE and LOC.
    categories = {}
    for doc in spacy_nlp.pipe(reviews):
        for ent in doc.ents:
            label = ent.label_
            if label not in ('LANGUAGE', 'LOC'):
                continue
            categories.setdefault(label, []).append(ent.text)

    # 5. Display. An empty dict is falsy; comparing it with 0 is never true.
    if not categories:
        print(f"Entities not found")
        return
    for label, entities in categories.items():
        print(f'Label:{label},Entities:{entities}')
    

In [151]:
def main():
    """Run the interactive text menu until the user chooses Exit.

    Each iteration prints the current review and its category (or
    "NO REVIEW" when none was entered yet), then dispatches the chosen
    option to ``menu1``, ``menu2`` or ``menu3``. Option 4 leaves the loop.
    """
    global review
    global category
    while True:
        print(f'MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS')
        if review == "":
            print(f'YOUR REVIEW : NO REVIEW')
            print(f'YOUR REVIEW CATEGORY : NO REVIEW')
        else:
            print(f'YOUR REVIEW : {review}')
            print(f'YOUR REVIEW CATEGORY : {category}')
        print(f'1.Write Your Review')
        print(f'2.View Movie Recommendation')
        print(f'3.View Named Entity Recognition')
        print(f'4.Exit')

        # Non-numeric input is treated like an out-of-range choice instead
        # of crashing the menu with ValueError.
        try:
            opt = int(input("Input your choice:"))
        except ValueError:
            print(f'Please reinput in between 1-4')
            continue

        if opt == 1:
            menu1()
        elif opt == 2:
            menu2()
        elif opt == 3:
            menu3()
        elif opt == 4:
            break
        else:
            print(f'Please reinput in between 1-4')
        

In [152]:
# Start the interactive menu; this call blocks until option 4 (Exit) is chosen.
main()


MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS
YOUR REVIEW : DEW SDF DFG FFD DF
YOUR REVIEW CATEGORY : POSITIVE
1.Write Your Review
2.View Movie Recommendation
3.View Named Entity Recognition
4.Exit
