<a href="https://colab.research.google.com/github/dvircohen0/NLP/blob/main/semantic_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
Word_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

positive_reviews =  BeautifulSoup(open("positive.review").read())
positive_reviews =  positive_reviews.findAll('review_text')

negative_reviews =  BeautifulSoup(open("negative.review").read())
negative_reviews =  negative_reviews.findAll('review_text')

np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

def tokenizer(text):
    text = text.lower()
    tokens = nltk.tokenize.word_tokenize(text)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [Word_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stop_words]
    return tokens
    

word_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    tokens = tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_map:
            word_map[token] = current_index
            current_index +=1
            
for review in negative_reviews:
    tokens = tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_map:
            word_map[token] = current_index
            current_index +=1

def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_map) + 1)
    for t in tokens:
        i = word_map[t]
        x[i] +=1
    x = x/x.sum()
    x[-1] = label
    return x

N = len(negative_tokenized) + len(negative_tokenized)
data = np.zeros((N,len(word_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i +=1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i +=1

X = data[:,:-1]
Y = data[:,-1]

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.05, random_state=42)


LR_model = LogisticRegression()
RF_model =  RandomForestClassifier()
AB_model =  AdaBoostClassifier()
NB_model = MultinomialNB()
SVM_model = svm.LinearSVC()


LR_model.fit(X_train, y_train)
RF_model.fit(X_train, y_train)
AB_model.fit(X_train, y_train)
NB_model.fit(X_train, y_train)
SVM_model.fit(X_train, y_train)


print("Logistic Regression classfifcation score: ", LR_model.score(X_test,y_test))
print("Random Forest classfifcation score: ", RF_model.score(X_test,y_test))
print("AdaBoost classfifcation score: ", AB_model.score(X_test,y_test))
print("MultinomialNB classfifcation score: ", NB_model.score(X_test,y_test))
print("SVM classfifcation score: ", SVM_model.score(X_test,y_test))


Logistic Regression classfifcation score:  0.75
Random Forest classfifcation score:  0.84
AdaBoost classfifcation score:  0.75
MultinomialNB classfifcation score:  0.82
SVM classfifcation score:  0.8


In [7]:
treshold = 0.5
for word, index in word_map.items():
    weight = LR_model.coef_[0][index]
    if weight > treshold or weight < -treshold:
        print(word, weight)

perfect 0.8371926895217009
n't -1.9020767262257412
quality 1.1589759348517612
sound 0.9012381247228022
fast 0.8191768101262733
wa -1.2082294519185264
could -0.5058744251334883
even -0.735701278876155
doe -1.063260700423381
happy 0.5333140262372946
would -0.6735875478115538
recommend 0.6251694842873292
good 1.7891190674046458
well 0.8523617569780264
small 0.5887351701414605
love 0.9121671291662387
time -0.5991885393578545
ha 0.628333378578018
comfortable 0.5284075792835006
use 1.5370817041503742
item -0.9181882400543532
month -0.6692900372643024
lot 0.5569560102313887
paper 0.5607935080727208
price 2.26569697623441
great 3.4999042624686427
little 0.6069733968480449
easy 1.3376612400721475
unit -0.6473419994020221
've 0.5230796404963847
need 0.519577576534241
get -1.0742681758055401
like 0.5772794838896115
two -0.5418838602617079
back -1.4299655143958
speaker 0.7722764757231644
thing -0.8251131547978069
problem 0.5543412472369411
bad -0.5739750080011543
cable 0.5681156600463296
buy -0.92