# Veri Kurulumu ve Kütüphaneler

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup # html etiketlerini kaldırmak için

In [3]:
df = pd.read_csv('IMDB Dataset.csv')

In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
def clearingandconverting(text):   
    
    text=text.lower()                    # Buradan sonraki 4 satırd ise NLP methodlarını uygulayabilmek adına
                                         # bütün veriyi küçük harflere çevirdik ve içlerinden numerik 
    soup = BeautifulSoup(text)           # verileri ve de sembolleri attık
    
    text=text.replace("[^\w\s]","") 
    text=text.replace("\d+","") 
    text=text.replace("\n"," ").replace("\r","") 
    
    text=text.replace('""',"") 
    
    text = soup.get_text()
    
    return text

In [7]:
df['review'] = df['review'].apply(clearingandconverting)

In [8]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Modelleme

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english') 

def split_into_lemmas(text):    # Stemma analiz methodunu tanımladık
    
    text = str(text).lower()   
    
    words = TextBlob(text).words
    
    return [stemmer.stem(word) for word in words]

## Verileri Ayırma ve Vektörize Etme İşlemleri

In [10]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [54]:
df['review'].apply(split_into_lemmas)

0        [one, of, the, other, review, has, mention, th...
1        [a, wonder, littl, product, the, film, techniq...
2        [i, thought, this, was, a, wonder, way, to, sp...
3        [basic, there, 's, a, famili, where, a, littl,...
4        [petter, mattei, 's, love, in, the, time, of, ...
                               ...                        
49995    [i, thought, this, movi, did, a, down, right, ...
49996    [bad, plot, bad, dialogu, bad, act, idiot, dir...
49997    [i, am, a, cathol, taught, in, parochi, elemen...
49998    [i, 'm, go, to, have, to, disagre, with, the, ...
49999    [no, one, expect, the, star, trek, movi, to, b...
Name: review, Length: 50000, dtype: object

In [11]:
x,y=df['review'],df['sentiment']

In [12]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=80)

In [13]:
vect=CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1,2), analyzer=split_into_lemmas)

x_train_dtm=vect.fit_transform(x_train,y_train)
x_test_dtm=vect.transform(x_test)

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [15]:
b=MultinomialNB()
model=b.fit(x_train_dtm,y_train)
b_predict=b.predict(x_test_dtm)

In [16]:
accuracy_score(y_test,b_predict)

0.8376

# Modeli Deneme ve Kullanma

In [17]:
def vectorizing(text):
    
    return vect.transform([text])

In [18]:
model.predict(vectorizing('Hello, world I bad thing'));