In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from bs4 import BeautifulSoup
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [2]:
# Veri setlerimizi yüklüyoruz..
df = pd.read_csv('NLPlabeledData.tsv',  delimiter="\t", quoting=3)

In [3]:
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# stopwords'ü temizlemek için nltk kütüphanesinden stopwords kelime setini bilgisayarımıza indirmemiz gerekiyor. 
# Bu işlemi nltk ile yapıyoruz
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dilarabuker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
#veri temizleme işlemler
örnek_review= df.review[0]
örnek_review

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [6]:
#HTML taglerini temizleyeceğiz
örnek_review = BeautifulSoup(örnek_review).get_text()

In [7]:
örnek_review

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 2

In [9]:
# noktalama işaretleri ve sayılardan temizliyoruz - regex kullanarak..
örnek_review = re.sub("[^a-zA-Z]",' ',örnek_review)
örnek_review

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

In [10]:
# küçük harfe dönüştürüyoruz, makine öğrenim algoritmalarımızın büyük harfle başlayan kelimeleri farklı kelime olarak
# algılamaması için yapıyoruz bunu:
örnek_review = örnek_review.lower()
örnek_review

' with all this stuff going down at the moment with mj i ve started listening to his music  watching the odd documentary here and there  watched the wiz and watched moonwalker again  maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring  some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him the actual feature film bit when it finally starts is only on for    m

In [11]:
# stopwords (yani the, is, are gibi kelimeler yapay zeka tarafından kullanılmamasını istiyoruz. Bunlar gramer kelimeri..)
# önce split ile kelimeleri bölüyoruz ve listeye dönüştürüyoruz. amacımız stopwords kelimelerini çıkarmak..
örnek_review = örnek_review.split()

In [12]:
örnek_review

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again',
 'maybe',
 'i',
 'just',
 'want',
 'to',
 'get',
 'a',
 'certain',
 'insight',
 'into',
 'this',
 'guy',
 'who',
 'i',
 'thought',
 'was',
 'really',
 'cool',
 'in',
 'the',
 'eighties',
 'just',
 'to',
 'maybe',
 'make',
 'up',
 'my',
 'mind',
 'whether',
 'he',
 'is',
 'guilty',
 'or',
 'innocent',
 'moonwalker',
 'is',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'which',
 'i',
 'remember',
 'going',
 'to',
 'see',
 'at',
 'the',
 'cinema',
 'when',
 'it',
 'was',
 'originally',
 'released',
 'some',
 'of',
 'it',
 'has',
 'subtle',
 'messages',
 'about',
 'mj',
 's',
 'feeling',
 'towards',
 'the',
 'press',
 'and',
 'also',
 'the',
 'obvious',
 'message',
 'of',
 'drugs',

In [13]:
#stopwords çıkarılmış hali için
swords = set(stopwords.words("english"))
örnek_review = [w for w in örnek_review if w not in swords]

In [14]:
#Temizleme işlemini açıkladıktan sonra şimdi tüm dataframe'imiz içinde bulunan reviewleri döngü içinde topluca temizliyoruz

In [15]:
def process(review):
    # HTML etiketleri olmadan review
    review = BeautifulSoup(review).get_text()
    
    # noktalama işaretleri ve sayılar olmadan review
    review = re.sub("[^a-zA-Z]",' ',review)
    
    # küçük harfe dönüştürme ve engellenecek sözcükleri ortadan kaldırmak için
    review = review.lower()
    review = review.split()
    
    # stopwords olmadan review 
    swords = set(stopwords.words("english"))                      # conversion into set for fast searching
    review = [w for w in review if w not in swords]   
    
    # splitted paragraph'ları space ile birleştiriyoruz
    return(" ".join(review))

In [16]:
# training datamızı yukardaki fonksiyon yardımıyla temizliyoruz: 
# her 1000 review sonrası bir satır yazdırarak review işleminin durumunu görüyoruz

train_x_tum = []
for r in range(len(df["review"])):        
    if (r+1)%1000 == 0:        
        print("İşlenen inceleme sayısı =", r+1)
    train_x_tum.append(process(df["review"][r]))

  review = BeautifulSoup(review).get_text()


İşlenen inceleme sayısı = 1000
İşlenen inceleme sayısı = 2000
İşlenen inceleme sayısı = 3000
İşlenen inceleme sayısı = 4000
İşlenen inceleme sayısı = 5000
İşlenen inceleme sayısı = 6000
İşlenen inceleme sayısı = 7000
İşlenen inceleme sayısı = 8000
İşlenen inceleme sayısı = 9000
İşlenen inceleme sayısı = 10000
İşlenen inceleme sayısı = 11000
İşlenen inceleme sayısı = 12000
İşlenen inceleme sayısı = 13000
İşlenen inceleme sayısı = 14000
İşlenen inceleme sayısı = 15000
İşlenen inceleme sayısı = 16000
İşlenen inceleme sayısı = 17000
İşlenen inceleme sayısı = 18000
İşlenen inceleme sayısı = 19000
İşlenen inceleme sayısı = 20000
İşlenen inceleme sayısı = 21000
İşlenen inceleme sayısı = 22000
İşlenen inceleme sayısı = 23000
İşlenen inceleme sayısı = 24000
İşlenen inceleme sayısı = 25000


# TRAIN, TEST SPLIT

In [17]:
x = train_x_tum
y = np.array(df["sentiment"])

# train test split
train_x, test_x, y_train, y_test = train_test_split(x,y, test_size = 0.1)


## Bag of Words oluşturuyoruz !
Verilerimizi temizledik ancak yapay zekanın çalışması için bu metin tabanlı verileri sayılara ve bag of words denilen matrise çevirmek gerekiyor. İşte bu amaçla sklearn içinde bulunan CountVectorizer aracını kullanıyoruz:

In [18]:
# sklearn içinde bulunan countvectorizer fonksiyonunu kullanarak max 5000 kelimelik bag of words oluşturuyoruz...
vectorizer = CountVectorizer( max_features = 5000 )

# train verilerimizi feature vektöre matrisine çeviriyoruz
train_x = vectorizer.fit_transform(train_x)


In [19]:
train_x

<22500x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1778590 stored elements in Compressed Sparse Row format>

In [20]:
# Bunu array'e dönüştürüyoruz çünkü fit işlemi için array istiyor..
train_x = train_x.toarray()
train_y = y_train

In [21]:
train_x.shape, train_y.shape

((22500, 5000), (22500,))

In [22]:
train_y

array([0, 0, 0, ..., 1, 1, 0])

##### Random Forest Modeli oluşturuyoruz ve fit ediyoruz 

In [23]:
model = RandomForestClassifier(n_estimators = 100, random_state=42)
model.fit(train_x, train_y)


In [24]:
# Test verilerimizi feature vektöre matrisine çeviriyoruz
# Yani aynı işlemleri(bag of wordse dönüştürme) tekrarlıyoruz bu sefer test datamız için:
test_xx = vectorizer.transform(test_x)

In [25]:
test_xx

<2500x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 196655 stored elements in Compressed Sparse Row format>

In [26]:
test_xx = test_xx.toarray()

In [27]:
test_xx.shape

(2500, 5000)

In [28]:
test_predict = model.predict(test_xx)
dogruluk = roc_auc_score(y_test, test_predict)

In [30]:
print("Doğruluk oranı : % ", dogruluk * 100)

Doğruluk oranı : %  84.16060580460343
