In [296]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
import re
import string
import nltk
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras import layers
import tensorflow as tf


In [297]:
train_percentage = 0.3
num_of_classes = 2
max_feature_size = 1000

### LOAD DATA

In [298]:
FILENAME = "data/database_kara.csv"          

dataset = pd.read_csv(FILENAME, header = None)

dataset.rename(columns = {0:'text', 1:'label'}, inplace = True)

dataset.dropna(how='any', inplace=True)
dataset.reset_index(drop=True, inplace=True)

dataset['en_label'] = dataset["label"]
for i in range(dataset.shape[0]):
    label = dataset.loc[i,"en_label"]
    label = re.sub("[Yy]", "", label)
    # delete rows with wrong type of label
    if type(label) == float or label == "" or label == " ":
        dataset.drop([i], axis=0, inplace=True)
        continue
    dataset.loc[i,"en_label"] = label
    
# Shape = (sentence_count, 3) : sentence, label, encoded label

In [299]:
dataset

Unnamed: 0,text,label,en_label
0,Birçok toplum ilham ve sezişlerine rüyalarına ...,Y2,2
1,Günümüzde Aristo’nun bilimsel bilgisinden fazl...,Y2,2
2,Bilimin ilerlemesi bilinmeyenler diyarına deva...,Y2,2
3,İçinde bulunduğumuz kara parçası dünyada birço...,Y1,1
4,Kara duvara aşkını kazırken düşündüğü şey atmo...,Y2,2
...,...,...,...
1514,Şimdi birer düş gibi anımsıyordu bütün bunları...,Y2,2
1515,Ne kendi kendine ne de akasyaların kara gölges...,Y2,2
1516,Her zamanki yolundan değil otlaktan geçerek ay...,Y2,2
1517,Upuzun kara saçları esmer teni dans ettikçe pı...,Y2,2


In [300]:
del dataset["label"]

In [301]:
dataset.reset_index(drop=True, inplace=True)

In [302]:
dataset.isnull().values.any()

False

In [303]:
dataset

Unnamed: 0,text,en_label
0,Birçok toplum ilham ve sezişlerine rüyalarına ...,2
1,Günümüzde Aristo’nun bilimsel bilgisinden fazl...,2
2,Bilimin ilerlemesi bilinmeyenler diyarına deva...,2
3,İçinde bulunduğumuz kara parçası dünyada birço...,1
4,Kara duvara aşkını kazırken düşündüğü şey atmo...,2
...,...,...
1513,Şimdi birer düş gibi anımsıyordu bütün bunları...,2
1514,Ne kendi kendine ne de akasyaların kara gölges...,2
1515,Her zamanki yolundan değil otlaktan geçerek ay...,2
1516,Upuzun kara saçları esmer teni dans ettikçe pı...,2


### PREPROCESSING

In [304]:
s = set(stopwords.words('turkish'))
ps = nltk.wordnet.WordNetLemmatizer()

for i in range(dataset.shape[0]):
    review = dataset.loc[i,'text']
    for punc in string.punctuation:
        review.replace(punc, "")
    review = review.lower()
    review = review.split()
    review = [ps.lemmatize(word) for word in review if not word in s]
    #review = [stemmer.stem(word) for word in review if not word in s]
    review = ' '.join(review)
    dataset.loc[i, 'text'] = review

In [305]:
dataset.dropna(how='any', inplace=True)
dataset.reset_index(drop=True, inplace=True)

In [306]:
dataset

Unnamed: 0,text,en_label
0,birçok toplum ilham sezişlerine rüyalarına gai...,2
1,günümüzde aristo’nun bilimsel bilgisinden fazl...,2
2,bilimin ilerlemesi bilinmeyenler diyarına deva...,2
3,i̇çinde bulunduğumuz kara parçası dünyada birç...,1
4,kara duvara aşkını kazırken düşündüğü atmosfer...,2
...,...,...
1513,şimdi birer düş anımsıyordu bütün bunları çarp...,2
1514,kendi kendine akasyaların kara gölgesinde dişl...,2
1515,zamanki yolundan değil otlaktan geçerek ay ışı...,2
1516,upuzun kara saçları esmer teni dans ettikçe pı...,2


###### VECTORIZE TEXT

In [307]:
count_vect = CountVectorizer()
count_vectors = count_vect.fit_transform(dataset["text"])

In [308]:
print(count_vectors[0])

  (0, 1102)	1
  (0, 6789)	1
  (0, 3491)	1
  (0, 6148)	1
  (0, 5789)	1
  (0, 2466)	1
  (0, 6111)	1
  (0, 2392)	1
  (0, 5732)	1
  (0, 2515)	1
  (0, 1002)	1
  (0, 6379)	1
  (0, 3911)	1
  (0, 5745)	1
  (0, 3549)	1
  (0, 3513)	1


In [309]:
y = dataset["en_label"].to_numpy(dtype=int)

In [310]:
y = y - 1

In [311]:
y

array([1, 1, 1, ..., 1, 1, 1])

In [312]:
y.shape

(1518,)

#### Feature Selection

In [313]:
select = SelectKBest(score_func=chi2, k=max_feature_size)
count_vectors = select.fit_transform(count_vectors, y)

In [314]:
count_vectors

<1518x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 4730 stored elements in Compressed Sparse Row format>

##### SPLIT DATA

In [315]:
X_train, X_test, y_train, y_test = train_test_split(count_vectors, y, test_size=(1-train_percentage))

In [316]:
X_train = X_train.toarray()
X_test = X_test.toarray()
y_train = np.asarray(y_train, dtype=int)
y_test = np.asarray(y_test, dtype=int)

##### Reshape Input

In [317]:
nrows, ncols = X_train.shape
X_train = X_train.reshape(nrows, ncols, 1)

In [318]:
nrows, ncols = X_test.shape
X_test = X_test.reshape(nrows, ncols, 1)

In [319]:
nrows = y_train.shape[0]
y_train = y_train.reshape(nrows, 1)

In [320]:
nrows = y_test.shape[0]
y_test = y_test.reshape(nrows, 1)

### MODEL TRAINING

In [321]:
model = Sequential()
model.add(layers.Embedding(max_feature_size, 32, input_length=max_feature_size))
model.add(layers.Conv1D(32, 3, input_shape=X_train.shape[1:]))
model.add(layers.MaxPooling1D())
#model.add(layers.Conv1D(32, 3, input_shape=X_train.shape[1:]))
#model.add(layers.MaxPooling1D())
model.add(layers.Flatten())
model.add(layers.Dense(128, activation="relu"))
model.add(layers.Dropout(rate=0.5))
if num_of_classes > 2:
    model.add(layers.Dense(6, activation="softmax"))
    model.compile(loss="sparse_categorical_crossentropy",optimizer=tf.keras.optimizers.Adam(learning_rate=10e-4),metrics=["accuracy"])
else:
    model.add(layers.Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy" , optimizer="adam", metrics=["accuracy"])

In [322]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 1000, 32)          32000     
                                                                 
 conv1d_12 (Conv1D)          (None, 998, 32)           3104      
                                                                 
 max_pooling1d_12 (MaxPoolin  (None, 499, 32)          0         
 g1D)                                                            
                                                                 
 flatten_11 (Flatten)        (None, 15968)             0         
                                                                 
 dense_22 (Dense)            (None, 128)               2044032   
                                                                 
 dropout_11 (Dropout)        (None, 128)               0         
                                                     

In [323]:
model.fit(X_train, y_train, epochs=15, validation_data=[X_test, y_test], batch_size=128)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f725a0d3310>

In [324]:
y_prob = model.predict(X_test, batch_size=128)



In [325]:
if num_of_classes > 2:
    y_predicted = y_prob.argmax(axis=-1)
else:
    y_predicted = [1 if x >0.4 else 0 for x in y_prob]

In [326]:
y_predicted

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [327]:
accuracy_score(y_test,y_predicted)

0.7779868297271872

In [328]:
f1_score(y_test, y_predicted, average="weighted")

0.6817888961793561