In [1]:
import json
import csv
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import itertools
import string
from nltk.corpus import stopwords
from sklearn.utils import resample
import gensim
from sklearn import preprocessing
from sklearn.model_selection import KFold
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import copy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

from sklearn.metrics import plot_confusion_matrix, confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from trtokenizer.tr_tokenizer import SentenceTokenizer, WordTokenizer
word_tokenizer_object = WordTokenizer()

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Translation table that deletes every character in string.punctuation
# (ASCII only): !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use by _turkish_stopwords().
_TURKISH_STOPWORDS = None


def _turkish_stopwords():
    """Return the NLTK Turkish stop-word set, reading the corpus only once."""
    global _TURKISH_STOPWORDS
    if _TURKISH_STOPWORDS is None:
        _TURKISH_STOPWORDS = frozenset(stopwords.words('turkish'))
    return _TURKISH_STOPWORDS


def clean_text(text):
    """Normalise one piece of text for the bag-of-words models.

    Steps: coerce to ``str``, lower-case, delete ASCII punctuation, tokenise
    with the module-level ``word_tokenizer_object`` and drop tokens found in
    the NLTK Turkish stop-word list.

    Parameters
    ----------
    text : object
        Value to clean. It is passed through ``str`` first, so a missing
        value such as NaN becomes the literal text ``'nan'``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # NOTE(review): str.lower() is not Turkish-aware - 'I' becomes 'i' (not
    # 'ı') and 'İ' becomes 'i' followed by U+0307. Confirm this is acceptable
    # for the corpus before relying on exact token matches.
    text = str(text).lower()

    # Only ASCII punctuation is removed; typographic marks such as ’ stay in.
    text = text.translate(_PUNCTUATION_TABLE)

    words = word_tokenizer_object.tokenize(text)

    # The stop-word set is cached: rebuilding it for every title re-read the
    # NLTK corpus once per row.
    stop_words = _turkish_stopwords()
    return ' '.join(word for word in words if word not in stop_words)

In [3]:
# Every input is a tab-separated file read from the working directory.
test, df_1000, df_hel, df_det = (
    pd.read_csv(path, sep='\t')
    for path in (
        'test_10_topic.csv',
        'df_1000.csv',
        'df_1000_helsinki.csv',
        'df_1000_detailed.csv',
    )
)

# df_1000 stacked on itself. The index is not reset, so every row label
# occurs twice in df_2000.
df_2000 = pd.concat([df_1000, df_1000])

In [None]:
# Spot-check one title by row label.
df_1000.loc[908, 'title']

In [None]:
# Spot-check one title by row label.
df_hel.loc[1908, 'title']

In [None]:
# Spot-check one title by row label.
df_det.loc[1908, 'title']

In [None]:
# Distinct topic labels of the base training frame, in sorted order.
topic_values = df_1000['topic'].to_numpy()
class_names = np.unique(topic_values)
class_names

In [None]:
# Row labels of df_hel whose title is missing.
df_hel.loc[df_hel['title'].isna()].index.tolist()

In [9]:
# Row labels to discard - presumably the ones reported by the null-title
# check on df_hel; confirm before changing the inputs.
base_labels = [264, 314, 341, 359]
# The same labels shifted by 1000, removed from df_hel and df_det as well.
shifted_labels = [label + 1000 for label in base_labels]

df_1000 = df_1000.drop(index=base_labels)
# df_2000 carries every label twice (it is df_1000 concatenated with itself
# without resetting the index), so this removes both copies of each row.
df_2000 = df_2000.drop(index=base_labels)
df_hel = df_hel.drop(index=base_labels + shifted_labels)
df_det = df_det.drop(index=base_labels + shifted_labels)

In [None]:
# Number of rows left in df_det.
df_det.shape[0]

In [None]:
plt.figure(figsize=(13, 4))
# Pass the column as `x=` explicitly. A bare positional argument is only
# interpreted as `x` by seaborn 0.11 (with a warning that this notebook
# silences); from 0.12 on the first positional parameter is `data`.
sns.countplot(x=df_1000['topic'])

In [12]:
# Run every test title through the same cleaning used for the training data.
test['title'] = list(map(clean_text, test['title']))
test.head()

Unnamed: 0.1,Unnamed: 0,abstract,title,topic
0,276806,sakarya’nın serdivan ilçesinde serdivanspor ka...,sahada başlayan kavga tribüne sıçradı 8 kırmız...,Spor
1,276807,avustralya başbakanı scott morrison merkezde k...,nauru gözaltı merkezinde sığınmacı çocuk kalma...,Dünya
2,276808,seat soğuk kış günlerinde otomobil camlarındak...,seat’tan kış ayları 4 öneri,Otomobil
3,276811,i lerleyen yaşla birlikte cildin elastikiyetin...,kış güneşi cildinizi hızla yaşlandırıyor,Sağlık
4,276813,sudan dışişleri bakanı ahmed oac hükümeti sila...,oacde nihai barış anlaşması çarşamba imzalanacak,Dünya


### TF-IDF

In [None]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Take the split point and the index range from the frame itself. The
# literals 1000 / 2000 used previously no longer match df_2000 once rows
# have been dropped from it, which mis-paired the held-out rows and produced
# train positions past the end of the frame.
half = len(df_2000) // 2

for n_fold, (_, val_index) in enumerate(kf.split(df_2000[:half])):
    # Hold out each selected position together with the position `half`
    # rows later (the same scheme execute_model uses).
    val_index = np.concatenate((val_index, val_index + half))
    # Everything that is not held out, in ascending order.
    train_index = np.setdiff1d(np.arange(len(df_2000)), val_index).tolist()
    print(val_index, train_index)

In [13]:
def execute_model(model, train_df=None, test_df=None):
    """Fit ``model`` on 5 training folds and score each fit on the test frame.

    The estimator is wrapped in a TF-IDF -> StandardScaler -> model pipeline.
    A shuffled 5-fold split (seed 42) is drawn over the first half of the
    training rows; for every fold the held-out positions *and* the positions
    ``len(train_df) // 2`` rows later are removed from the training data.
    The held-out rows themselves are not scored: every fit is evaluated with
    macro-averaged F1 on ``test_df``.

    Parameters
    ----------
    model : estimator
        Classifier placed at the end of the pipeline.
    train_df : pandas.DataFrame, optional
        Frame with ``'title'`` and ``'topic'`` columns. Defaults to the
        module-level ``df`` so existing ``execute_model(model)`` calls keep
        working.
        NOTE(review): the pairing assumes row ``i + half`` is the counterpart
        of row ``i`` - confirm for each dataset.
    test_df : pandas.DataFrame, optional
        Frame with ``'title'`` and ``'topic'`` columns used for scoring.
        Defaults to the module-level ``test``.

    Returns
    -------
    tuple
        ``(pipe, mean_f1, std_f1)`` where ``pipe`` is the pipeline as left by
        the last fold's fit.
    """
    # Fall back to the notebook globals the original implementation relied on.
    train_df = df if train_df is None else train_df
    test_df = test if test_df is None else test_df

    # with_mean=False: the TF-IDF output is sparse and cannot be centred.
    pipe = make_pipeline(TfidfVectorizer(), StandardScaler(with_mean=False), model)
    kf = KFold(n_splits=5, random_state=42, shuffle=True)

    n_rows = len(train_df)
    half = n_rows // 2
    all_positions = np.arange(n_rows)

    # The evaluation data is the same for every fold.
    X_valid = test_df['title']
    y_valid = test_df['topic']

    scores_f1 = []
    for _, held_out in kf.split(range(half)):
        # Exclude each held-out row together with its partner in the second half.
        excluded = np.concatenate((held_out, held_out + half))
        # setdiff1d returns the remaining positions in ascending order, so the
        # training row order does not depend on set iteration order.
        train_index = np.setdiff1d(all_positions, excluded)

        X_train = train_df.iloc[train_index]['title']
        y_train = train_df.iloc[train_index]['topic']

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_valid)

        scores_f1.append(f1_score(y_valid, y_pred, average='macro'))

    f1_mean = np.mean(scores_f1)
    f1_std = np.std(scores_f1)
    print(f1_mean, '\t', f1_std)

    return pipe, f1_mean, f1_std

In [14]:
# Training frames in the order used by the result tables:
# 'ds_1000', 'ds_1000_copy', 'ds_1000_back', 'ds_1000_det'.
datasets = [
    df_1000,
    df_2000,
    df_hel,
    df_det,
]

In [None]:
# Mean and standard deviation of macro-F1 for LinearSVC, one entry per frame
# in `datasets`.
lsvc_f1s, lsvc_stds = [], []

for dataset in datasets:

    # execute_model reads the module-level `df`, so the cleaned deep copy has
    # to be bound to exactly that name.
    df = dataset.copy(deep=True)
    df['title'] = list(map(clean_text, df['title']))

    lsvc_pipe, lsvc_f1_mean, lsvc_std = execute_model(LinearSVC())
    lsvc_f1s.append(lsvc_f1_mean)
    lsvc_stds.append(lsvc_std)

# Keep three decimals, as strings, for the summary tables.
lsvc_f1s = [format(num, '.3f') for num in lsvc_f1s]
lsvc_stds = [format(num, '.3f') for num in lsvc_stds]

In [None]:
# Mean and standard deviation of macro-F1 for the random forest, one entry
# per frame in `datasets`.
rf_f1s = []
rf_stds = []

for dataset in datasets:

    # execute_model reads the module-level `df`, so the cleaned deep copy has
    # to be bound to exactly that name.
    df = copy.deepcopy(dataset)
    df.title = [clean_text(text) for text in df.title]

    # Seed the forest (same seed as the KFold split) so the reported scores
    # can be reproduced; an unseeded forest gives different numbers each run.
    rf_pipe, rf_f1_mean, rf_std = execute_model(RandomForestClassifier(random_state=42))
    rf_f1s.append(rf_f1_mean)
    rf_stds.append(rf_std)

# Keep three decimals, as strings, for the summary tables.
rf_f1s =  [f"{num:.3f}" for num in rf_f1s]
rf_stds =  [f"{num:.3f}" for num in rf_stds]

In [None]:
# Mean and standard deviation of macro-F1 for logistic regression, one entry
# per frame in `datasets`.
lr_f1s, lr_stds = [], []

for dataset in datasets:

    # execute_model reads the module-level `df`, so the cleaned deep copy has
    # to be bound to exactly that name.
    df = dataset.copy(deep=True)
    df['title'] = list(map(clean_text, df['title']))

    # NOTE(review): warnings are silenced notebook-wide, so a convergence
    # warning from the default solver settings would not be visible here -
    # confirm the fits converge.
    lr_pipe, lr_f1_mean, lr_std = execute_model(LogisticRegression())
    lr_f1s.append(lr_f1_mean)
    lr_stds.append(lr_std)

# Keep three decimals, as strings, for the summary tables.
lr_f1s = [format(num, '.3f') for num in lr_f1s]
lr_stds = [format(num, '.3f') for num in lr_stds]

In [None]:
print("\nF1-Score Table:")

# One row per training frame, one column per classifier; the values are the
# pre-formatted mean macro-F1 strings.
f1_df = pd.DataFrame(
    {
        'Linear_SVC': lsvc_f1s,
        'Random_forest': rf_f1s,
        'Logistic_regression': lr_f1s,
    },
    index=pd.Index(
        ['ds_1000', 'ds_1000_copy', 'ds_1000_back', 'ds_1000_det'],
        name='Dataset',
    ),
)
f1_df

In [None]:
print("\nStandard Deviation Table:")

# One row per training frame, one column per classifier; the values are the
# pre-formatted macro-F1 standard deviation strings.
std_df = pd.DataFrame(
    {
        'Linear_SVC': lsvc_stds,
        'Random_forest': rf_stds,
        'Logistic_regression': lr_stds,
    },
    index=pd.Index(
        ['ds_1000', 'ds_1000_copy', 'ds_1000_back', 'ds_1000_det'],
        name='Dataset',
    ),
)
std_df