In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
import time
import datetime
import random
import seaborn as sns
import transformers
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder


In [6]:
# Load the annotated tweets; the file is resolved relative to the working directory.
data = pd.read_csv(
    'labels.csv',
)

In [7]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,id,text,date,user,rt,fav,followers,verified,label
0,221221,Yıl 2020 Bist100 tarafında bilanço dönemi ve ...,2022-07-11 13:38:08+00:00,borsaparatic,17,180,220342,False,Pozitif#Yargı
1,221222,Bloomberg Businessweek’in 3 Temmuz tarihli öze...,2022-07-10 10:16:06+00:00,ibrahim___ethem,20,343,136873,False,Gerçek#Pozitif
2,221223,"en UCUZ #BIST 100 #hisse'leri\n#SAHOL 18,68&gt...",2022-07-09 20:53:12+00:00,ASIM_YALCINKAYA,107,934,138759,False,Gerçek#Pozitif
3,221224,Kısa notlarımı paylaşmaya başlıyorum.\nBayramd...,2022-07-09 19:44:59+00:00,ibrahim___ethem,134,1529,136873,False,Pozitif#Yargı
4,221225,📍İstanbul Havalimanı dün tarihinin en yüksek u...,2022-07-09 07:01:24+00:00,ibrahim___ethem,23,379,136873,False,Gerçek#Nötr


In [8]:
# The raw 'label' column packs several tags into one '#'-separated string
# (e.g. 'Gerçek#Pozitif'). Expand it into one 0/1 indicator column per tag and
# put those indicators next to the remaining tweet columns.
tweet_columns = data.drop('label', axis=1)
tag_indicators = data['label'].str.get_dummies('#')
labels = pd.concat([tweet_columns, tag_indicators], axis=1)
labels.head()

Unnamed: 0,id,text,date,user,rt,fav,followers,verified,Gerçek,Negatif,Nötr,Pozitif,Yargı
0,221221,Yıl 2020 Bist100 tarafında bilanço dönemi ve ...,2022-07-11 13:38:08+00:00,borsaparatic,17,180,220342,False,0,0,0,1,1
1,221222,Bloomberg Businessweek’in 3 Temmuz tarihli öze...,2022-07-10 10:16:06+00:00,ibrahim___ethem,20,343,136873,False,1,0,0,1,0
2,221223,"en UCUZ #BIST 100 #hisse'leri\n#SAHOL 18,68&gt...",2022-07-09 20:53:12+00:00,ASIM_YALCINKAYA,107,934,138759,False,1,0,0,1,0
3,221224,Kısa notlarımı paylaşmaya başlıyorum.\nBayramd...,2022-07-09 19:44:59+00:00,ibrahim___ethem,134,1529,136873,False,0,0,0,1,1
4,221225,📍İstanbul Havalimanı dün tarihinin en yüksek u...,2022-07-09 07:01:24+00:00,ibrahim___ethem,23,379,136873,False,1,0,1,0,0


In [9]:
# Number of rows tagged 'Gerçek' and number of rows tagged 'Yargı', in that order.
tuple(len(labels[labels[tag] == 1]) for tag in ('Gerçek', 'Yargı'))

(746, 639)

In [10]:
# Keep only the 'Gerçek' indicator as the target; the other tag columns are dropped.
# NOTE(review): the name `bin` shadows the Python builtin of the same name. It is
# kept here because the following cells refer to it.
unused_tag_columns = ['Pozitif', 'Negatif', 'Nötr', 'Yargı']
bin = labels.drop(columns=unused_tag_columns)

In [11]:
# Preview the first five rows of the reduced frame.
bin.head(n=5)

Unnamed: 0,id,text,date,user,rt,fav,followers,verified,Gerçek
0,221221,Yıl 2020 Bist100 tarafında bilanço dönemi ve ...,2022-07-11 13:38:08+00:00,borsaparatic,17,180,220342,False,0
1,221222,Bloomberg Businessweek’in 3 Temmuz tarihli öze...,2022-07-10 10:16:06+00:00,ibrahim___ethem,20,343,136873,False,1
2,221223,"en UCUZ #BIST 100 #hisse'leri\n#SAHOL 18,68&gt...",2022-07-09 20:53:12+00:00,ASIM_YALCINKAYA,107,934,138759,False,1
3,221224,Kısa notlarımı paylaşmaya başlıyorum.\nBayramd...,2022-07-09 19:44:59+00:00,ibrahim___ethem,134,1529,136873,False,0
4,221225,📍İstanbul Havalimanı dün tarihinin en yüksek u...,2022-07-09 07:01:24+00:00,ibrahim___ethem,23,379,136873,False,1


In [12]:
# Rename the 'Gerçek' indicator to the generic target name 'label'.
# 1 for fact (row carries the 'Gerçek' tag), zero for opinion (row does not).
bin = bin.rename(columns={'Gerçek': 'label'})
bin.head()

Unnamed: 0,id,text,date,user,rt,fav,followers,verified,label
0,221221,Yıl 2020 Bist100 tarafında bilanço dönemi ve ...,2022-07-11 13:38:08+00:00,borsaparatic,17,180,220342,False,0
1,221222,Bloomberg Businessweek’in 3 Temmuz tarihli öze...,2022-07-10 10:16:06+00:00,ibrahim___ethem,20,343,136873,False,1
2,221223,"en UCUZ #BIST 100 #hisse'leri\n#SAHOL 18,68&gt...",2022-07-09 20:53:12+00:00,ASIM_YALCINKAYA,107,934,138759,False,1
3,221224,Kısa notlarımı paylaşmaya başlıyorum.\nBayramd...,2022-07-09 19:44:59+00:00,ibrahim___ethem,134,1529,136873,False,0
4,221225,📍İstanbul Havalimanı dün tarihinin en yüksek u...,2022-07-09 07:01:24+00:00,ibrahim___ethem,23,379,136873,False,1


In [13]:
# Tokenizer for the pretrained Turkish BERT checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'dbmdz/bert-base-turkish-128k-uncased',
    do_lower_case=True,
)
# Raw tweet texts of the whole (not yet split) frame.
sentences = bin['text'].values
# Maximum sequence length, in tokens, passed to the tokenizer when encoding.
max_len = 250

In [15]:
# Stratified 80/20 split: draw 80% of the rows of each 'label' class for training.
# A fixed seed keeps the split identical across kernel restarts, so results
# computed further down are reproducible.
SPLIT_SEED = 42
training = bin.groupby('label').sample(frac=0.8, random_state=SPLIT_SEED)

# Every row that was not sampled goes to the test set. Selecting the complement
# by index (instead of concat + drop_duplicates) does not depend on row contents,
# so rows that happen to be identical are not silently removed.
test = bin.drop(index=training.index)
assert len(training) + len(test) == len(bin), "train/test split lost or duplicated rows"

print("Training: ", len(training))
print("Test: ", len(test))

training_texts = training.text.values

Training:  1120
Test:  280


In [21]:
# Target values (0/1) for the training rows, as a NumPy array.
training_labels = training['label'].values

In [22]:
# Encode all training texts in a single batched tokenizer call.
#   * add_special_tokens: wrap each sequence in the model's start/end tokens.
#   * padding='max_length': pad every sequence to exactly `max_len` tokens
#     (replaces the deprecated `pad_to_max_length=True`).
#   * truncation=True: cut longer sequences down to `max_len` explicitly instead
#     of relying on the implicit fallback, so all rows have the same length.
#   * return_tensors='pt': return PyTorch tensors, one row per input text.
encoded_batch = tokenizer(
    list(training_texts),
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']
# NOTE(review): this rebinds `labels`, which an earlier cell used for the
# one-hot DataFrame; re-running that earlier part of the notebook after this
# cell requires re-creating the DataFrame first. The name is kept so that
# later cells continue to work.
labels = torch.tensor(training_labels)

print('Original: ', training_texts[0])
print('Token IDs:', input_ids[0])



Original:  Rusya, Avrupa ülkeleri ve Kanada dahil 36 ülkeden havayolu şirketinin Rusya'ya uçuşlarını yasakladı. (IFX)

#thyao satış getirdi bu haber
Token IDs: tensor([     2,   4562,     16,   3038,  87717,   2023,   1946,   9337,   3931,
          5413, 112208,   1009,  14959, 111602,   8117,   4562,     11,   1956,
         11856,  16285, 112189,  48538,     18,     12,   5481,   1070,     13,
             7,  12712,  60783,   6604,   1017,   6014,   1964,   2732,      3,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 