In [7]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [9]:
df = pd.read_csv('tweets_cleaned.csv')
df.drop(columns=['timestamp','username','handle','tweet_date','content','content_ment_link','content_wo_punct','content_wo_removed_english','content_wo_normalize','content_wo_tokenize','content_wo_stop','content_wo_lemmatized'], inplace=True)
df.head()

Unnamed: 0,content_no_rare_words
0,"['hep', 'allah', 'razı', 'var', 'deprem']"
1,"['allah', 'o', 'dayan', 'güc', 'ver', 'deprem']"
2,"['adıyaman', 'ali', 'taşı', 'mahalle', 'sokak'..."
3,['deprem']
4,"['arkadaş', 'ulaş', 'kahramanmaraş', 'elbistan..."


# Example

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Metni tokenize edin ve BERT input formatına dönüştürün
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    
    # Sonuçlardan [CLS] token'ının vektörünü alıyoruz
    embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()
    
    return embeddings

# Örnek metin
text = "deprem yardim acil"
embedding = get_bert_embeddings(text)

print(embedding)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


[[-3.13332140e-01  1.83408588e-01 -8.18063617e-02 -3.12710404e-01
   2.74097696e-02 -2.15079620e-01  1.06709488e-01  1.82477400e-01
   3.61042731e-02 -2.72820204e-01  3.74240950e-02  4.18287292e-02
  -3.29569578e-02  4.23094392e-01 -2.27677878e-02  3.18799287e-01
  -2.48313293e-01  3.39600921e-01  2.94587940e-01  7.57001564e-02
  -2.44741559e-01 -4.71870661e-01 -1.42636284e-01 -3.17459442e-02
  -1.26161188e-01 -8.14832598e-02  1.42528825e-02 -2.44359933e-02
  -3.12216207e-02  7.79118612e-02 -8.44632089e-02  1.47396162e-01
  -2.33737916e-01 -8.64308253e-02  3.20469350e-01  1.74319908e-01
   1.37383655e-01 -4.86895442e-05  1.55550569e-01  1.05953559e-01
   9.92604494e-02  2.03916281e-01  9.82469916e-02  1.71719864e-02
   1.30365804e-01 -1.32164001e-01 -2.24310565e+00 -1.85067266e-01
  -8.54207695e-01 -2.59964824e-01  2.17268527e-01 -1.43349290e-01
  -1.73467562e-01  2.80877709e-01 -2.27142960e-01  3.53379339e-01
  -4.35609430e-01  6.84625626e-01  3.34568590e-01  4.20008957e-01
  -7.71767

# Embedding

In [10]:
def get_bert_embeddings_from_tokens(tokens):
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors='pt', padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)
    
    embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()

    return embeddings

In [15]:
labels = {
    "acil": "acil durum, hızlı yardım gerekiyor, bina hasarı, kurtarma, tehlike, yaralı, ağır yaralı, hasar, enkaz altında, yardım bekliyor, yardım çağrısı",
    "çok acil": "çok acil, hemen müdahale, can kaybı riski, yaşam tehlikesi, felaket, kurtarma, ağır yaralı",
    "bilgilendirme": "bilgilendirme, duyuru, açıklama, uyarı, haber, afet bilgilendirmesi, deprem hakkında bilgi",
    "destek": "destek, yardım talepleri, bağış, gıda, barınma, su, tıbbi yardım, psikolojik destek, yardım kampanyası"
}
label_embeddings = {label: get_bert_embeddings_from_tokens(text.split()) for label, text in labels.items()}

In [16]:
for index, row in df.iterrows():
    text_tokens = row["content_no_rare_words"]  # Tokenize edilmiş metin
    text_embedding = get_bert_embeddings_from_tokens(text_tokens)
    
    # Her etikete olan benzerliği hesapla
    similarities = {label: cosine_similarity(text_embedding, label_embedding)[0][0] for label, label_embedding in label_embeddings.items()}
    
    # En yakın etiketi bul
    predicted_label = max(similarities, key=similarities.get)
    
    # Predicted label'ı DataFrame'e ekle
    df.loc[index, "predicted_label"] = predicted_label

In [19]:
print(df[['content_no_rare_words', 'predicted_label']].head())

                               content_no_rare_words predicted_label
0          ['hep', 'allah', 'razı', 'var', 'deprem']   bilgilendirme
1    ['allah', 'o', 'dayan', 'güc', 'ver', 'deprem']   bilgilendirme
2  ['adıyaman', 'ali', 'taşı', 'mahalle', 'sokak'...            acil
3                                         ['deprem']   bilgilendirme
4  ['arkadaş', 'ulaş', 'kahramanmaraş', 'elbistan...          destek


In [20]:
df.to_csv('tweets_with_labels.csv', index=False)