## Metin Temsili Yöntemleri
1. **Bag of Words (BoW)**
2. **TF-IDF (Term Frequency-Inverse Document Frequency)**
3. **N-Gram Modelleri**
4. **Word Embeddings**
5. **Transformers Tabanlı Metin Temsili**

### Bag of Words

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
documents = [
    "kedi bahçede", "kedi evde"
]

vectorizer = CountVectorizer()

In [3]:
X = vectorizer.fit_transform(documents)

In [4]:
# kelime kümesini oluşturma
feature_names = vectorizer.get_feature_names_out() 
feature_names

array(['bahçede', 'evde', 'kedi'], dtype=object)

In [5]:
# vektör temsili
vector_temsili = X.toarray()
vector_temsili

array([[1, 0, 1],
       [0, 1, 1]])

### IMDB veri seti ile Bag of Words

In [6]:
import pandas as pd

# https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
documents = df["review"]
labels = df["sentiment"]

In [8]:
# metin temizleme
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def clean_text(text):

    # büyük küçük harf çevrimi
    text = text.lower()

    # rakamları temizleme
    text = re.sub(r"\d+", "", text)

    # özel karakterlerin kaldırılması
    text = re.sub(r"[^\w\s]","",text)
    
    # kısa kelimeleri kaldır
    words = [word for word in text.split() if len(word) > 2]

    # stopword'leri kaldır
    words = [word for word in words if word not in stop_words]

    # listeyi tekrar string'e çevir
    text = " ".join(words)
    
    return text

In [9]:
cleaned_doc = [clean_text(row) for row in documents]

In [10]:
vectorizer = CountVectorizer()

# metin -> sayısal hale getir
X = vectorizer.fit_transform(cleaned_doc[:75]) 

In [11]:
X # 75 satır, 4041 adet unique kelime var 

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 7434 stored elements and shape (75, 3948)>

In [12]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['abbot', 'abetted', 'abiding', ..., 'zone', 'zooms', 'zwick'],
      shape=(3948,), dtype=object)

In [13]:
# vektör temsili
vektor_temsili = X.toarray()
vektor_temsili

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(75, 3948))

In [14]:
df_bow = pd.DataFrame(vektor_temsili, columns=feature_names)
df_bow

Unnamed: 0,abbot,abetted,abiding,ability,able,aboveaverage,abraham,abrahams,absolute,absolutely,...,yuen,zack,zany,zellweger,zerog,zombie,zombiebr,zone,zooms,zwick
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
72,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
73,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# kelime frekansını göster
from collections import Counter

word_counts = X.sum(axis=0).A1
word_frequeny = dict(zip(feature_names, word_counts))

most_common_5_words = Counter(word_frequeny).most_common(5)
print(f"En çok geçen beş kelime: {most_common_5_words}")

En çok geçen beş kelime: [('movie', np.int64(123)), ('film', np.int64(98)), ('one', np.int64(72)), ('like', np.int64(59)), ('good', np.int64(38))]


## TF-IDF

In [16]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
documents = [
    "Köpek çok tatlı bir hayvandır",
    "Köpek ve kuşlar çok tatlı hayvanlardır.",
    "Inekler süt üretirler."
]

tfidf_vectorizer = TfidfVectorizer()

In [18]:
# metinleri sayısal hale çevir
X = tfidf_vectorizer.fit_transform(documents)

feature_names = tfidf_vectorizer.get_feature_names_out()
print(feature_names)

['bir' 'hayvandır' 'hayvanlardır' 'inekler' 'kuşlar' 'köpek' 'süt' 'tatlı'
 've' 'çok' 'üretirler']


In [19]:
# vektör temsili
vektor_temsili = X.toarray()
#print(f"tf-idf:\n{vektör_temsili}")
df_tfidf = pd.DataFrame(vektor_temsili, columns=feature_names)
df_tfidf.head()

Unnamed: 0,bir,hayvandır,hayvanlardır,inekler,kuşlar,köpek,süt,tatlı,ve,çok,üretirler
0,0.51742,0.51742,0.0,0.0,0.0,0.393511,0.0,0.393511,0.0,0.393511,0.0
1,0.0,0.0,0.459548,0.0,0.459548,0.349498,0.0,0.349498,0.459548,0.349498,0.0
2,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735


In [20]:
tf_idf = df_tfidf.mean(axis=0)
tf_idf.sort_values(ascending=False)

çok             0.247670
tatlı           0.247670
köpek           0.247670
üretirler       0.192450
süt             0.192450
inekler         0.192450
bir             0.172473
hayvandır       0.172473
hayvanlardır    0.153183
kuşlar          0.153183
ve              0.153183
dtype: float64

### Spam veri seti ile TF-IDF

In [21]:
# https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

df = pd.read_csv("spam.csv", encoding='ISO-8859-1')
df["type"] = df["v1"]
df["text"] = df["v2"]
df = df.drop(columns = ["v1", "v2", "Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
print(df.shape)
print()
print(df["type"].value_counts())

(5572, 2)

type
ham     4825
spam     747
Name: count, dtype: int64


In [23]:
# veri temizleme

def clean_text(text):

    # büyük küçük harf çevrimi
    text = text.lower()

    # rakamları temizleme
    text = re.sub(r"\d+", "", text)

    # özel karakterlerin kaldırılması
    text = re.sub(r"[^\w\s]","",text)
    
    # kısa kelimeleri kaldır
    words = [word for word in text.split() if len(word) > 2]

    # stopword'leri kaldır
    words = [word for word in words if word not in stop_words]

    # listeyi tekrar string'e çevir
    text = " ".join(words)
    
    return text

In [24]:
df["clean_text"] = df["text"].apply(clean_text)
df.head()

Unnamed: 0,type,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis great world...
1,ham,Ok lar... Joking wif u oni...,lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...
3,ham,U dun say so early hor... U c already then say...,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [25]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df["clean_text"])

In [26]:
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
print(len(feature_names))
tfidf_score = X.mean(axis=0).A1 # her kelimenin ortalama tf-idf değerleri

['____' 'aah' 'aaniye' ... 'ìïll' 'û_thanks' 'ûïharry']
8228


In [27]:
df_tfidf = pd.DataFrame({"word":feature_names, "tfidf_score":tfidf_score})
df_tfidf.sort_values(by="tfidf_score", ascending=False).head(10)

Unnamed: 0,word,tfidf_score
964,call,0.021126
2714,get,0.014161
3311,ill,0.012887
1326,come,0.011849
4069,ltgt,0.010905
1925,dont,0.010686
2794,good,0.010171
3734,know,0.010085
2819,got,0.009908
3911,like,0.009748
