# Metin Sınıflandırma (Text Classification)

In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download("stopwords") # Çok kullanılan ve anlam talşımayan sözcükleri metin içerisinden çıkartmak için gerekli 
nltk.download("wordnet") # Lemma(kelime kökleri) bulmak için gerekli olan veriseti
nltk.download("omw-1.4") # Wordnet'e ait farklı dillerin kelime anlamlarını içeren bir veri seti

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emirh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emirh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\emirh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

Spam veri seti -> spam , ham -> Binary classification - Decision Tree 

In [3]:
data = pd.read_csv("metin_siniflandirma_spam_veri_seti.csv",encoding="latin-1")
data.head

<bound method NDFrame.head of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  

**Gereksiz sütunların çıkarılması**

In [4]:
data.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis = 1 , inplace = True)

In [5]:
data.head

<bound method NDFrame.head of         v1                                                 v2
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

**Sütun isimlerinin düzletilmesi**

In [6]:
data.columns = ["label" , "text"]
data.head

<bound method NDFrame.head of      label                                               text
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

**EDA : Keşifsel veri analizi : missing value**  (Kayıp veri Var mı ?)

In [7]:
data.isna().sum()

label    0
text     0
dtype: int64

**Text Cleaning and Preproceessing**

Özel karakterler ,lowercase , tokenization , stopwords lemmazite

In [8]:
text = list(data.text)
lemmatizer = WordNetLemmatizer()
corpus = []

for i in range(len(text)):
    # Metin içerisindeki harf olmayan tüm karakterlerden kurtuluyoruz
    r = re.sub("[^a-zA-Z]"," ",text[i])

    # Büyük harfi küçük harf yap.
    r = r.lower()

    # Kelimeleri ayır
    r = r.split()

    # Anlamsız kelimelerden kurtuluyoruz
    r = [word for word in r if word not in stopwords.words("english")]

    r = [lemmatizer.lemmatize(word) for word in r]

    r = " ".join(r)

    corpus.append(r)

In [9]:
data["text2"] = corpus

In [10]:
data["text2"].head

<bound method NDFrame.head of 0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry wkly comp win fa cup final tkts st ...
3                     u dun say early hor u c already say
4                     nah think go usf life around though
                              ...                        
5567    nd time tried contact u u pound prize claim ea...
5568                            b going esplanade fr home
5569                                 pity mood suggestion
5570    guy bitching acted like interested buying some...
5571                                       rofl true name
Name: text2, Length: 5572, dtype: object>

**Model Eğitimi**

In [11]:
x = data["text2"]
y = data["label"]

In [12]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.2 ,random_state = 42)

**Feature Extraction** (Bag of Words)

In [13]:
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train)
x_test_cv = cv.transform(x_test)

**Classifier Training** : model training and evaluation

In [14]:
dt = DecisionTreeClassifier()
dt.fit(x_train_cv,y_train) # Eğitim

**Prediction**

In [15]:
prediction = dt.predict(x_test_cv)

In [16]:
c_matrix = confusion_matrix(y_test,prediction)

In [17]:
c_matrix

array([[957,   8],
       [ 24, 126]], dtype=int64)

In [18]:
accuracy = 100 * (sum(sum(c_matrix)) - c_matrix[1,0] - c_matrix[0,1])/sum(sum(c_matrix))

In [19]:
accuracy

97.13004484304933