In [1]:
import pandas as pd
import re # regular expression RE, for example: "[^a-zA-Z]
import nltk as nlp
from sklearn.feature_extraction.text import CountVectorizer # for bag of words 
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer # bag of words yaratmak icin kullandigim metot
from sklearn.tree import DecisionTreeClassifier

### NOTE

Yazılan description'u erkek mi kadın mı yazmış, tahminlemesi yalnızca 2 sütuna göre yapılacak.

# 1-Read Data

In [2]:
# The file contains Latin-1 characters, so the encoding is passed explicitly.
raw = pd.read_csv(r"data/gender_classifier.csv", encoding="latin1")
# Only two columns are used in this notebook: the label (gender) and the text (description).
# .copy() gives an independent frame, so later edits never touch `raw`.
data = raw[["gender", "description"]].copy()
data.head()

Unnamed: 0,gender,description
0,male,i sing my own rhythm.
1,male,I'm the author of novels filled with family dr...
2,male,louis whining and squealing and all
3,male,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,female,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


# 2-Preprocessing

### A) Remove NaN and Unnecessary Values

In [3]:
# Number of missing (NaN) values per column; summing down the rows is the default axis.
data.isna().sum()

gender           97
description    3744
dtype: int64

In [4]:
# Discard every row that has a missing value in either column.
# The name is rebound to the filtered frame instead of mutating it with inplace=True.
data = data.dropna(axis=0)

In [5]:
# Summary of both text columns (count / unique / top / freq) after the NaN rows were removed.
data.describe()

Unnamed: 0,gender,description
count,16224,16224
unique,4,15073
top,female,"You can be spiritually empowered, financially ..."
freq,5725,33


In [6]:
# List the distinct labels present in the gender column.
data.gender.unique()

array(['male', 'female', 'brand', 'unknown'], dtype=object)

In [7]:
# Peek at the first rows of the cleaned frame.
data.head()

Unnamed: 0,gender,description
0,male,i sing my own rhythm.
1,male,I'm the author of novels filled with family dr...
2,male,louis whining and squealing and all
3,male,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,female,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


In [8]:
# Filter out the rows whose gender label is one of the values we do not model.
unwanted_labels = ["brand", "unknown"]
data = data[~data["gender"].isin(unwanted_labels)].copy()

In [9]:
# Confirm which labels remain in the gender column after the filter.
data.gender.unique()

array(['male', 'female'], dtype=object)

In [10]:
# Summary of both columns again, now that the extra gender labels were dropped.
data.describe()

Unnamed: 0,gender,description
count,11194,11194
unique,2,10657
top,female,secret little rendezvous
freq,5725,19


### B) Convert from Objects to Numeric Values 

In [11]:
# Encode the label column as integers: male -> 0, female -> 1.
# The comparison yields a boolean Series; casting it to int64 gives the 0/1 codes.
data["gender"] = (data["gender"] == "female").astype("int64")

### C) Cleaning Data <span style="color:yellow">*(Example)*</span>

- [a-zA-Z] = a-z ve A-Z arası tüm karakterleri bulur.
- [^a-zA-Z] = a-z ve A-Z içermeyen tüm karakterleri bulur.

Example Code:

In [12]:
# Example string: the description stored under index label 4.
first_description = data.loc[4, "description"]
first_description

'Ricky Wilson The Best FRONTMAN/Kaiser Chiefs The Best BAND Xxxx Thank you Kaiser Chiefs for an incredible year of gigs and memories to cherish always :) Xxxxxxx'

In [13]:
# Replace every character that is not a-z / A-Z with a space, then lower-case the whole string.
description = re.sub("[^a-zA-Z]", " ", first_description).lower()
description

'ricky wilson the best frontman kaiser chiefs the best band xxxx thank you kaiser chiefs for an incredible year of gigs and memories to cherish always    xxxxxxx'

### D) Split Sentence <span style="color:yellow">*(Example)*</span>

Tokenization: Büyük miktarda metnin simge(token) adı verilen daha küçük parçalara bölünmesi işlemidir.
 
- Bu belirteçler, kalıpları bulmak için çok kullanışlıdır ve kök bulma ve lemmatizasyon için bir temel adım olarak kabul edilir. 
- Bir cümleyi kelimelere bölmek için **word_tokenize()** yöntemi kullanılır.
  
-  Kelime belirtme işleminin çıktısı, makine öğrenimi uygulamalarında metnin daha iyi anlaşılması için DataFrame'e dönüştürülebilir.

In [14]:
# Alternative (less thorough) approach: description = description.split()
nlp.download('punkt')
# Recent NLTK releases (3.8.2+) load the tokenizer from the separate 'punkt_tab'
# resource; without it word_tokenize raises LookupError on a fresh environment.
nlp.download('punkt_tab')
# Split the cleaned sentence into a list of word tokens.
description = nlp.word_tokenize(description)
print(description)

['ricky', 'wilson', 'the', 'best', 'frontman', 'kaiser', 'chiefs', 'the', 'best', 'band', 'xxxx', 'thank', 'you', 'kaiser', 'chiefs', 'for', 'an', 'incredible', 'year', 'of', 'gigs', 'and', 'memories', 'to', 'cherish', 'always', 'xxxxxxx']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pointo2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### E) Remove Stopwords (and the, i, he, she, to, etc. ...) <span style="color:yellow">*(Example)*</span>

In [15]:
# Remove stop words (very common words such as "the", "to", "and").
nlp.download("stopwords")  # download stopwords to corpus folder.
# Build the lookup set once; previously it was rebuilt from the corpus for every token.
english_stopwords = set(stopwords.words("english"))
description = [word for word in description if word not in english_stopwords]  # sample word
print(description)

['ricky', 'wilson', 'best', 'frontman', 'kaiser', 'chiefs', 'best', 'band', 'xxxx', 'thank', 'kaiser', 'chiefs', 'incredible', 'year', 'gigs', 'memories', 'cherish', 'always', 'xxxxxxx']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pointo2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### E) Find Lemmatization (Root of word)  <span style="color:yellow">*(Example)*</span>

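loved => love   
gone => go

Not: `WordNetLemmatizer.lemmatize()` varsayılan olarak kelimeyi isim (`pos="n"`) kabul eder; yukarıdaki fiil örneklerinin bu şekilde dönüşmesi için `pos="v"` verilmelidir.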

In [16]:
nlp.download('wordnet')
lemma = nlp.WordNetLemmatizer()
# Lemmatize each token and join the results back into one space-separated sentence.
description = " ".join(lemma.lemmatize(word) for word in description)
print(description)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pointo2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ricky wilson best frontman kaiser chief best band xxxx thank kaiser chief incredible year gig memory cherish always xxxxxxx


### F) Apply all operations to the entire dataset.

In [17]:
# One lemmatizer for the whole dataset; it was previously re-created on every row.
lemma = nlp.WordNetLemmatizer()


def clean_description(text):
    """Return `text` cleaned for the bag-of-words step.

    Steps, in order: replace every character outside a-z / A-Z with a space,
    lower-case, tokenize with NLTK, lemmatize each token, and join the tokens
    back with single spaces.

    Stop words are intentionally kept here; they are removed later by
    CountVectorizer(stop_words="english").
    """
    text = re.sub("[^a-zA-Z]", " ", text).lower()
    tokens = nlp.word_tokenize(text)
    return " ".join(lemma.lemmatize(token) for token in tokens)


# Apply the cleaning to every description, keeping the row order of `data`.
description_list = [clean_description(text) for text in data.description]
# Positional element 4; it matches index label 4 here because none of the first rows were dropped.
description_list[4]

'ricky wilson the best frontman kaiser chief the best band xxxx thank you kaiser chief for an incredible year of gig and memory to cherish always xxxxxxx'

### G) Bag of Words (add unique words to columns)

- CountVectorizer methodu ile **lowercase** ve **token_pattern** ile de yukarıdaki işlemleri yapabiliriz.

In [18]:
# Vocabulary cap: only the 8000 most frequent words become columns (larger values run slower).
max_features = 8000
# English stop words are filtered out by the vectorizer itself.
count_vectorizer = CountVectorizer(stop_words="english", max_features=max_features)

- **sparce_matrix**, özellik matrisi **x**'e karşılık gelir (eğitim/test ayrımı daha sonra yapılır).

In [19]:
# Learn the vocabulary and count how often each kept word occurs in every description.
count_matrix = count_vectorizer.fit_transform(description_list)
# Expand to a dense NumPy array of word counts (despite the name, this array is not sparse).
sparce_matrix = count_matrix.toarray()
sparce_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# Disabled: uncomment to list the words kept by the vectorizer.
# NOTE: get_feature_names() was removed in scikit-learn 1.2, so the call below uses get_feature_names_out().
#print("most frequently used {} words : {}".format(max_features, count_vectorizer.get_feature_names_out()))

# 3) Split Data

`data.iloc[:,0]` ve `data.iloc[:,0].values` aynı veriyi döndürür; `.values` diyerek sonucu NumPy dizisine çevirmiş oluruz. Yani tek fark:

- ```type(data.iloc[:,0])``` = pandas.core.series.Series
- ```type(data.iloc[:,0].values)``` = numpy.ndarray

In [21]:
# Target vector: the first column (gender, encoded as 0 = male / 1 = female) as a NumPy array.
y = data.iloc[:, 0].values  # male or female classes in numpy
# Feature matrix: the bag-of-words counts built from the cleaned descriptions.
x = sparce_matrix

In [22]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

# 4) Create DT Model and Train Data

In [23]:
# Fix the seed: the tree builder permutes features randomly at each split, so
# without random_state the fitted tree and its accuracy change from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)

DecisionTreeClassifier()

# 5) Test Data

In [24]:
# Disabled: class predictions for the test set (the fitted model in this notebook is `dt`).
# y_pred = dt.predict(x_test)

In [25]:
# Mean accuracy of the fitted tree on the held-out test rows.
accuracy = dt.score(x_test, y_test)
print("accuracy: ", accuracy)

accuracy:  0.6125
