### Install and Import

In [1]:
#!pip install nltk

In [2]:
import nltk

In [3]:
# Toy text used to demonstrate each preprocessing step below.
sample_text = (
    "Oh man, this is pretty cool. "
    "We will do more such things. "
    "Don't enjoy. 2"
)

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize
# sent_tokenize splits text into sentences. word_tokenize is the one generally used: it splits the whole text word by word.
# Note: if the document is split sentence by sentence, each sentence is called a token; if it is split word by word, each word is called a token.

### Tokenization

In [5]:
#nltk.download('punkt')

In [6]:
# Lower-case everything first so that identical words compare equal regardless of case,
# then split the text into sentence-level tokens.
lowered_text = sample_text.lower()
sentence_token = sent_tokenize(lowered_text)
sentence_token

['oh man, this is pretty cool.',
 'we will do more such things.',
 "don't enjoy.",
 '2']

In [7]:
# Split the lower-cased text into word-level tokens.
lowered_text = sample_text.lower()
word_token = word_tokenize(lowered_text)
word_token

['oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 'do',
 "n't",
 'enjoy',
 '.',
 '2']

### Removing Punctuation and Numbers

In [8]:
# Remove punctuation and numbers by keeping only purely alphabetic tokens.
# str.isalpha() also rejects tokens that contain an apostrophe, such as "n't".
# If numbers matter for the task, use str.isalnum() instead, which keeps letters and digits.
tokens_without_punc = list(filter(str.isalpha, word_token))
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 'do',
 'enjoy']

### Removing Stopwords

In [9]:
# nltk.download('stopwords')
# Stop words are the most frequently used words of a language (question words, conjunctions, pronouns, etc.). Because they occur so often in sentences they can mislead a model when it weighs words, so they are usually dropped.

In [10]:
from nltk.corpus import stopwords

In [11]:
stopwords.words("english")
# The data is in English, so we request the English stop-word list.

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
# Keep the English stop-word list in a variable for the filtering steps below.
stop_words = stopwords.words("english")

In [13]:
# Tokens before stop-word removal, shown again for comparison with the next cell.
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 'do',
 'enjoy']

In [14]:
# Drop every token that appears in the stop-word list.
token_without_sw = [
    token
    for token in tokens_without_punc
    if not (token in stop_words)
]
token_without_sw

['oh', 'man', 'pretty', 'cool', 'things', 'enjoy']

In [15]:
# List the stop words that contain "n't". If negation matters for the task, the negative
# stop words can be identified this way and handled separately. In the example above the
# negation in "Don't" was lost during cleaning, so the negative meaning disappeared.
# Note: with BERT and similar deep-learning models there is no need to remove stop words,
# because those models take care of the necessary processing themselves.
[word for word in stop_words if "n't" in word]

["don't",
 "aren't",
 "couldn't",
 "didn't",
 "doesn't",
 "hadn't",
 "hasn't",
 "haven't",
 "isn't",
 "mightn't",
 "mustn't",
 "needn't",
 "shan't",
 "shouldn't",
 "wasn't",
 "weren't",
 "won't",
 "wouldn't"]

### Lemmatization

In [16]:
#nltk.download('wordnet')

In [17]:
from nltk.stem import WordNetLemmatizer
# Lemmatization evaluates a word without cutting it down to its bare root, which is why lemmatization is generally preferred.

In [18]:
# Called without a part-of-speech argument; the word comes back unchanged here (see the output below).
WordNetLemmatizer().lemmatize("driving")

'driving'

In [19]:
# Create the lemmatizer once and reuse it for every token instead of
# instantiating a new WordNetLemmatizer inside the comprehension.
lemmatizer = WordNetLemmatizer()
lem = [lemmatizer.lemmatize(t) for t in token_without_sw]

In [20]:
# Show the lemmatized tokens.
lem

['oh', 'man', 'pretty', 'cool', 'thing', 'enjoy']

### Stemming

In [21]:
from nltk.stem import PorterStemmer
# Stemming cuts a word down to its root, so it cannot always preserve the word's exact meaning.

In [22]:
PorterStemmer().stem("driving")
# Although the word was meant as a noun, the stemmer reduced it to its root, which reads as the verb and changes the meaning.

'drive'

In [23]:
# Create the stemmer once and reuse it for every token instead of
# instantiating a new PorterStemmer inside the comprehension.
stemmer = PorterStemmer()
stem = [stemmer.stem(t) for t in token_without_sw]

In [24]:
# Show the stemmed tokens.
stem

['oh', 'man', 'pretti', 'cool', 'thing', 'enjoy']

### Joining

In [25]:
" ".join(lem)
# Join the lemmatized words into one string, separated by a single space (the " " separator).

'oh man pretty cool thing enjoy'

In [26]:
" ".join(stem)
# Join the stemmed words into one string, separated by a single space (the " " separator).

'oh man pretti cool thing enjoy'

In [27]:
", ".join(stem)
# Here the words are separated by a comma followed by a space.

'oh, man, pretti, cool, thing, enjoy'

### Cleaning Function

In [28]:
#from nltk.tokenize import sent_tokenize, word_tokenize
#from nltk.corpus import stopwords
#stop_words = stopwords.words("english")
#from nltk.stem import WordNetLemmatizer
def cleaning(data):
    """Normalize a raw text string into a space-separated string of lemmas.

    Steps: lower-case and word-tokenize the text, keep only alphabetic
    tokens, drop stop words, lemmatize, then join with single spaces.

    Relies on names defined in earlier cells: ``word_tokenize``,
    ``stop_words`` and ``WordNetLemmatizer``.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives the filters.
    """
    # Built once per call: set membership avoids scanning the stop-word list for every token.
    stop_word_set = set(stop_words)
    # One lemmatizer for all tokens instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()

    # 1. Tokenize
    text_tokens = word_tokenize(data.lower())
    # 2. Remove punctuation and numbers
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]
    # 3. Remove stop words
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_word_set]
    # 4. Lemmatize
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]
    # 5. Join back into a single string
    return " ".join(text_cleaned)
# The steps carried out one by one above are combined into a single function.

In [29]:
# pandas is imported here because this cell runs before the notebook's
# `import pandas as pd` cell; without it a fresh Restart & Run All raises NameError.
import pandas as pd

pd.Series(sample_text).apply(cleaning)

<IPython.core.display.Javascript object>

0    oh man pretty cool thing enjoy
dtype: object

### Count Vectorization and TF-IDF Vectorization

In [30]:
import pandas as pd
import numpy as np

In [31]:
# Load the airline tweets from a CSV file located relative to the working directory.
df = pd.read_csv("airline_tweets.csv")

In [32]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [33]:
# Keep only the sentiment label and the tweet text.
df = df.loc[:, ['airline_sentiment', 'text']]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [34]:
# Keep only the first eight rows.
df = df.head(8)
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...


In [35]:
# Work on a copy so that the cleaning step below leaves df untouched.
df2 = df.copy()

In [36]:
# Run the cleaning function defined above on every value of the text column.
df2["text"] = df2["text"].map(cleaning)

In [37]:
# Show the frame after cleaning the text column.
df2

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn said
1,positive,virginamerica plus added commercial experience...
2,neutral,virginamerica today must mean need take anothe...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing
5,negative,virginamerica seriously would pay flight seat ...
6,positive,virginamerica yes nearly every time fly vx ear...
7,neutral,virginamerica really missed prime opportunity ...


### CountVectorization

In [38]:
# Features are the cleaned tweets; the target is the sentiment label.
X, y = df2["text"], df2["airline_sentiment"]
# Note: these vectorizers do not capture the meaning of words; they only reflect how words occur in the texts.
# Deep learning models, by contrast, are generally used without lemmatization or stemming.

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Hold out half of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
vectorizer = CountVectorizer()
# Fit on the training texts only and encode them in the same call.
X_train_count = vectorizer.fit_transform(X_train)
# Encode the test texts with the already-fitted vectorizer (no refitting).
X_test_count = vectorizer.transform(X_test)

In [43]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns the terms as a NumPy array.
# The terms are the unique words of the training texts the vectorizer was fitted on.
vectorizer.get_feature_names_out()

['aggressive',
 'amp',
 'another',
 'away',
 'bad',
 'big',
 'blast',
 'ear',
 'entertainment',
 'every',
 'face',
 'fly',
 'go',
 'guest',
 'little',
 'mean',
 'must',
 'nearly',
 'need',
 'obnoxious',
 'really',
 'recourse',
 'take',
 'thing',
 'time',
 'today',
 'trip',
 'virginamerica',
 'vx',
 'worm',
 'yes']

In [44]:
# Element type of the encoded test matrix.
X_test_count.dtype

dtype('int64')

In [45]:
X_train_count.toarray()
# Each row (tweet) is now represented by numeric values.

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1, 1, 1, 1]], dtype=int64)

In [46]:
# Let pandas display up to 50 columns so the wide term tables below are not truncated.
pd.set_option('display.max_columns', 50)

In [47]:
# One row per training tweet, one column per vocabulary term. Each cell holds how many times
# the term occurs in that tweet (0 when it is absent).
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
pd.DataFrame(X_train_count.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,aggressive,amp,another,away,bad,big,blast,ear,entertainment,every,face,fly,go,guest,little,mean,must,nearly,need,obnoxious,really,recourse,take,thing,time,today,trip,virginamerica,vx,worm,yes
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,1,1,1,0,0,0
1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0
2,1,1,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,1


In [48]:
# Cleaned tweet with index label 6; its words match row 3 of the count table above.
X_train.loc[6]

'virginamerica yes nearly every time fly vx ear worm go away'

## TF-IDF

How sklearn's TF-IDF differs from the standard TF-IDF:
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF weights a word by how often it occurs within a document. IDF takes the logarithm of the ratio between the total number of documents and the number of documents that contain the word, so the more widely a word is used across the corpus, the lower its weight becomes; this balances the word weights.

In [50]:
tf_idf_vectorizer = TfidfVectorizer()
# fit collects the words of the training texts and their statistics; transform then applies
# the TF and IDF formulas to all of those words.
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
# The test texts are only transformed with the already-fitted vectorizer.
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

In [51]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns the terms as a NumPy array.
tf_idf_vectorizer.get_feature_names_out()

['aggressive',
 'amp',
 'another',
 'away',
 'bad',
 'big',
 'blast',
 'ear',
 'entertainment',
 'every',
 'face',
 'fly',
 'go',
 'guest',
 'little',
 'mean',
 'must',
 'nearly',
 'need',
 'obnoxious',
 'really',
 'recourse',
 'take',
 'thing',
 'time',
 'today',
 'trip',
 'virginamerica',
 'vx',
 'worm',
 'yes']

In [52]:
# Dense array view of the TF-IDF weights of the training texts.
X_train_tf_idf.toarray()

array([[0.        , 0.        , 0.37082034, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37082034, 0.37082034, 0.        , 0.37082034, 0.        ,
        0.        , 0.        , 0.37082034, 0.        , 0.        ,
        0.37082034, 0.37082034, 0.19350944, 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.50676543,
        0.50676543, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.39953968, 0.        , 0.        , 0.50676543, 0.        ,
        0.        , 0.        , 0.26445122, 0.        , 0.        ,
        0.        ],
       [0.31791864, 0.31791864, 0.        , 0.        , 0.        ,
        0.        , 0.31791864, 0.        , 0.31791864, 0.        ,
      

In [53]:
# One row per training tweet, one column per vocabulary term, holding the TF-IDF weights.
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
pd.DataFrame(X_train_tf_idf.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

Unnamed: 0,aggressive,amp,another,away,bad,big,blast,ear,entertainment,every,face,fly,go,guest,little,mean,must,nearly,need,obnoxious,really,recourse,take,thing,time,today,trip,virginamerica,vx,worm,yes
0,0.0,0.0,0.37082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37082,0.37082,0.0,0.37082,0.0,0.0,0.0,0.37082,0.0,0.0,0.37082,0.37082,0.193509,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.506765,0.506765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39954,0.0,0.0,0.506765,0.0,0.0,0.0,0.264451,0.0,0.0,0.0
2,0.317919,0.317919,0.0,0.0,0.0,0.0,0.317919,0.0,0.317919,0.0,0.317919,0.0,0.0,0.317919,0.317919,0.0,0.0,0.0,0.0,0.317919,0.250651,0.317919,0.0,0.0,0.0,0.0,0.0,0.165903,0.0,0.0,0.0
3,0.0,0.0,0.0,0.312008,0.0,0.0,0.0,0.312008,0.0,0.312008,0.0,0.312008,0.312008,0.0,0.0,0.0,0.0,0.312008,0.0,0.0,0.0,0.0,0.0,0.0,0.312008,0.0,0.0,0.162819,0.312008,0.312008,0.312008


In [54]:
# Cleaned tweet with index label 6, looked up explicitly by label.
X_train.loc[6]

'virginamerica yes nearly every time fly vx ear worm go away'

In [55]:
# TF-IDF weights of training row 1, sorted from highest to lowest.
# "virginamerica" appears in every training text, so TF-IDF gives it a lower weight than the
# other words that occur in this row.
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
pd.DataFrame(
    X_train_tf_idf.toarray(),
    columns=tf_idf_vectorizer.get_feature_names_out(),
).loc[1].sort_values(ascending=False)

bad              0.506765
big              0.506765
thing            0.506765
really           0.399540
virginamerica    0.264451
aggressive       0.000000
nearly           0.000000
worm             0.000000
vx               0.000000
trip             0.000000
today            0.000000
time             0.000000
take             0.000000
recourse         0.000000
obnoxious        0.000000
need             0.000000
mean             0.000000
must             0.000000
amp              0.000000
little           0.000000
guest            0.000000
go               0.000000
fly              0.000000
face             0.000000
every            0.000000
entertainment    0.000000
ear              0.000000
blast            0.000000
away             0.000000
another          0.000000
yes              0.000000
Name: 1, dtype: float64