### Install and Import

In [1]:
!pip install nltk



In [2]:
import nltk
import pandas as pd

In [3]:
# Small sample used to demonstrate every preprocessing step below.
# NOTE(review): the recorded tokenizer outputs end with '02.08.2021', which this
# string cannot produce — they appear to come from an earlier version of the
# text. Re-run the notebook to refresh them.
sample_text= "Oh man, this is pretty cool. We will do more such things. Don't enjoy. 2"

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize
# sent_tokenize means "sentence tokenize". word_tokenize is the one generally used.

### Tokenization

In [5]:
# Tokenizer models needed by sent_tokenize / word_tokenize; without them a fresh
# environment fails on the next cell. A no-op when the package is already present.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')

In [6]:
sentence_token = sent_tokenize(sample_text.lower())
sentence_token
# We convert every word to lowercase so that all the text in our data can be compared consistently.

['oh man, this is pretty cool.', 'we will do more such things.', '02.08.2021']

In [7]:
# Split the lowercased sample into word-level tokens; punctuation marks come out as tokens of their own.
word_token = word_tokenize(sample_text.lower())
word_token

['oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 '02.08.2021']

### Removing Punctuation and Numbers

In [8]:
# str.isalpha keeps only the purely alphabetic tokens, which removes numbers and
# punctuation from the tokenized data. If numbers matter for the task, use
# str.isalnum instead: it keeps both alphabetic and numeric tokens.
tokens_without_punc = list(filter(str.isalpha, word_token))
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things']

### Removing Stopwords

In [9]:
# The stopword corpus is required by stopwords.words() below; the download is a
# no-op when the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kurubal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords

In [11]:
# Display NLTK's English stopword list.
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
# English stopwords kept as a list; later cells filter tokens against it and search it by substring.
stop_words = stopwords.words("english")

In [13]:
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things']

In [20]:
# Drop every token that appears in the English stopword list.
token_without_sw = [t for t in tokens_without_punc if t not in stop_words]
token_without_sw

['oh', 'man', 'pretty', 'cool', 'things']

In [15]:
# If negation words matter for the task, this expression separates the negative
# (n't) stopwords from the rest of the list.
# Note: when using BERT or similar deep-learning models, do not remove stopwords.
[word for word in stop_words if "n't" in word]

["don't",
 "aren't",
 "couldn't",
 "didn't",
 "doesn't",
 "hadn't",
 "hasn't",
 "haven't",
 "isn't",
 "mightn't",
 "mustn't",
 "needn't",
 "shan't",
 "shouldn't",
 "wasn't",
 "weren't",
 "won't",
 "wouldn't"]

### Lemmatization

In [16]:
# Fetch the WordNet corpus; presumably required by the WordNetLemmatizer used below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/kurubal/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
from nltk.stem import WordNetLemmatizer
# Lemmatization evaluates a word without cutting it all the way down to its root, which is why lemmatization is generally preferred.

In [26]:
# NOTE(review): the recorded result is the unchanged word 'driving' — presumably
# because lemmatize() treats its input as a noun unless a part of speech is
# given; try pos="v" to confirm.
WordNetLemmatizer().lemmatize("driving")

'driving'

In [23]:
# Lemmatize each remaining token with a single lemmatizer instance (the previous
# version constructed a new WordNetLemmatizer for every token).
lem = list(map(WordNetLemmatizer().lemmatize, token_without_sw))

In [24]:
lem

['oh', 'man', 'pretty', 'cool', 'thing']

### Stemming

In [25]:
from nltk.stem import PorterStemmer
# Because stemming cuts a word down to its root, it may not always preserve the word's full meaning.

In [28]:
# Stem a single word with the Porter stemmer.
PorterStemmer().stem("driving")

'drive'

In [29]:
# Stem each remaining token with a single stemmer instance (the previous version
# constructed a new PorterStemmer for every token).
stem = list(map(PorterStemmer().stem, token_without_sw))

In [30]:
stem

['oh', 'man', 'pretti', 'cool', 'thing']

### Joining

In [32]:
" ".join(lem)

'oh man pretty cool thing'

### Cleaning Function

In [33]:
def cleaning(data):
    """Turn one raw text string into a space-joined string of lemmas.

    The pipeline lowercases and tokenizes the text, keeps only purely
    alphabetic tokens, drops tokens found in the module-level ``stop_words``
    and lemmatizes what is left.

    Args:
        data: The text to clean; must provide ``.lower()``.

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    # One lemmatizer per call instead of one per token.
    lemmatizer = WordNetLemmatizer()
    # Set membership is O(1); scanning the stop_words list for every token is not.
    stop_set = set(stop_words)

    # 1. Tokenize the lowercased text.
    text_tokens = word_tokenize(data.lower())
    # 2-3. Keep alphabetic tokens that are not stopwords (drops punctuation and numbers).
    kept_tokens = [t for t in text_tokens if t.isalpha() and t not in stop_set]
    # 4. Lemmatize and join back into a single string.
    return " ".join(lemmatizer.lemmatize(t) for t in kept_tokens)

In [34]:
# Wrap the sample string in a one-element Series and run the cleaning function on it.
pd.Series(sample_text).apply(cleaning)

<IPython.core.display.Javascript object>

0    oh man pretty cool thing
dtype: object

### Count Vectorization and TF-IDF Vectorization

In [35]:
# Load the airline tweets from a CSV file located in the working directory.
df = pd.read_csv("airline_tweets.csv")

<IPython.core.display.Javascript object>

In [37]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [38]:
# Keep only the sentiment label and the raw tweet text.
df = df[['airline_sentiment','text']]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [39]:
import pandas as pd
import numpy as np

In [40]:
# Keep only the first 8 rows (all columns) for the vectorization demo.
df = df.head(8)
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...


In [41]:
# Work on a copy so that overwriting the text column below leaves df unchanged.
df2 = df.copy()

In [42]:
# Replace each raw tweet with its cleaned version.
df2["text"] = df2["text"].apply(cleaning)

In [43]:
df2

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn said
1,positive,virginamerica plus added commercial experience...
2,neutral,virginamerica today must mean need take anothe...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing
5,negative,virginamerica seriously would pay flight seat ...
6,positive,virginamerica yes nearly every time fly vx ear...
7,neutral,virginamerica really missed prime opportunity ...


### CountVectorization

In [44]:
# Features are the cleaned tweet texts; the target is the sentiment label.
X = df2["text"]
y = df2["airline_sentiment"]

In [45]:
from sklearn.model_selection import train_test_split

In [47]:
# Hold out half of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

In [49]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to encode the test texts.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

In [53]:
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. It returns an array, so convert it to keep the plain-list display.
vectorizer.get_feature_names_out().tolist()

['aggressive',
 'amp',
 'another',
 'away',
 'bad',
 'big',
 'blast',
 'ear',
 'entertainment',
 'every',
 'face',
 'fly',
 'go',
 'guest',
 'little',
 'mean',
 'must',
 'nearly',
 'need',
 'obnoxious',
 'really',
 'recourse',
 'take',
 'thing',
 'time',
 'today',
 'trip',
 'virginamerica',
 'vx',
 'worm',
 'yes']

In [50]:
# Convert the vectorizer output to a dense array so the counts can be inspected.
X_train_count.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1, 1, 1, 1]])

In [51]:
# One row per training document, one column per vocabulary term.
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
pd.DataFrame(X_train_count.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,aggressive,amp,another,away,bad,big,blast,ear,entertainment,every,face,fly,go,guest,little,mean,must,nearly,need,obnoxious,really,recourse,take,thing,time,today,trip,virginamerica,vx,worm,yes
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,1,1,1,0,0,0
1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0
2,1,1,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,1


In [54]:
# Look up the cleaned tweet whose original row label is 6 (label-based, not positional).
X_train.loc[6]

'virginamerica yes nearly every time fly vx ear worm go away'

## TF-IDF

sklearn TF-IDF
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d