# Natural Language Processing with NLTK

In addition to the `nltk` package itself, the following submodules are required.

In [15]:
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

To use the modules above, the following NLTK data packages must be downloaded as well.

In [None]:
# Download each NLTK data package this notebook requests, in the original order.
for resource in (
    'punkt',
    'wordnet',
    'omw-1.4',
    'stopwords',
    'averaged_perceptron_tagger',
):
    nltk.download(resource)

The corpus is assumed to be located on Google Drive.

In [28]:
# Read the whole corpus into memory as a single string.
# The context manager closes the handle as soon as the read finishes (the
# original left it open), and the explicit encoding keeps decoding independent
# of the platform's default locale.
with open("drive/MyDrive/datasets/hamlet.txt", encoding="utf-8") as file:
    text = file.read()

## Segmentation

In [None]:
# Sentence segmentation: split the corpus into sentences, echo each one and
# save them to sentence.txt.
print("Sentence Segmentation")
sentences = sent_tokenize(text)
# The context manager closes the file even if printing or writing fails.
with open("sentence.txt", "w", encoding="utf-8") as f1:
    for s in sentences:
        print(s)
        # Terminate every sentence with a newline; without it consecutive
        # sentences are fused together in the output file.
        f1.write(s + "\n")

## Tokenization

In [None]:
# Tokenization: split the corpus into word tokens, echo each one and save
# them to tokenization.txt, one token per line.
print("Tokenization")
words = word_tokenize(text)
# The context manager closes the file even if printing or writing fails.
with open("tokenization.txt", "w", encoding="utf-8") as f2:
    for w in words:
        print(w)
        # w is already a str, so it is written directly
        # (the original "".join(w) was a no-op).
        f2.write(w + "\n")

## Stemming

In [None]:
# Stemming: reduce every token to its stem with both the Porter and the
# Snowball (English) stemmers, and save the "token : stem" pairs to one file
# per stemmer.
print("Stemming")
ps = PorterStemmer()
ss = SnowballStemmer("english")
# Both files are managed by a single `with`, so they are closed even if
# stemming, printing or writing raises part-way through the loop.
with open("porter-stemming.txt", "w", encoding="utf-8") as f3a, \
        open("snowball-stemming.txt", "w", encoding="utf-8") as f3b:
    for w in words:
        psresult = w + " : " + ps.stem(w) + "\n"
        ssresult = w + " : " + ss.stem(w) + "\n"
        print("Porter Stemmer")
        print(psresult)
        print("Snowball Stemmer")
        print(ssresult)
        f3a.write(psresult)
        f3b.write(ssresult)

## Lemmatization

In [None]:
# Lemmatization: map every token to its WordNet lemma and save the
# "token : lemma" pairs to lemmatization.txt, one pair per line.
print("Lemmatization")
lemmatizer = WordNetLemmatizer()
# The context manager closes the file even if lemmatizing or writing fails.
with open("lemmatization.txt", "w", encoding="utf-8") as f4:
    for w in words:
        lem = w + " : " + lemmatizer.lemmatize(w)
        print(lem)
        # Terminate each entry with a newline; without it all pairs run
        # together on a single line of the output file.
        f4.write(lem + "\n")

## Stopwords

In [None]:
# Stop words: split the tokens into stop words (stopwords.txt) and all other
# tokens (non-stopwords.txt), one token per line.
print("Stopwords")
stop_words = stopwords.words('english')
# Keep `stop_words` as the list returned by NLTK (a later cell reads it) and
# use a separate set for O(1) membership tests inside the loop.
stop_word_set = set(stop_words)

# Both files are managed by a single `with`, so they are closed even if
# printing or writing raises part-way through the loop.
with open("non-stopwords.txt", "w", encoding="utf-8") as f5a, \
        open("stopwords.txt", "w", encoding="utf-8") as f5b:
    for r in words:
        line = r + "\n"
        print(line)
        # NLTK's English stop word list holds lowercase entries, so compare
        # the lowercased token; otherwise capitalised tokens such as "The"
        # are misclassified as non-stop-words. The token itself is written
        # unchanged.
        if r.lower() in stop_word_set:
            f5b.write(line)
        else:
            f5a.write(line)

## Part-of-Speech

In [None]:
from pprint import pprint

# Part-of-speech tagging: tokenize each sentence, drop stop words, tag the
# remaining tokens and save "token \t tag" lines to part-of-speech.txt.
print("Part-of-speech")
# Build the lookup set once; testing membership against the list inside the
# loop costs O(len(stop_words)) per token.
pos_stop_word_set = set(stop_words)
# The context manager closes the file even if tagging or writing fails.
with open("part-of-speech.txt", "w", encoding="utf-8") as f6:
    for sentence in sentences:
        wordsList = nltk.word_tokenize(sentence)

        # Remove stop words. NLTK's English stop word list holds lowercase
        # entries, so compare the lowercased token; otherwise capitalised
        # stop words such as "The" slip through the filter.
        wordsList = [w for w in wordsList if w.lower() not in pos_stop_word_set]

        # Tag the remaining tokens with the NLTK POS tagger; each result is
        # a (token, tag) tuple.
        tagged = nltk.pos_tag(wordsList)

        for t in tagged:
            pprint(t)
            f6.write(f'{t[0]} \t {t[1]}\n')

## Removing Punctuation

In [None]:
# Removing punctuation: tokenize on word/punctuation boundaries, keep only
# purely alphabetic tokens, and save them lowercased to
# removing-punctuation.txt, one token per line.
print("Removing Punctuation")
tokens = nltk.wordpunct_tokenize(text)
txt = nltk.Text(tokens)
# The context manager closes the file even if printing or writing fails.
with open("removing-punctuation.txt", "w", encoding="utf-8") as f7:
    for w in txt:
        if w.isalpha():
            # Lowercase once and reuse it for both the echo and the file
            # (the original "".join(...) around a str was a no-op).
            lowered = w.lower()
            print(lowered)
            f7.write(lowered + "\n")