# Cleaning the dataset

In [1]:
import re

# Sample sentence that the preprocessing steps below operate on.
raw_text = "Hello!, I am learning NLP. NLP is great area :)"

# Matches every character that is neither a word character (\w) nor
# whitespace (\s), i.e. what this notebook treats as punctuation.
punctuation = re.compile(r'[^\w\s]')

# Lowercase first, then delete the punctuation characters. Whitespace is left
# untouched, so the space that preceded the removed ":)" is still present at
# the end of the result.
text = punctuation.sub('', raw_text.lower())
print(text)


hello i am learning nlp nlp is great area 


# Tokenization

In [2]:
from nltk.tokenize import word_tokenize

# Split the cleaned `text` from the previous cell into a list of word tokens.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# installed locally; no download step is visible in this notebook — confirm.
tokens = word_tokenize(text)  # list of token strings, in order of appearance

print(tokens)

['hello', 'i', 'am', 'learning', 'nlp', 'nlp', 'is', 'great', 'area']


# Stopword removal

In [3]:
from nltk.corpus import stopwords

# Build the English stop-word set under its own name. Rebinding `stopwords`
# here would shadow the imported corpus object, and re-running this cell would
# then fail because a set has no `.words` method.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not English stop words; token order and
# duplicates are preserved.
filtered_tokens = [word for word in tokens if word not in english_stopwords]
print(filtered_tokens)

['hello', 'learning', 'nlp', 'nlp', 'great', 'area']


# Stemming / Lemmatization

In [5]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Small sample used to contrast stemming with lemmatization.
words = ["running", "flies", "better", "easily"]

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Apply each normaliser to every sample word, one word at a time.
stemmed_words = list(map(stemmer.stem, words))
# lemmatize() is called with the word only (no part-of-speech argument); in the
# recorded output "running" and "better" come back unchanged.
lemmatized_words = list(map(lemmatizer.lemmatize, words))

print("Stemmed words: ", stemmed_words)
print("Lemmatized words: ", lemmatized_words)

Stemmed words:  ['run', 'fli', 'better', 'easili']
Lemmatized words:  ['running', 'fly', 'better', 'easily']


# Vectorize

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short example documents to vectorise.
data = [
    "Natural Language is fun.",
    "Leaning NLP opens many doors.",
    "Studyin NLp makes you feel happy.",
]

# Learn the vocabulary and compute the TF-IDF weights in a single pass.
# X has one row per document and one column per vocabulary term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)

# Vocabulary terms, in the same order as the matrix columns. In the recorded
# output "NLP" and "NLp" both end up under the single feature 'nlp'.
feature_names = vectorizer.get_feature_names_out()
# Dense copy of the result so the whole matrix is printed.
tfidf_dense = X.toarray()

print("Feature names:", feature_names)
print("TF-IDF matrix:\n", tfidf_dense)

Feature names: ['doors' 'feel' 'fun' 'happy' 'is' 'language' 'leaning' 'makes' 'many'
 'natural' 'nlp' 'opens' 'studyin' 'you']
TF-IDF matrix:
 [[0.         0.         0.5        0.         0.5        0.5
  0.         0.         0.         0.5        0.         0.
  0.         0.        ]
 [0.46735098 0.         0.         0.         0.         0.
  0.46735098 0.         0.46735098 0.         0.35543247 0.46735098
  0.         0.        ]
 [0.         0.42339448 0.         0.42339448 0.         0.
  0.         0.42339448 0.         0.         0.32200242 0.
  0.42339448 0.42339448]]


### Turkish version of stemming and lemmatization

In [7]:
!pip install snowballstemmer
!pip install trnlp

Collecting snowballstemmer
  Downloading snowballstemmer-3.0.1-py3-none-any.whl.metadata (7.9 kB)
Downloading snowballstemmer-3.0.1-py3-none-any.whl (103 kB)
Installing collected packages: snowballstemmer
Successfully installed snowballstemmer-3.0.1
Collecting trnlp
  Downloading trnlp-0.2.3a0.tar.gz (10.1 MB)
     ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
     -- ------------------------------------- 0.5/10.1 MB 2.1 MB/s eta 0:00:05
     ------- -------------------------------- 1.8/10.1 MB 4.2 MB/s eta 0:00:02
     --------------- ------------------------ 3.9/10.1 MB 6.2 MB/s eta 0:00:01
     -------------------- ------------------- 5.2/10.1 MB 7.2 MB/s eta 0:00:01
     --------------------- ------------------ 5.5/10.1 MB 5.2 MB/s eta 0:00:01
     --------------------------------- ------ 8.4/10.1 MB 6.7 MB/s eta 0:00:0

  DEPRECATION: Building 'trnlp' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'trnlp'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [12]:
from snowballstemmer import TurkishStemmer
from trnlp import TrnlpWord

# Turkish sample words, shared by the stemming and lemmatization demos below.
words = ["koşuyor", "uçuyor", "iyi", "kolayca"]

# --- Stemming with the Snowball Turkish stemmer ---
stemmer = TurkishStemmer()
stems = list(map(stemmer.stemWord, words))

# Label reads "Words reduced to their stems:".
print("Köklerine ayrılmış kelimeler:", stems)


def _trnlp_stem(word):
    """Return the stem that trnlp reports for a single word.

    A fresh TrnlpWord is created for each word, the word is loaded with
    setword(), and get_stem is read as an attribute (it is not called).
    """
    analyzer = TrnlpWord()
    analyzer.setword(word)
    return analyzer.get_stem


# --- Lemmatization with trnlp ---
lemmas = [_trnlp_stem(word) for word in words]

# Label reads "Lemmatized words:".
print("Lemmatize edilmiş kelimeler:", lemmas)


Köklerine ayrılmış kelimeler: ['koşuyor', 'uçuyor', 'i', 'kolay']
Lemmatize edilmiş kelimeler: ['koş', 'uç', 'iyi', 'kolay']
