<a href="https://colab.research.google.com/github/dD2405/ML_ON1-Acadview/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing NLTK 

In [3]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Sentence Tokenization

In [4]:
text = "Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."
sentences = nltk.sent_tokenize(text)
print(sentences)

['Backgammon is one of the oldest known board games.', 'Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.']


## Word Tokenization

In [5]:
words = nltk.word_tokenize(text)
print(words)  
  

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.', 'Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East.It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']


## Text Lemmatization and Stemming

In [6]:
from nltk import PorterStemmer,WordNetLemmatizer

ps = PorterStemmer()
wnl = WordNetLemmatizer()

def compare(ps,wnl,word):
  print('Stemmer:',ps.stem(word))
  print('Lemmatizer:',wnl.lemmatize(word))
  print('----------------------------------------------------------------------------------------------------------------')
  
  
li = ['Seen','Playing','better','worse']
for i in li:
  compare(ps,wnl,i)

Stemmer: seen
Lemmatizer: Seen
----------------------------------------------------------------------------------------------------------------
Stemmer: play
Lemmatizer: Playing
----------------------------------------------------------------------------------------------------------------
Stemmer: better
Lemmatizer: better
----------------------------------------------------------------------------------------------------------------
Stemmer: wors
Lemmatizer: worse
----------------------------------------------------------------------------------------------------------------


## Stop Words

In [7]:
from nltk.corpus import stopwords
sw = stopwords.words('english')
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
stop_words = set(stopwords.words("english"))
sentence = "Backgammon is one of the oldest known board games."

words = nltk.word_tokenize(sentence)
without_stop_words = [word for word in words if not word in stop_words]
print(without_stop_words)

['Backgammon', 'one', 'oldest', 'known', 'board', 'games', '.']


## Regular Expressions(RegEx)

In [9]:
import re
sentence = "The development of snowboarding was inspired by skateboarding, sledding, surfing and skiing."
pattern = r"[^\w]"
print(re.sub(pattern, " ", sentence))

The development of snowboarding was inspired by skateboarding  sledding  surfing and skiing 


## Bag of Words

In [0]:
documents = ["I like this movie, it's funny very funny.", 'I hate this movie.', 'This was awesome! I like it.', 'Nice one. I love it.']

#### The task here is to convert each raw text into a vector of numbers. After that, we can use these vectors as input for a machine learning model. The simplest scoring method is to mark the presence of words with 1 for present and 0 for absence.

In [22]:
# Importing the necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Design the Vocabulary
# The default token pattern removes tokens of a single character. 
# That's why we don't have the "I" and "s" tokens in the output.
cv = CountVectorizer()

# Create the bag-of-words model
bag_of_words = cv.fit_transform(documents)

feature_names = cv.get_feature_names()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)

Unnamed: 0,awesome,funny,hate,it,like,love,movie,nice,one,this,very,was
0,0,2,0,1,1,0,1,0,0,1,1,0
1,0,0,1,0,0,0,1,0,0,1,0,0
2,1,0,0,1,1,0,0,0,0,1,0,1
3,0,0,0,1,0,1,0,1,1,0,0,0


## TF-IDF

![tf](https://cdn-images-1.medium.com/max/800/1*V9ac4hLVyms79jl65Ym_Bw.png)

In [0]:
document = ["I like this movie, it's really very funny" ,"I hate this movie,I really hate it"]

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
values = tf.fit_transform(document)

feature_names = tf.get_feature_names()
pd.DataFrame(values.toarray(),columns=feature_names)

Unnamed: 0,funny,hate,it,like,movie,really,this,very
0,0.446101,0.0,0.317404,0.446101,0.317404,0.317404,0.317404,0.446101
1,0.0,0.814802,0.289869,0.0,0.289869,0.289869,0.289869,0.0
