In [3]:
# Sentence tokenization
import nltk
text = "Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."
nltk.download('punkt')
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    print(sentence)
    print()

Backgammon is one of the oldest known board games.

Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.

It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.



[nltk_data] Downloading package punkt to /home/elia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#Word segmentation
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words)
    print()

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.']

['Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.']

['It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']



In [15]:
# Lemmatization 
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

lemm = WordNetLemmatizer()
words = [w for w in nltk.word_tokenize(text) if w.isalnum()]
words_tags = nltk.pos_tag(words)

lemmatized_words = [lemm.lemmatize(word, pos=get_wordnet_pos(pos)) for word, pos in words_tags]
print(lemmatized_words)

['Backgammon', 'be', 'one', 'of', 'the', 'old', 'know', 'board', 'game', 'Its', 'history', 'can', 'be', 'trace', 'back', 'nearly', 'year', 'to', 'archeological', 'discovery', 'in', 'the', 'Middle', 'East', 'It', 'be', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'have', 'fifteen', 'checker', 'which', 'move', 'between', 'point', 'accord', 'to', 'the', 'roll', 'of', 'two', 'dice']


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/elia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/elia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))
without_stop_words = [word for word in lemmatized_words if not word in stop_words]
print(without_stop_words)

['Backgammon', 'one', 'old', 'know', 'board', 'game', 'Its', 'history', 'trace', 'back', 'nearly', 'year', 'archeological', 'discovery', 'Middle', 'East', 'It', 'two', 'player', 'game', 'player', 'fifteen', 'checker', 'move', 'point', 'accord', 'roll', 'two', 'dice']


[nltk_data] Downloading package stopwords to /home/elia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Così uso solo le parole che ho pulito prima.
count_vectorizer = CountVectorizer(vocabulary=set(without_stop_words))
bag_of_words = count_vectorizer.fit_transform(sentences)

feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)

Unnamed: 0,Backgammon,East,It,Its,Middle,accord,archeological,back,board,checker,...,move,nearly,old,one,player,point,roll,trace,two,year
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,2,0,1,0,2,0
