In [1]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
text = open("Text.txt").read()

In [3]:
text

'Football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal. Unqualified, the word football normally means the form of football that is the most popular where the word is used. Sports commonly called football include association football (known as soccer in North America, Ireland and Australia); gridiron football (specifically American football or Canadian football); Australian rules football; rugby union and rugby league; and Gaelic football. These various forms of football share to varying extents common origins and are known as "football codes".\n\nThere are a number of references to traditional, ancient, or prehistoric ball games played in many different parts of the world. Contemporary codes of football can be traced back to the codification of these games at English public schools during the 19th century. The expansion and cultural influence of the British Empire allowed these rules of football to spread to areas of British influence outs

## Tokenization

In [4]:
tokens = word_tokenize(text)

In [5]:
print(tokens)

['Football', 'is', 'a', 'family', 'of', 'team', 'sports', 'that', 'involve', ',', 'to', 'varying', 'degrees', ',', 'kicking', 'a', 'ball', 'to', 'score', 'a', 'goal', '.', 'Unqualified', ',', 'the', 'word', 'football', 'normally', 'means', 'the', 'form', 'of', 'football', 'that', 'is', 'the', 'most', 'popular', 'where', 'the', 'word', 'is', 'used', '.', 'Sports', 'commonly', 'called', 'football', 'include', 'association', 'football', '(', 'known', 'as', 'soccer', 'in', 'North', 'America', ',', 'Ireland', 'and', 'Australia', ')', ';', 'gridiron', 'football', '(', 'specifically', 'American', 'football', 'or', 'Canadian', 'football', ')', ';', 'Australian', 'rules', 'football', ';', 'rugby', 'union', 'and', 'rugby', 'league', ';', 'and', 'Gaelic', 'football', '.', 'These', 'various', 'forms', 'of', 'football', 'share', 'to', 'varying', 'extents', 'common', 'origins', 'and', 'are', 'known', 'as', '``', 'football', 'codes', "''", '.', 'There', 'are', 'a', 'number', 'of', 'references', 'to',

## POS Tagging

In [6]:
postags = pos_tag(tokens)

In [7]:
print(postags)

[('Football', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('family', 'NN'), ('of', 'IN'), ('team', 'NN'), ('sports', 'NNS'), ('that', 'WDT'), ('involve', 'VBP'), (',', ','), ('to', 'TO'), ('varying', 'VBG'), ('degrees', 'NNS'), (',', ','), ('kicking', 'VBG'), ('a', 'DT'), ('ball', 'NN'), ('to', 'TO'), ('score', 'VB'), ('a', 'DT'), ('goal', 'NN'), ('.', '.'), ('Unqualified', 'VBN'), (',', ','), ('the', 'DT'), ('word', 'NN'), ('football', 'NN'), ('normally', 'RB'), ('means', 'VBZ'), ('the', 'DT'), ('form', 'NN'), ('of', 'IN'), ('football', 'NN'), ('that', 'WDT'), ('is', 'VBZ'), ('the', 'DT'), ('most', 'RBS'), ('popular', 'JJ'), ('where', 'WRB'), ('the', 'DT'), ('word', 'NN'), ('is', 'VBZ'), ('used', 'VBN'), ('.', '.'), ('Sports', 'NNS'), ('commonly', 'RB'), ('called', 'VBD'), ('football', 'NN'), ('include', 'NN'), ('association', 'NN'), ('football', 'NN'), ('(', '('), ('known', 'VBN'), ('as', 'IN'), ('soccer', 'NN'), ('in', 'IN'), ('North', 'NNP'), ('America', 'NNP'), (',', ','), ('Ireland', 'NNP

## Removing stop words

In [8]:
print(set(stopwords.words('english')))

{'needn', "don't", 'your', 'be', 'if', 'until', 'further', 'only', 'than', 'then', 'wouldn', 'him', 'ours', 'a', 'once', 'ain', 'some', 'against', 'won', 'll', 'theirs', 'shan', 'yourselves', "mightn't", 'all', "wouldn't", 'from', 'o', 'hadn', 'these', 'most', 'its', 'being', 'who', 'of', 'just', 'how', 'she', 'that', 'there', 'this', 'doesn', 'you', 'been', "you've", 'in', 'into', 're', 'are', 'such', 'about', 'over', "haven't", 'out', 'me', 'them', "mustn't", "should've", 'what', "it's", 'mustn', 'myself', 'didn', 'y', 'when', 'not', 'having', 'or', "didn't", 'couldn', 'so', 'weren', 'our', 'we', "you're", 'isn', 'have', 'because', 'more', 'while', 'hers', 'is', 'am', 'above', 'had', 'both', 'her', 't', 'should', "she's", "you'd", "hadn't", 'off', 'ourselves', "shan't", 'same', 'during', 'herself', 'here', 'nor', 'ma', 'does', 'under', 'for', 'aren', 'i', "hasn't", 've', 'my', "wasn't", 'where', 'his', 'between', "won't", 'did', 'hasn', 'their', 'with', 'each', 's', 'will', 'an', "co

In [9]:
stop_words = set(stopwords.words('english'))

In [10]:
print(stop_words)

{'needn', "don't", 'your', 'be', 'if', 'until', 'further', 'only', 'than', 'then', 'wouldn', 'him', 'ours', 'a', 'once', 'ain', 'some', 'against', 'won', 'll', 'theirs', 'shan', 'yourselves', "mightn't", 'all', "wouldn't", 'from', 'o', 'hadn', 'these', 'most', 'its', 'being', 'who', 'of', 'just', 'how', 'she', 'that', 'there', 'this', 'doesn', 'you', 'been', "you've", 'in', 'into', 're', 'are', 'such', 'about', 'over', "haven't", 'out', 'me', 'them', "mustn't", "should've", 'what', "it's", 'mustn', 'myself', 'didn', 'y', 'when', 'not', 'having', 'or', "didn't", 'couldn', 'so', 'weren', 'our', 'we', "you're", 'isn', 'have', 'because', 'more', 'while', 'hers', 'is', 'am', 'above', 'had', 'both', 'her', 't', 'should', "she's", "you'd", "hadn't", 'off', 'ourselves', "shan't", 'same', 'during', 'herself', 'here', 'nor', 'ma', 'does', 'under', 'for', 'aren', 'i', "hasn't", 've', 'my', "wasn't", 'where', 'his', 'between', "won't", 'did', 'hasn', 'their', 'with', 'each', 's', 'will', 'an', "co

In [11]:
li = []
for words in tokens:
    if words not in stop_words:
        li.append(words)

In [12]:
print(li)

['Football', 'family', 'team', 'sports', 'involve', ',', 'varying', 'degrees', ',', 'kicking', 'ball', 'score', 'goal', '.', 'Unqualified', ',', 'word', 'football', 'normally', 'means', 'form', 'football', 'popular', 'word', 'used', '.', 'Sports', 'commonly', 'called', 'football', 'include', 'association', 'football', '(', 'known', 'soccer', 'North', 'America', ',', 'Ireland', 'Australia', ')', ';', 'gridiron', 'football', '(', 'specifically', 'American', 'football', 'Canadian', 'football', ')', ';', 'Australian', 'rules', 'football', ';', 'rugby', 'union', 'rugby', 'league', ';', 'Gaelic', 'football', '.', 'These', 'various', 'forms', 'football', 'share', 'varying', 'extents', 'common', 'origins', 'known', '``', 'football', 'codes', "''", '.', 'There', 'number', 'references', 'traditional', ',', 'ancient', ',', 'prehistoric', 'ball', 'games', 'played', 'many', 'different', 'parts', 'world', '.', 'Contemporary', 'codes', 'football', 'traced', 'back', 'codification', 'games', 'English',

## Stemming

In [13]:
ps = PorterStemmer()

In [14]:
stemlist = []
for words in li:
    stemlist.append([words, ps.stem(words)])

In [15]:
print(stemlist)

[['Football', 'footbal'], ['family', 'famili'], ['team', 'team'], ['sports', 'sport'], ['involve', 'involv'], [',', ','], ['varying', 'vari'], ['degrees', 'degre'], [',', ','], ['kicking', 'kick'], ['ball', 'ball'], ['score', 'score'], ['goal', 'goal'], ['.', '.'], ['Unqualified', 'unqualifi'], [',', ','], ['word', 'word'], ['football', 'footbal'], ['normally', 'normal'], ['means', 'mean'], ['form', 'form'], ['football', 'footbal'], ['popular', 'popular'], ['word', 'word'], ['used', 'use'], ['.', '.'], ['Sports', 'sport'], ['commonly', 'commonli'], ['called', 'call'], ['football', 'footbal'], ['include', 'includ'], ['association', 'associ'], ['football', 'footbal'], ['(', '('], ['known', 'known'], ['soccer', 'soccer'], ['North', 'north'], ['America', 'america'], [',', ','], ['Ireland', 'ireland'], ['Australia', 'australia'], [')', ')'], [';', ';'], ['gridiron', 'gridiron'], ['football', 'footbal'], ['(', '('], ['specifically', 'specif'], ['American', 'american'], ['football', 'footbal'

## Lemmitization

In [16]:
wl = WordNetLemmatizer()

In [17]:
lemilist = []
for words in li:
    lemilist.append([words, wl.lemmatize(words)])

In [18]:
print(lemilist)

[['Football', 'Football'], ['family', 'family'], ['team', 'team'], ['sports', 'sport'], ['involve', 'involve'], [',', ','], ['varying', 'varying'], ['degrees', 'degree'], [',', ','], ['kicking', 'kicking'], ['ball', 'ball'], ['score', 'score'], ['goal', 'goal'], ['.', '.'], ['Unqualified', 'Unqualified'], [',', ','], ['word', 'word'], ['football', 'football'], ['normally', 'normally'], ['means', 'mean'], ['form', 'form'], ['football', 'football'], ['popular', 'popular'], ['word', 'word'], ['used', 'used'], ['.', '.'], ['Sports', 'Sports'], ['commonly', 'commonly'], ['called', 'called'], ['football', 'football'], ['include', 'include'], ['association', 'association'], ['football', 'football'], ['(', '('], ['known', 'known'], ['soccer', 'soccer'], ['North', 'North'], ['America', 'America'], [',', ','], ['Ireland', 'Ireland'], ['Australia', 'Australia'], [')', ')'], [';', ';'], ['gridiron', 'gridiron'], ['football', 'football'], ['(', '('], ['specifically', 'specifically'], ['American', '

## Term Frequency

In [19]:
fre = dict()
for words in li:
    if words in fre:
        fre[words] += 1
    else:
        fre[words] = 1

In [20]:
print(fre)

{'Football': 2, 'family': 1, 'team': 2, 'sports': 2, 'involve': 1, ',': 12, 'varying': 2, 'degrees': 1, 'kicking': 1, 'ball': 2, 'score': 1, 'goal': 1, '.': 10, 'Unqualified': 1, 'word': 2, 'football': 17, 'normally': 1, 'means': 1, 'form': 1, 'popular': 2, 'used': 1, 'Sports': 1, 'commonly': 1, 'called': 1, 'include': 1, 'association': 1, '(': 2, 'known': 2, 'soccer': 1, 'North': 1, 'America': 1, 'Ireland': 1, 'Australia': 1, ')': 2, ';': 4, 'gridiron': 1, 'specifically': 1, 'American': 1, 'Canadian': 1, 'Australian': 1, 'rules': 3, 'rugby': 2, 'union': 1, 'league': 1, 'Gaelic': 2, 'These': 1, 'various': 2, 'forms': 1, 'share': 1, 'extents': 1, 'common': 1, 'origins': 1, '``': 1, 'codes': 3, "''": 1, 'There': 1, 'number': 1, 'references': 1, 'traditional': 2, 'ancient': 1, 'prehistoric': 1, 'games': 3, 'played': 1, 'many': 2, 'different': 1, 'parts': 1, 'world': 2, 'Contemporary': 1, 'traced': 1, 'back': 1, 'codification': 1, 'English': 1, 'public': 1, 'schools': 1, '19th': 2, 'centur

## Inverse document frequency

In [22]:
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(li)
print(result)

  (0, 45)	1.0
  (1, 43)	1.0
  (2, 93)	1.0
  (3, 91)	1.0
  (4, 59)	1.0
  (6, 103)	1.0
  (7, 29)	1.0
  (9, 61)	1.0
  (10, 14)	1.0
  (11, 86)	1.0
  (12, 51)	1.0
  (14, 100)	1.0
  (16, 104)	1.0
  (17, 45)	1.0
  (18, 69)	1.0
  (19, 68)	1.0
  (20, 46)	1.0
  (21, 45)	1.0
  (22, 77)	1.0
  (23, 104)	1.0
  (24, 101)	1.0
  (26, 91)	1.0
  (27, 25)	1.0
  (28, 19)	1.0
  (29, 45)	1.0
  :	:
  (155, 0)	1.0
  (157, 94)	1.0
  (158, 45)	1.0
  (159, 64)	1.0
  (160, 48)	1.0
  (161, 38)	1.0
  (163, 16)	1.0
  (164, 44)	1.0
  (165, 67)	1.0
  (166, 79)	1.0
  (167, 45)	1.0
  (168, 10)	1.0
  (170, 35)	1.0
  (171, 2)	1.0
  (172, 21)	1.0
  (174, 87)	1.0
  (175, 102)	1.0
  (176, 62)	1.0
  (177, 45)	1.0
  (178, 52)	1.0
  (179, 15)	1.0
  (180, 77)	1.0
  (181, 93)	1.0
  (182, 91)	1.0
  (183, 105)	1.0
