In [6]:
# Sample input document: a dataset description about sentences extracted from
# a Wikipedia dump. It is the raw text fed to the sentence tokenizer and the
# vectorizers in the following cells.
paragraph="""
Context
The wikipedia dump is a giant XML file and contains loads of not-so-useful content. I needed some english text for some unsupervised learning so I spent quite a bit of time extracting and cleaning up the text.

Content
Each line of the txt file is a 'sentence'. I put sentence in quote because the content in these files haven't been read all the way through for errors. Here is what I did:

Parsed out the opening text on non-disambiguation and non-table-of-contents pages.
Removed sentences requiring citations, because these were usually poorly formed.
Parse each block of text into sentences using SpaCy. I then checked for bracket and quote correctness, filtering out sentences that didn't quite match up.
Removed sentences shorter than 3 letters and longer than 255 characters. This covers 97% of the data.
Remove duplicate sentences, and, as a byproduct, sorted alphabetically.
"""

In [7]:
# Import nltk in this cell: the only other import of it sits in a later cell,
# so on a fresh kernel (Restart & Run All) this line would raise NameError.
import nltk

# Split the raw paragraph into a list of sentence strings.
# NOTE: sent_tokenize needs NLTK's Punkt tokenizer data to be installed
# (e.g. via nltk.download) -- TODO confirm it is available in the target env.
sentences = nltk.sent_tokenize(paragraph)

In [9]:
# Show the tokenized sentences to check where the paragraph was split.
# Newlines inside the paragraph are kept within each sentence string.
print(sentences)

['\nContext\nThe wikipedia dump is a giant XML file and contains loads of not-so-useful content.', 'I needed some english text for some unsupervised learning so I spent quite a bit of time extracting and cleaning up the text.', "Content\nEach line of the txt file is a 'sentence'.", "I put sentence in quote because the content in these files haven't been read all the way through for errors.", 'Here is what I did:\n\nParsed out the opening text on non-disambiguation and non-table-of-contents pages.', 'Removed sentences requiring citations, because these were usually poorly formed.', 'Parse each block of text into sentences using SpaCy.', "I then checked for bracket and quote correctness, filtering out sentences that didn't quite match up.", 'Removed sentences shorter than 3 letters and longer than 255 characters.', 'This covers 97% of the data.', 'Remove duplicate sentences, and, as a byproduct, sorted alphabetically.']


In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Text clean-up: turn each raw sentence into a space-joined string of
# lowercase, stop-word-free, lemmatized tokens and collect them in `corpus`.
lemmatizer = WordNetLemmatizer()

# Build the stop-word set and the regex once, outside the loop; both are
# loop-invariant, and rebuilding the set for every token repeats the same work.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
for sentence in sentences:
    # Replace every non-ASCII-letter character (digits, punctuation, newlines)
    # with a space, then lowercase and split on whitespace.
    tokens = non_letters.sub(' ', sentence).lower().split()
    # Stop words are filtered on the surface form *before* lemmatization;
    # lemmatize() is called with its default part of speech.
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(kept))



In [11]:
# Inspect the cleaned corpus: one lowercase, stop-word-free, lemmatized
# string per input sentence.
corpus

['context wikipedia dump giant xml file contains load useful content',
 'needed english text unsupervised learning spent quite bit time extracting cleaning text',
 'content line txt file sentence',
 'put sentence quote content file read way error',
 'parsed opening text non disambiguation non table content page',
 'removed sentence requiring citation usually poorly formed',
 'parse block text sentence using spacy',
 'checked bracket quote correctness filtering sentence quite match',
 'removed sentence shorter letter longer character',
 'cover data',
 'remove duplicate sentence byproduct sorted alphabetically']

In [12]:
# Binary bag of n-grams: every feature is a bigram or trigram, and each matrix
# entry is 1 if that n-gram occurs in the sentence and 0 otherwise (binary=True
# records presence, not counts).
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(2, 3), binary=True)
# Learn the n-gram vocabulary from the cleaned corpus and encode it in one step.
x = cv.fit_transform(corpus)

In [13]:
# Learned vocabulary: maps each bigram/trigram to its column index in `x`.
cv.vocabulary_

{'context wikipedia': 20,
 'wikipedia dump': 120,
 'dump giant': 27,
 'giant xml': 42,
 'xml file': 122,
 'file contains': 35,
 'contains load': 13,
 'load useful': 50,
 'useful content': 115,
 'context wikipedia dump': 21,
 'wikipedia dump giant': 121,
 'dump giant xml': 28,
 'giant xml file': 43,
 'xml file contains': 123,
 'file contains load': 36,
 'contains load useful': 14,
 'load useful content': 51,
 'needed english': 53,
 'english text': 31,
 'text unsupervised': 107,
 'unsupervised learning': 113,
 'learning spent': 44,
 'spent quite': 99,
 'quite bit': 68,
 'bit time': 0,
 'time extracting': 109,
 'extracting cleaning': 33,
 'cleaning text': 12,
 'needed english text': 54,
 'english text unsupervised': 32,
 'text unsupervised learning': 108,
 'unsupervised learning spent': 114,
 'learning spent quite': 45,
 'spent quite bit': 100,
 'quite bit time': 69,
 'bit time extracting': 1,
 'time extracting cleaning': 110,
 'extracting cleaning text': 34,
 'content line': 17,
 'line t

In [14]:
corpus[0]  # first cleaned sentence, for comparison with its feature row below

'context wikipedia dump giant xml file contains load useful content'

In [15]:
x[0].toarray()  # dense binary n-gram indicator row for the first sentence

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1]], dtype=int64)

In [16]:
# TF-IDF weighting over bigram and trigram features of the cleaned corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: this rebinds `cv` and `x`, replacing the CountVectorizer and its
# matrix; cells that read `cv` / `x` after this point see the TF-IDF versions.
cv = TfidfVectorizer(ngram_range=(2, 3))
x = cv.fit_transform(corpus)

In [17]:
# Vocabulary of the TF-IDF vectorizer (`cv` was rebound to TfidfVectorizer):
# maps each bigram/trigram to its column index in `x`.
cv.vocabulary_

{'context wikipedia': 20,
 'wikipedia dump': 120,
 'dump giant': 27,
 'giant xml': 42,
 'xml file': 122,
 'file contains': 35,
 'contains load': 13,
 'load useful': 50,
 'useful content': 115,
 'context wikipedia dump': 21,
 'wikipedia dump giant': 121,
 'dump giant xml': 28,
 'giant xml file': 43,
 'xml file contains': 123,
 'file contains load': 36,
 'contains load useful': 14,
 'load useful content': 51,
 'needed english': 53,
 'english text': 31,
 'text unsupervised': 107,
 'unsupervised learning': 113,
 'learning spent': 44,
 'spent quite': 99,
 'quite bit': 68,
 'bit time': 0,
 'time extracting': 109,
 'extracting cleaning': 33,
 'cleaning text': 12,
 'needed english text': 54,
 'english text unsupervised': 32,
 'text unsupervised learning': 108,
 'unsupervised learning spent': 114,
 'learning spent quite': 45,
 'spent quite bit': 100,
 'quite bit time': 69,
 'bit time extracting': 1,
 'time extracting cleaning': 110,
 'extracting cleaning text': 34,
 'content line': 17,
 'line t

In [18]:
x[0].toarray()  # TF-IDF weights for the first sentence (x now holds the TfidfVectorizer output, not binary counts)

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.24253563, 0.24253563,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.24253563, 0.24253563, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.24253563, 0.24253563, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.24253563, 0.24253563, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.24253563, 0.24253563, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.24253563, 0.24253563, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  