### Bag of words model

In [None]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# show up to 100 characters of each text cell when pandas renders a column
pd.options.display.max_colwidth = 100

#### Let's build a basic bag of words model on three sample documents

In [None]:
# three short sample sentences used as the corpus for the first example
documents = [
    "Gangs of Wasseypur is a great movie.",
    "The success of a movie depends on the performance of the actors.",
    "There are no new movies releasing this week.",
]
print(documents)

In [None]:
def preprocess(document):
    """Lower-case a document and drop English stop words.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    str
        The lower-cased tokens that are not in NLTK's English stop-word
        list, joined back together with single spaces.
    """
    # Load the stop-word list once per call and keep it in a set. The
    # previous version re-read the list for every token and did a linear
    # scan of it on each membership test.
    stop_words = set(stopwords.words("english"))

    # lower-case first so that tokens compare equal to the lower-case stop words
    tokens = word_tokenize(document.lower())

    # keep only the tokens that are not stop words
    kept_tokens = [token for token in tokens if token not in stop_words]

    # re-assemble the remaining tokens into one space-separated string
    return " ".join(kept_tokens)

# replace each sample document with its preprocessed form
documents = list(map(preprocess, documents))
print(documents)


#### Creating a bag of words model using scikit-learn's `CountVectorizer`

In [None]:
# learn the vocabulary from the sample documents and count each term per document
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # the result is sparse: printing it lists the (row, column) position and count of each non-zero cell, not only cells equal to 1

In [None]:
# convert the sparse result to a dense array so that every cell, zeros included, is printed
print(bow_model.toarray())

In [None]:
# dimensions of the document-term matrix
print(bow_model.shape)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# tolist() is used to keep the printed output a plain Python list as before.
print(vectorizer.get_feature_names_out().tolist())

### Let's create a bag of words model on the spam dataset.

In [None]:
# load data
spam = pd.read_csv("SMSSpamCollection.txt", sep = "\t", names=["label", "message"])
spam.head()

##### Let's take a subset of the data (first 50 rows only) and create a bag of words model on that.

In [None]:
# keep only the first 50 rows (all columns)
spam = spam.iloc[:50]
print(spam)

In [None]:
# extract the messages from the dataframe
messages = spam.message
print(messages)
print(messages.shape)

In [None]:
# convert messages into list
messages = [message for message in messages]
print(len(messages))
print(messages)

In [None]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
print(messages)

In [None]:
# number of messages after preprocessing
len(messages)

In [None]:
# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [None]:
# look at the dataframe
dataframeOneHot = pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names())

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# and convert to a list so the printed vocabulary looks the same as before
print(vectorizer.get_feature_names_out().tolist())

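* The vocabulary contains many near-duplicate tokens that are just different forms of the same word, such as 'win' and 'winner'; 'reply' and 'replying'; 'want' and 'wanted' etc. Stemming or lemmatization would collapse such forms into a single feature.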

In [None]:
# sum of every entry in the count matrix (presumably the total number of counted tokens across all messages)
bow_model.sum()

In [None]:
# Write the dataframe (index included) to a CSV file in the working directory.
# NOTE(review): the file name is spelled "CountVetorizer", which looks like a typo
# for "CountVectorizer"; left unchanged in case anything else reads this path.
dataframeOneHot.to_csv("CountVetorizer.csv")