In [1]:
import pandas as pd

In [2]:
# NOTE(review): absolute, machine-specific path; point it at wherever the CSV lives locally
dataset_csv_path = "/home/rajdeep/Downloads/archive/Question_Classification_Dataset.csv"
df = pd.read_csv(dataset_csv_path)

In [3]:
# Get the dimensions of the data as a (rows, columns) tuple
df.shape

(5452, 5)

In [4]:
# List the column names of the loaded DataFrame
df.columns

Index(['Unnamed: 0', 'Questions', 'Category0', 'Category1', 'Category2'], dtype='object')

In [7]:
# Check how many distinct labels are present in each of the three category columns.
# NOTE: the variable names and printed labels are 1-based ("Category1".."Category3"),
# while the DataFrame columns they describe are 0-based ("Category0".."Category2").

unq_cat_in_cat1, unq_cat_in_cat2, unq_cat_in_cat3 = (
    set(df[column].to_list()) for column in ("Category0", "Category1", "Category2")
)

summaries = (
    ("Unique categories in Category1:: {0} Count:: {1}", unq_cat_in_cat1),
    ("Unique categories in Category2:: {0} Count:: {1}", unq_cat_in_cat2),
    ("Unique categories in Category3:: {0} Count:: {1}", unq_cat_in_cat3),
)
for template, labels in summaries:
    print(template.format(labels, len(labels)))

Unique categories in Category1:: {'NUMERIC', 'LOCATION', 'ENTITY', 'HUMAN', 'DESCRIPTION', 'ABBREVIATION'} Count:: 6
Unique categories in Category2:: {'HUM', 'LOC', 'ENTY', 'DESC', 'ABBR', 'NUM'} Count:: 6
Unique categories in Category3:: {'substance', 'termeq', 'country', 'food', 'city', 'manner', 'plant', 'sport', 'event', 'abb', 'exp', 'mount', 'title', 'gr', 'dist', 'state', 'desc', 'product', 'temp', 'veh', 'currency', 'period', 'lang', 'perc', 'religion', 'word', 'reason', 'body', 'dismed', 'symbol', 'ord', 'cremat', 'animal', 'volsize', 'ind', 'date', 'money', 'count', 'techmeth', 'speed', 'color', 'def', 'weight', 'code', 'letter', 'instru', 'other'} Count:: 47


In [8]:
# Check how the questions are distributed across the top-level classes.
# In a balanced dataset every category has roughly the same number of questions,
# which helps keep a model from being biased towards the larger classes.

category_question_groups = df.groupby(["Category0"])["Questions"]
category_question_groups.count()

Category0
ABBREVIATION      86
DESCRIPTION     1162
ENTITY          1250
HUMAN           1223
LOCATION         835
NUMERIC          896
Name: Questions, dtype: int64

### We can see that the ABBREVIATION category has far fewer questions (86) than the other categories, which are more or less balanced.
### This knowledge will be essential while trying to determine the performance of any algorithm applied on this data

### Let us now try to explore the text data of our questions. 
### As with any Machine Learning algorithm, the data needs to be converted into some form of vector, to be processed and made sense of. 
### One approach to convert text to a vector is Bag of Words, which represents each text by how often each word occurs in it, ignoring word order. The scikit-learn package has some useful tools to deal with text data

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Collect the questions of each Category0 class into a list, indexed by the class name

questions_grouped = df.groupby(["Category0"])["Questions"]
group_wise_ques_list = questions_grouped.apply(list)

In [17]:
# View the list of questions that the Category0 grouping collected under "LOCATION"

group_wise_ques_list["LOCATION"]

['What sprawling U.S. state boasts the most airports ?',
 'What is the highest waterfall in the United States ?',
 'Which two states enclose Chesapeake Bay ?',
 "Where do the adventures of `` The Swiss Family Robinson '' take place ?",
 'What country do the Galapagos Islands belong to ?',
 'What U.S. state lived under six flags ?',
 'Where is the Loop ?',
 "What country 's capital is Tirana ?",
 'Which city has the oldest relationship as a sisterðcity with Los Angeles ?',
 'What are the names of the tourist attractions in Reims ?',
 'What body of water does the Danube River flow into ?',
 'What country did the Nazis occupy for 1 , CD NNS IN NNP NNP NNP .',
 'What is the nickname for the state of Mississippi ?',
 'What country did King Wenceslas rule ?',
 "What country 's national passenger rail system is called Via ?",
 'What U.S. state is Fort Knox in ?',
 'Where is the highest point in Japan ?',
 "Where does Barney Rubble go to work after he drops Fred off in the `` Flintstones '' ca

### Similarly we can explore the questions of other categories.
### The first step towards transforming the text into vector would be to identify the unique words (vocabulary) present in the entire corpus.
### The next step would be to identify which of those words are frequent in which of the target classes

In [58]:
# How many unique tokens does our corpus contain?
# Work through the procedure on the LOCATION questions: split each question on
# whitespace, lower-case every token, then count the distinct tokens.

word_list = [
    token.lower()
    for ques in group_wise_ques_list["LOCATION"]
    for token in ques.split()
]
print(len(set(word_list)))


1962


### So there are 1962 unique tokens (words, punctuations etc.) in the data that corresponds to the LOCATION category

### Note: The CountVectorizer class of sklearn by default converts the words to lowercase and tokenizes differently (its default token pattern drops punctuation and single-character tokens), so the count of unique words as calculated by CountVectorizer may not be exactly the same as our count, but it will be close


### Now let us try to figure out how this vocabulary of 1962 tokens is distributed across our data for the LOCATION category

In [59]:
# Create a CountVectorizer with its default settings (no arguments are overridden)
count_vec = CountVectorizer()

In [60]:
# fit learns the vocabulary from the LOCATION questions; transform then encodes each
# question as a row of token counts. fit_transform performs both steps in one call.
loc_tdm = count_vec.fit_transform(group_wise_ques_list["LOCATION"])

In [61]:
# Convert the sparse document-term matrix into a dense DataFrame with one column per token.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old name only on older installations.
if hasattr(count_vec, "get_feature_names_out"):
    loc_feature_names = list(count_vec.get_feature_names_out())
else:
    loc_feature_names = count_vec.get_feature_names()
loc_tdm_df = pd.DataFrame(loc_tdm.toarray(), columns=loc_feature_names)

In [62]:
# Display the document-term DataFrame: one row per question, one column per vocabulary token
loc_tdm_df

Unnamed: 0,000,000th,10,11,11th,123,13,139,14,15,...,yellow,yellowstone,york,you,your,yukon,zebras,zero,zones,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Observations:
### 1. There were 835 questions of the LOCATION category, which explains why the number of rows in this DataFrame is 835. The number of columns is the number of unique words in the corpus of LOCATION questions
### 2. The matrix is very sparse: most entries are 0, because each question contains only a handful of the words in the vocabulary. Many words also have a really low frequency. These are words which are very specific to a question (like the numbers 10, 11, 000 etc.) and do not necessarily contribute to the overall classification of the question
### 3. Words like 'you', 'your' etc. are different forms of the same base word - 'you' - and make practically no difference to the classification of the question. It will be more meaningful to treat such words as one and the same

## Now, let's get the frequency distribution of each word in the LOCATION- questions corpus

In [65]:
# In a bag-of-words model the vocabulary tokens are the features, so the feature names
# are simply the words. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so use get_feature_names_out() when it is available.
if hasattr(count_vec, "get_feature_names_out"):
    word_list_s = list(count_vec.get_feature_names_out())
else:
    word_list_s = count_vec.get_feature_names()
# total occurrences of each word: sum every column of the document-term matrix
count_list_s = loc_tdm.toarray().sum(axis=0)
# map each word to its frequency
word_freq_s = dict(zip(word_list_s, count_list_s))
# order the (word, frequency) pairs from most to least frequent;
# the result is a list of tuples, not a dictionary
freq_s = sorted(word_freq_s.items(), key=lambda pair: pair[1], reverse=True)
# display
freq_s

[('the', 669),
 ('what', 554),
 ('is', 304),
 ('where', 258),
 ('in', 221),
 ('of', 201),
 ('country', 123),
 ('city', 101),
 ('can', 77),
 ('to', 75),
 ('was', 69),
 ('on', 65),
 ('state', 65),
 ('are', 63),
 ('world', 62),
 ('and', 57),
 ('did', 54),
 ('find', 52),
 ('largest', 46),
 ('for', 43),
 ('has', 39),
 ('does', 35),
 ('most', 34),
 ('from', 33),
 ('name', 32),
 ('countries', 26),
 ('river', 26),
 ('capital', 25),
 ('do', 25),
 ('located', 25),
 ('which', 25),
 ('first', 24),
 ('highest', 24),
 ('two', 23),
 ('live', 22),
 ('get', 20),
 ('information', 20),
 ('mountain', 19),
 ('island', 18),
 ('states', 18),
 ('america', 17),
 ('american', 17),
 ('by', 17),
 ('you', 17),
 ('that', 16),
 ('airport', 15),
 ('boasts', 15),
 ('home', 15),
 ('south', 15),
 ('as', 14),
 ('with', 14),
 ('about', 13),
 ('called', 13),
 ('cities', 13),
 ('address', 12),
 ('all', 12),
 ('european', 12),
 ('its', 12),
 ('new', 12),
 ('park', 12),
 ('have', 11),
 ('internet', 11),
 ('north', 11),
 ('oce

### This output also tells a very practical story, where our language construct has a lot of filler words (prepositions, conjunctions etc.) that do not contribute to the overall meaning of the sentence. These words are also known as StopWords in NLP lingo

## Based on all of the analysis that we made till now, here's our action item
### 1. Remove the Stopwords (is, the, an, to etc.)
### 2. Remove the words that have very low frequency (000, 10, 11 etc.)
### 3. Convert every word to its base form ("your" -> "you" etc. )