# Monthly Challenge May 2019 - The Ontotext Case 💼

## Week 3

## III. The Ontotext Case - Text Representation

In [None]:
# Data processing
import pandas as pd
import numpy as np
import dill
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
import time
import datetime

# Data visualizations
import plotly
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)
import plotly.offline as offline
import plotly.graph_objs as go
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Restore the interpreter session stored in 'Week2_env.db'.
# Names used further down but never defined in this notebook (`train`, `test`,
# `categories`) presumably come from that session - verify against the notebook
# that wrote the file.
# NOTE(review): dill sessions are pickle-based; only load files from a trusted source.
dill.load_session('Week2_env.db')

In [None]:
# Summarize the training data (presumably a pandas DataFrame restored by the session load).
train.info()

In [None]:
# Summarize the test data (presumably a pandas DataFrame restored by the session load).
test.info()

In [None]:
# Bag-of-words encoder used to turn the text descriptions into a term matrix.
vectorizer = CountVectorizer(
    ngram_range=(1, 1),   # unigrams only
    max_df=0.90,          # discard any word appearing in more than 90% of the sample
    max_features=5000,    # upper bound on the vocabulary size
    binary=True,          # binary presence/absence features (the default is counts)
)

In [None]:
# Learn the vocabulary from the training descriptions and encode them in the same pass;
# X_train is the resulting document-term matrix (one row per description).
X_train = vectorizer.fit_transform(train.descriptions)

In [None]:
# List of extracted features (the vocabulary terms learned by the vectorizer).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out()` when the installed version provides it and fall
# back to the legacy method otherwise. `.tolist()` keeps `features` a plain Python
# list of str, matching what the legacy method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

In [None]:
# Print a heading line followed by the full list of extracted terms.
print('Extracted features (in alphabetical order)', features, sep='\n')

In [None]:
# Number of extracted terms (the vectorizer caps the vocabulary via max_features).
len(features) 

## IV. The Ontotext Case - Feature Selection

In [None]:
# Feature selection: for every category, score each vocabulary term by its mutual
# information (MI) with the one-vs-rest target "row belongs to this category", and
# keep the TOP_K_FEATURES highest-scoring terms.
TOP_K_FEATURES = 20  # number of terms kept per category

start = time.time()

# The term list is identical for every category, so build the array once instead of
# querying the vectorizer on each iteration. `features` is the list of extracted
# terms computed right after fitting the vectorizer.
all_feature_names = np.array(features)

FS_results = []  # one DataFrame (feature, MI value, Category) per category
vocabulary = []  # one list of the selected terms per category
for cat in categories:
    # MI between every (discrete) term column and the binary "is this category" target
    mi = mutual_info_classif(X_train, train.industry1 == cat, discrete_features=True, random_state = 42)
    # argsort is ascending: reverse it and keep the first TOP_K_FEATURES indices
    top_indices = np.argsort(mi)[::-1][:TOP_K_FEATURES]
    top_terms = all_feature_names[top_indices].tolist()
    # Create a dataframe with the most important features in this class
    FS_results.append(pd.DataFrame({'feature': top_terms,
                                    'MI value': mi[top_indices],
                                    'Category': cat}))
    vocabulary.append(top_terms)  # save the selected terms in a separate list

end = time.time()
execution_time = end - start
print(datetime.timedelta(seconds=execution_time)) # ~ 0:49:47.837228

In [None]:
# Preview the result frames of the first two categories.
FS_results[:2] 

In [None]:
# Table of the ten highest-MI terms for the second category (FS_results[1]).
trace = go.Table(
    header=dict(
        values=['Word', 'MI Value', 'Category'],
        fill=dict(color=['#da80ec']),
        align=['left'] * 5,
    ),
    cells=dict(
        # One table column per result field, limited to the first ten rows
        values=[FS_results[1][column].head(10) for column in ('feature', 'MI value', 'Category')],
        align=['left'] * 5,
    ),
)

layout = go.Layout(
    title='Feature importance by category',
    titlefont=dict(size=20),
    width=500,
    height=500,
    # Fully transparent paper and plot backgrounds
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

data = [trace]
fig = dict(data=data, layout=layout)
iplot(fig)

Some findings after analyzing the results: 

- Generally speaking, our feature selection procedure managed to pick up terms that definitely seem relevant to the respective categories.
- In almost all of the target classes we can find more specialized terms (i.e. terms associated directly with the category). However, a few categories – ‘Marketing’, ‘Conglomerate_(company)’ and ‘Commercial_and_professional_services’ – yield mainly broad terms.
- Many terms share the same root (manufacture-manufacturer-manufacturing) or appear in both singular and plural form (school-schools). In most of these cases the variants have roughly the same meaning, so keeping them as separate features may add unnecessary noise to the data. Stemming or lemmatization techniques may be applied to normalize such terms.
- Certain broad terms appear in most of the target classes – e.g. ‘company’, ‘services’, ‘firm’, ‘products’. This may lead to a loss of discriminative power.
- The word ‘manufacture’ or one of its derivatives appears among the most important terms in several categories – ‘Manufacturing’, ‘Aerospace_and_defense’, ‘Chemical_industry’, ‘Engineering’, ‘Metal’ and ‘Automotive’. Its appearance in these categories makes sense, but it will probably lead to a loss of discriminative power and a high number of misclassifications between them.

In [None]:
# Preview the selected-term lists of the first two categories.
vocabulary[0:2] 

In [None]:
# Flatten the per-category term lists into one list (duplicates are still present here).
vocabulary_set = [term for featureset in vocabulary for term in featureset]

In [None]:
# Drop duplicate terms and order the result; sorted() already returns a list.
vocabulary_set = sorted(set(vocabulary_set))

In [None]:
# Display the combined vocabulary built from the per-category term lists.
vocabulary_set

In [None]:
# Size of the combined vocabulary across all categories.
len(vocabulary_set) # 410 unique words 

In [None]:
# Persist the current interpreter session to 'Week3_env.db'
# (presumably so a follow-up notebook can restore it with dill.load_session - confirm).
dill.dump_session('Week3_env.db')