# Monthly Challenge May 2019 - The Ontotext Case 💼

# Week 4

## IV. Modeling and Validation

In [None]:
# Data processing
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import dill
# Show full text columns of pandas dataframes. ``None`` means "no truncation";
# the old ``-1`` sentinel was deprecated in pandas 1.0 and is rejected by
# current releases.
pd.set_option('display.max_colwidth', None)

# Data visualizations
import plotly
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)
import plotly.offline as offline
import plotly.graph_objs as go
np.set_printoptions(suppress=True) # suppress scientific notation
import plotly.figure_factory as ff
import warnings
warnings.filterwarnings("ignore")

# Data modeling
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [None]:
# Restore the interpreter session saved to 'Week3_env.db'. Names used below but
# never assigned in this notebook (vocabulary_set, train, test, categories) are
# presumably defined by that session -- confirm against the Week 3 notebook.
# NOTE(review): dill sessions are pickle-based, so loading one can execute
# arbitrary code; only load session files from a trusted source.
dill.load_session('Week3_env.db')

## *1) Prepare for modeling - features*

In [None]:
vocabulary_set

In [None]:
# Binary bag-of-words: each feature records the presence/absence (not the
# count) of a term, restricted to the fixed vocabulary.
vectorizer = CountVectorizer(vocabulary=vocabulary_set, binary=True)

In [None]:
# Encode the training descriptions as a sparse document-term matrix.
X_train = vectorizer.fit_transform(train.descriptions)

In [None]:
# Number of features. `vocabulary_` maps each term to its column index, so its
# length is the feature count; unlike `get_feature_names()` (removed in
# scikit-learn 1.2) it is available on every scikit-learn version.
len(vectorizer.vocabulary_)

In [None]:
# Feature names in column order. Built from `vocabulary_` (term -> column
# index) because `get_feature_names()` was removed in scikit-learn 1.2.
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

In [None]:
# Dense, human-readable view of the document-term matrix, one column per term.
# Column names are rebuilt from `vocabulary_` (term -> column index) because
# `get_feature_names()` was removed in scikit-learn 1.2; `toarray()` returns a
# plain ndarray rather than the legacy `np.matrix` produced by `todense()`.
count_vect_df = pd.DataFrame(
    X_train.toarray(),
    columns=sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get),
)

In [None]:
count_vect_df.head()

In [None]:
train.descriptions.iloc[0]

In [None]:
count_vect_df.iloc[0]

## *2) Prepare for modeling - target*

In [None]:
# Fit the encoder on the training industries, then map them to integer codes.
lb = LabelEncoder().fit(train.industry1)
Y_train = lb.transform(train.industry1)

In [None]:
lb.classes_

In [None]:
Y_train

## *3) Apply a machine learning algorithm*

In [None]:
# Linear support-vector classifier solved in its primal form (dual=False),
# with a fixed seed.
clf = LinearSVC(
    dual=False,
    random_state=42,
)

More about how to apply algorithms when the target variable is multiclass or multi-label: https://scikit-learn.org/stable/modules/multiclass.html

In [None]:
clf

In [None]:
# Train the classifier on the vectorized descriptions and integer-coded industries.
clf.fit(X_train, Y_train)  

In [None]:
# Print the label; the score itself is rendered as the cell's output.
# The trailing number is presumably the value from the author's run.
print('Mean accuracy of train sample:')
clf.score(X_train, Y_train)  # 0.9108350038225609

## *4) Validation*

### *4.1) Inspect the classification report on training sample* 

In [None]:
# Predict on the same rows the model was trained on (integer class codes).
predictions_ontrain = clf.predict(X_train)

In [None]:
predictions_ontrain

In [None]:
# Convert predicted class codes back to industry names. Decoding straight from
# the classifier's output (rather than from `predictions_ontrain` itself) keeps
# this cell safe to re-run: decoding already-decoded strings would raise.
predictions_ontrain = lb.inverse_transform(clf.predict(X_train))

In [None]:
predictions_ontrain

In [None]:
# Convert the training target back to industry names for the reports below.
# Deriving it from `train.industry1` (instead of decoding `Y_train` in place)
# keeps this cell safe to re-run.
# NOTE: from here on Y_train holds industry names, not the integer codes.
Y_train = lb.inverse_transform(lb.transform(train.industry1))

In [None]:
Y_train

In [None]:
# Accuracy on the training sample, computed from the decoded (string) labels.
# The trailing number is presumably the value from the author's run.
metrics.accuracy_score(Y_train, predictions_ontrain) # 0.9108350038225609

In [None]:
# Per-category precision / recall / F1 on the training sample.
# `labels` must be passed by keyword: everything after (y_true, y_pred) is
# keyword-only in current scikit-learn, so a positional `categories` raises
# TypeError there. The keyword form works on older releases as well.
print(metrics.classification_report(Y_train, predictions_ontrain,
                                    labels=categories))

In [None]:
# Confusion matrix on the training sample; rows/columns follow `categories`.
conf_mat = confusion_matrix(
    Y_train,
    predictions_ontrain,
    labels=categories,
)

In [None]:
conf_mat

In [None]:
# Two-stop colour scale for the cells and the font colours for the annotations.
colorscale = [[0, '#66475e'], [1, '#ecbfe0']]
font_colors = ['#efecee', '#3c3636']

# Annotated heatmap of the confusion matrix; both axes use the category names.
fig = ff.create_annotated_heatmap(
    z=conf_mat,
    x=categories,
    y=categories,
    colorscale=colorscale,
    font_colors=font_colors,
    showscale=True,
)

# Use a small font for every cell annotation.
for annotation in fig.layout.annotations:
    annotation.font.size = 7

# Layout
fig['layout'].update(
    title='Confusion matrix - train sample',
    width=1000,
    height=700,
    autosize=False,
    margin=dict(l=220, r=30, b=220, t=50, pad=1),
    xaxis=dict(title='Predicted', titlefont=dict(size=20), side='bottom'),
    yaxis=dict(title='Actual', titlefont=dict(size=20)),
)

iplot(fig, filename='annotated_heatmap_numpy')

### *4.2) Validate on test sample* 

In [None]:
# Vectorize the test descriptions with the already-configured vectorizer
# (transform only -- nothing is re-fitted on the test data).
test_X = vectorizer.transform(test.descriptions)

In [None]:
# Predict integer class codes for the test sample.
predicted = clf.predict(test_X)

In [None]:
predicted

In [None]:
# Convert predicted class codes back to industry names. Decoding straight from
# the classifier's output (rather than from `predicted` itself) keeps this cell
# safe to re-run: decoding already-decoded strings would raise.
predicted = lb.inverse_transform(clf.predict(test_X))

In [None]:
predicted

In [None]:
# Accuracy on the held-out test sample (actual vs. decoded predicted industries).
# The trailing number is presumably the value from the author's run.
metrics.accuracy_score(test.industry1, predicted) # 0.9042475489507543

In [None]:
# Per-category precision / recall / F1 on the test sample.
# `labels` must be passed by keyword: everything after (y_true, y_pred) is
# keyword-only in current scikit-learn, so a positional `categories` raises
# TypeError there. The keyword form works on older releases as well.
print(metrics.classification_report(test.industry1, predicted,
                                    labels=categories))

In [None]:
# Confusion matrix on the test sample; rows/columns follow `categories`.
conf_mat = confusion_matrix(
    test.industry1,
    predicted,
    labels=categories,
)

In [None]:
# Two-stop colour scale for the cells and the font colours for the annotations.
colorscale = [[0, '#66475e'], [1, '#ecbfe0']]
font_colors = ['#efecee', '#3c3636']

# Annotated heatmap of the confusion matrix; both axes use the category names.
fig = ff.create_annotated_heatmap(
    z=conf_mat,
    x=categories,
    y=categories,
    colorscale=colorscale,
    font_colors=font_colors,
    showscale=True,
)

# Use a small font for every cell annotation.
for annotation in fig.layout.annotations:
    annotation.font.size = 7

# Layout
fig['layout'].update(
    title='Confusion matrix - test sample',
    width=1000,
    height=700,
    autosize=False,
    margin=dict(l=220, r=30, b=220, t=50, pad=1),
    xaxis=dict(title='Predicted', titlefont=dict(size=20), side='bottom'),
    yaxis=dict(title='Actual', titlefont=dict(size=20)),
)

iplot(fig, filename='annotated_heatmap_numpy')

#### *4.2.1) Extract false negatives* 

In [None]:
# Store the decoded predictions next to the actual labels so that individual
# misclassified rows can be filtered below.
test['predicted'] = predicted

In [None]:
test.head()

In [None]:
# Education category: rows that are actually 'Education' but were predicted
# as 'Entertainment_and_publishing' (first 10).
print("'Education' category is incorrectly classified as 'Entertainment_and_publishing':" + '\n')
test[
    (test.industry1 == 'Education')
    & (test.predicted == 'Entertainment_and_publishing')
].head(10)

In [None]:
# Transport category: rows that are actually 'Transport' but were predicted
# as 'Public_sector'.
print("'Transport' category is incorrectly classified as 'Public_sector':" + '\n')
test[
    (test.industry1 == 'Transport')
    & (test.predicted == 'Public_sector')
]

#### *4.2.2) Extract false positives*

In [None]:
# Education category: rows predicted as 'Education' whose actual category
# is 'Public_sector' (first 10).
print("'Education' category is predicted but the actual category is 'Public_sector':" + '\n')
test[
    (test.industry1 == 'Public_sector')
    & (test.predicted == 'Education')
].head(10)

In [None]:
# Transport category: rows predicted as 'Transport' whose actual category
# is 'Travel_and_sport'.
print("'Transport' category is predicted but the actual category is 'Travel_and_sport':" + '\n')
test[
    (test.industry1 == 'Travel_and_sport')
    & (test.predicted == 'Transport')
]