# Monthly Challenge May 2019 - The Ontotext Case 💼

## Week 1

## I. Data Understanding

###  *1) Load necessary libraries*

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
# Show full text columns of pandas dataframe. `None` disables truncation; the legacy
# `-1` sentinel is deprecated since pandas 1.0 and rejected by pandas 2.x (needs pandas >= 1.0).
pd.set_option('display.max_colwidth', None)
import dill # saving the envir

# Data visualizations
import random
import plotly
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)
import plotly.offline as offline
import plotly.graph_objs as go

### *2) Import the dataset*

In [None]:
# Load the training sample; the relative path is resolved against the current working directory.
train_data = pd.read_csv("dt18-ontotext-simple.csv")

In [None]:
train_data.info() 

In [None]:
train_data.isna().sum() 

In [None]:
train_data.shape 

In [None]:
print("First five elements in our training sample:")
train_data.head()

In [None]:
# Prints True when at least one value of the 'org' column occurs more than once.
print(train_data['org'].duplicated().any())

### 3) *Now let's explore the target variable a little bit more*

In [None]:
print("First 20 elements in the target column:")
train_data.industries.head(20)

In [None]:
print("Last 20 elements in the target column:")
train_data.industries.tail(20)

In [None]:
# Show every column of the row at position 7 as an illustration of a multi-label company.
print('Example of a company that falls into more than one industry category:\n')
print(train_data.iloc[7])

### *3.1) Examine industry categories*
<a id='examine-categories'></a>

In [None]:
# Distinct raw values of the target column: each ';'-joined combination is its own value,
# and a missing target also shows up as one value.
print("Number of unique labels: {}".format(train_data['industries'].unique().size))
train_data['industries'].unique()

In [None]:
# The 20 most frequent raw label strings (value_counts sorts by frequency, descending).
print(train_data['industries'].value_counts().head(20))

In [None]:
# Split the ';'-separated target into one column per label, named industry1, industry2, ...
industries_df = (
    train_data['industries']
    .str.split(';', expand=True)
    .rename(columns=lambda position: "industry" + str(position + 1))
)

In [None]:
industries_df.head(20)

In [None]:
industries_df.count() # NA-s are omitted in this result

### *3.2) Find the number of industry categories*

In [None]:
# Flatten all industryN columns into a single array and keep the distinct values
# (missing-value placeholders are still part of the result at this point).
categories = list(pd.unique(industries_df.values.ravel('K')))

In [None]:
categories

In [None]:
# Remove the missing-value placeholders (None / NaN) so that only real category names remain.
# pd.notna tests for missingness directly instead of comparing against None and the text 'nan'.
categories = [cat for cat in categories if pd.notna(cat)]

In [None]:
categories

In [None]:
print('We have {} industry categories in our sample.'.format(len(categories)))

### *3.3) What to do with the NaN values in the target?* 

In [None]:
# Keep only the observations whose target ('industries') is missing.
missinglabel = train_data[train_data['industries'].isna()]

In [None]:
missinglabel.head()

In [None]:
missinglabel.info()

In [None]:
# Report the number of unlabelled companies and their share of the whole sample.
print(
    "It turns out that we don't have information for the industry category of {} companies part of our sample (or {:.2%} of the observations in our sample).".format(
        missinglabel.shape[0],
        missinglabel.shape[0] / train_data.shape[0],
    )
)

In [None]:
missinglabel.isna().sum()

In [None]:
missinglabel.types.value_counts()

In [None]:
missinglabel.categories.value_counts()

### *3.4) Filter out the remaining NaN values in the target* 

In [None]:
train_data.info()

In [None]:
# Keep only the labelled observations. The explicit copy makes the result an independent
# DataFrame, so the later column assignments ('descriptions', 'word_count') do not act on
# a slice of the original frame and do not raise SettingWithCopyWarning.
train_data = train_data[train_data.industries.notnull()].copy()

In [None]:
train_data.info()

In [None]:
print('Number of missing values in each column of the filtered sample:')
train_data.isnull().sum()

### *3.5) How many companies fall exactly into one industry category, two industry categories etc.?* 
<a id='companies-exact-one-categories'></a>

In [None]:
# Rebuild the per-label columns (industry1, industry2, ...) from the current train_data.
label_columns = train_data['industries'].str.split(';', expand=True)
industries_df = label_columns.rename(columns=lambda position: "industry" + str(position + 1))
del label_columns

In [None]:
industries_df.head(20)

In [None]:
industries_df.count()

In [None]:
# Number of industry labels per company: the row-wise count of non-missing cells.
# (This counts filled cells, not unique values.) DataFrame.count(axis=1) computes it
# in one vectorised call instead of invoking a Python lambda per row.
numofcategories = industries_df.count(axis=1)

In [None]:
type(numofcategories)

In [None]:
numofcategories.head(20) # number of categories for the first 20 companies

In [None]:
train_data.head(10)

In [None]:
# Frequency table: number of industry categories -> number of companies.
(
    numofcategories
    .value_counts()
    .rename_axis('Num of categories')
    .to_frame('Count of companies')
)

In [None]:
# Summary table: how many companies carry exactly 1, 2, 3, ... industry labels.
# The counts are sorted by number of categories (the index) and the row labels are derived
# from that same index, so every label stays aligned with its own count no matter which
# label counts actually occur in the data or how frequent they are.
category_counts = numofcategories.value_counts().sort_index()
category_shares = category_counts / len(train_data)

trace = go.Table(
    header=dict(values=['Number of industry categories', 'Count of companies', 'Percentage of companies'],
                fill=dict(color=['#da80ec']),
                align=['left'] * 5),
    cells=dict(values=[[str(n) + ' category' for n in category_counts.index],
                       category_counts,
                       ['{:.4%}'.format(share) for share in category_shares]],
               align=['left'] * 5))

layout = go.Layout(title='In how many categories the companies fall into?',
                   titlefont=dict(size=20),
                   width=500, height=500,
                   # Fully transparent backgrounds.
                   paper_bgcolor='rgba(0,0,0,0)',
                   plot_bgcolor='rgba(0,0,0,0)',
                   autosize=True,
                   yaxis=go.layout.YAxis(automargin=True),
                   )
data = [trace]
fig = dict(data=data, layout=layout)
iplot(fig)

### *3.6) How many companies fall into each industry category?* 

In [None]:
# Per-column frequency of every category: rows are category names, columns are industryN.
# Series.value_counts is used because the top-level pd.value_counts helper is deprecated
# since pandas 2.1; the resulting frame is the same.
indoutput = industries_df.apply(pd.Series.value_counts)

In [None]:
indoutput

In [None]:
# Total occurrences of each category: add up its counts across all industryN columns.
industrycounts = indoutput.sum(axis=1)

In [None]:
print('Total counts:' + '\n' )
print(industrycounts)

In [None]:
# Two-column table listing every industry category (index of industrycounts) next to its
# total count (values of industrycounts).
# NOTE(review): `align` is given 5 entries although the table has 2 columns — presumably
# the surplus entries are ignored by plotly; confirm.
trace = go.Table(header=dict(values=['Category', 'Count'],
                             fill = dict(color=['#da80ec']), 
                             align = ['left'] * 5),
                 cells=dict(values=[industrycounts.index,
                                    industrycounts.values],
                            align = ['left'] * 5))

# Fixed-size figure with fully transparent paper and plot backgrounds.
layout = go.Layout(title='Number of companies in each industry category',
                   titlefont = dict(size = 20),
                   width=700, height=900, 
                   paper_bgcolor =  'rgba(0,0,0,0)',
                   plot_bgcolor = 'rgba(0,0,0,0)',
                   autosize = True,
                   yaxis=go.layout.YAxis(automargin = True),
                   )
# Assemble the figure as a plain dict and render it inline in the notebook.
data = [trace]
fig = dict(data=data, layout=layout)
iplot(fig)

In [None]:
# Create a function for generating random colours
def random_colours(number_of_colors):
    """
    Generate random colours as hexadecimal strings.
    Input:
        number_of_colors - integer, how many colour strings to generate.
    Output:
        List of `number_of_colors` strings, each a '#' followed by six uppercase
        hexadecimal digits, e.g. ['#DA80EC'].
    """
    hex_digits = '0123456789ABCDEF'
    return [
        "#" + ''.join(random.choice(hex_digits) for _ in range(6))
        for _ in range(number_of_colors)
    ]

In [None]:
# Industry label distribution in percentages: each category count as a share of the
# number of companies, formatted with two decimals and a trailing '%'.
labelpercents = [
    ("%.2f" % ((count / len(train_data)) * 100)) + '%'
    for count in industrycounts.values
]

In [None]:
# Horizontal bar chart: one bar per industry category, annotated with its percentage label.
# All three sequences are reversed together so bars, names and labels stay aligned.
# NOTE(review): the reversal is meant to keep the categories in alphabetical order on the
# chart — this presumably relies on industrycounts being sorted by category name; confirm.
trace = go.Bar(
    x=industrycounts.values[::-1],
    y=industrycounts.index[::-1],
    text=labelpercents[::-1],
    textposition='outside',
    orientation='h',
    # One random colour per bar, sized from the data instead of a hard-coded 32.
    marker=dict(color=random_colours(len(industrycounts))))

layout = go.Layout(title='Percentage of companies falling into each category',
                   titlefont=dict(size=25),
                   width=950, height=650,
                   plot_bgcolor='rgba(0,0,0,0)',
                   paper_bgcolor='rgba(255, 219, 227, 0.88)',
                   # Wide left margin to leave room for the category names.
                   margin=dict(l=240, r=0, b=50, t=60),
                   autosize=False
                   )

data = [trace]
fig = dict(data=data, layout=layout)
iplot(fig, filename='horizontal-bar')

### 4) *Now let's explore the features a little bit more - 'descriptions' column only*

In [None]:
len(train_data[train_data.descriptions.isnull()])

In [None]:
'{:.2%}'.format(len(train_data[train_data.descriptions.isnull()])/len(train_data))

In [None]:
# Replace missing descriptions with the placeholder token 'missingdescrip'.
train_data['descriptions'] = train_data['descriptions'].fillna('missingdescrip')

In [None]:
# Check the result
len(train_data[train_data.descriptions.isnull()]) # 0 obs.

In [None]:
# Check the result
train_data.descriptions.value_counts()['missingdescrip'] # 11 228 obs.

### *4.1) Find the length of company descriptions*

In [None]:
# Description length in "words": the number of pieces obtained by splitting on a single
# space, so consecutive spaces produce empty pieces that are counted as well.
train_data['word_count'] = train_data['descriptions'].map(
    lambda text: len(str(text).split(" "))
)

In [None]:
train_data[['descriptions','word_count']].head()

In [None]:
# Total of the per-description word counts over the whole sample.
print('Number of words in the corpus (before processing):')
train_data['word_count'].sum() # 22 030 957 words

### *4.2) Take a closer look at the extremes*

In [None]:
# All rows whose description reaches the maximum word count (ties are kept).
longest_d = train_data[train_data['word_count'] == train_data['word_count'].max()]

In [None]:
# Report the largest word count found in the sample.
print('The longest company description in our sample has {} words.'.format(train_data['word_count'].max()))

In [None]:
print('Longest description text:' + '\n')
print(longest_d.descriptions) 

In [None]:
# All rows whose description has the minimum word count (ties are kept).
shortest_d = train_data[train_data['word_count'] == train_data['word_count'].min()]

In [None]:
len(shortest_d)  # 11 231

In [None]:
shortest_d.descriptions.value_counts()

### *4.3) Explore the distribution of company description lengths*

In [None]:
# Histogram of description lengths (word_count) using fixed bins: 100 words wide,
# configured from 0 up to 3900.
# NOTE(review): the upper bound 3900 is hard-coded — presumably chosen to cover the longest
# description in this sample; confirm if the data changes.
trace = go.Histogram(
    x= train_data['word_count'],
    xbins=dict(start=0,end=3900, size = 100),
    marker=dict(color='#7CFDF0'),
    opacity=0.75)
data = [trace]
# Axis titles, bar spacing and a fixed figure width.
layout = go.Layout(
    title='Distribution of Company Description Length',
    xaxis=dict(title='Number of words'),
    yaxis=dict(title='Count of companies'),
    bargap=0.1,
    bargroupgap=0.2,
    width = 800)

# Build the figure object and render it inline in the notebook.
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# Companies whose description is longer than 500 words.
outliers = train_data.loc[train_data['word_count'] > 500]
print("It seems that {} companies have very long descriptions!".format(outliers.shape[0]))

### *4.4) Now, let's explore the distribution of company description lengths in each industry*

In [None]:
# One box per industry category showing the word-count distribution of the descriptions
# of the companies labelled with that category.
# The number of boxes and colours follows len(categories) instead of a hard-coded 32.
boxplotcolors = random_colours(len(categories))
# Split the ';'-separated labels once so that membership is an exact match on whole
# category names; a substring test would also match any longer label containing the name.
industry_labels = train_data['industries'].str.split(';')
data = []

for category, colour in zip(categories, boxplotcolors):
    in_category = industry_labels.apply(lambda labels: category in labels).astype(bool)
    trace = go.Box(
        y=train_data.loc[in_category, 'word_count'],
        name=category,
        marker=dict(color=colour))
    data.append(trace)

layout = go.Layout(
    height=1000,
    width=1200,
    title="Company description - Length Distribution by industry type",
    # Large bottom margin to leave room for the category names under the boxes.
    margin=dict(l=30, r=30, b=150, t=90)
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Box Plot Styling Outliers")

In [None]:
# Drop intermediate objects that later stages of the experiment do not need,
# so they are not carried along when the session is saved.
del outliers, shortest_d, longest_d, indoutput, numofcategories, missinglabel

In [None]:
# Save the current session to 'Week1_env.db' (relative to the working directory).
# NOTE(review): presumably reloaded in a later notebook via dill.load_session — confirm.
dill.dump_session('Week1_env.db')