# Monthly Challenge May 2019 - The Ontotext Case 💼

## Week 2 

## II. The Ontotext Case - Data Preparation

In [None]:
# Data processing
import pandas as pd
import numpy as np
# Show full text columns of pandas dataframes.
# pandas >= 1.0 uses None for "no limit" and deprecates negative values; older
# releases only accept the legacy -1 sentinel, so fall back to it when None is rejected.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
from nltk.corpus import stopwords
import re
import dill
from collections import Counter

# Data visualizations
import wordcloud
from wordcloud import WordCloud
import plotly
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)
import plotly.offline as offline
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

# Data modeling
from sklearn.model_selection import train_test_split

In [None]:
# Restore the session saved in 'Week1_env.db'. The cells below use `train_data` and
# `industries_df` without defining them, so presumably they come from this session.
# NOTE(review): loading a dill session unpickles arbitrary objects - only load files from a trusted source.
dill.load_session('Week1_env.db')

## *1) Text Processing*

In [None]:
# Preview the first observation of the development sample: its description,
# its target label, and the overall number of rows.
print("How the development sample looks like at this stage (example of one description - these descriptions will be our main source of features):\n")
print("{}\n".format(train_data['descriptions'][0]))
print("And the target variable associated with this observation:")
print("{}\n".format(train_data['industries'][0]))

print("Number of observations in dev sample: {}".format(len(train_data)))

###  *1.1) Transform to lower case*

In [None]:
#processeddata = train_data.copy() #  Example of how to make a copy of a dataset

In [None]:
# Lower-case every token of each description. Splitting on whitespace and
# re-joining with single spaces also collapses runs of whitespace.
train_data['descriptions'] = train_data['descriptions'].apply(
    lambda text: " ".join(map(str.lower, text.split()))
)

In [None]:
# Inspect the first description after lower-casing.
train_data.descriptions[0] 

###  *1.2) Punctuation and non-ASCII characters removal*

In [None]:
# Before cleanup: sample description noted as containing Arabic characters.
train_data.descriptions[0] # arabic

In [None]:
# Before cleanup: sample description noted as containing Cyrillic characters.
train_data.descriptions[14] # cyrillic

In [None]:
# Before cleanup: sample description noted as containing Chinese characters.
train_data.descriptions[47] # chinese characters

In [None]:
# Replace every character that is not an ASCII letter or digit (punctuation,
# non-ASCII scripts, ...) with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged.
train_data['descriptions'] = train_data['descriptions'].str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)

In [None]:
# After cleanup: the same sample (previously noted as containing Arabic characters).
train_data.descriptions[0] # arabic

In [None]:
# After cleanup: the same sample (previously noted as containing Cyrillic characters).
train_data.descriptions[14] # cyrillic

In [None]:
# After cleanup: the same sample (previously noted as containing Chinese characters).
train_data.descriptions[47] # chinese characters

### *1.3) Remove numerical values*

In [None]:
# For each description, count the whitespace-separated tokens that consist only of digits.
train_data['numerics'] = train_data['descriptions'].apply(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)

In [None]:
# Number of all-digit tokens counted in the first description.
train_data.numerics[0] 

In [None]:
# The first description itself, to compare against its numerics count.
train_data.descriptions[0]

In [None]:
# Total number of all-digit tokens in the corpus (value recorded in the original run: 911300).
train_data['numerics'].sum()

In [None]:
# Share of all-digit tokens relative to the total of the `word_count` column.
print('Numerics as percentage of all words in the corpus: {:.2%} '.format(
    train_data['numerics'].sum() / train_data['word_count'].sum()
))

In [None]:
# Remove the numerics: every digit is replaced with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the digits in place.
train_data['descriptions'] = train_data['descriptions'].str.replace(r'[0-9]', ' ', regex=True)

In [None]:
# Inspect the first description after the digits were removed.
train_data.descriptions[0]

###  *1.4) Remove stop words*

In [None]:
# English stop-word list from NLTK's `stopwords` corpus reader.
# NOTE(review): presumably requires the NLTK stopwords corpus to be available locally - confirm.
stop = stopwords.words('english')

In [None]:
# Display the stop-word list.
stop

In [None]:
# Report how many entries the stop-word list has.
print('The list of stop words consists of {} words.'.format(len(stop)))

In [None]:
# Count the number of stop words in each description.
# Membership is tested against a set built once, instead of scanning the
# stop-word list for every single token.
stop_set = set(stop)
train_data['stopwords'] = train_data['descriptions'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_set)
)

In [None]:
# Total of the per-description stop-word counts.
print('The number of stop words in development sample: {}'.format(
    train_data['stopwords'].sum()
))

In [None]:
# Share of stop words relative to the total of the `word_count` column.
print('Stop words as percentage of all words in the corpus: {:.2%} '.format(
    train_data['stopwords'].sum() / train_data['word_count'].sum()
))

In [None]:
# The first description before stop words are removed.
train_data.descriptions[0]

In [None]:
# Number of stop words counted in the first description.
train_data.stopwords[0]

In [None]:
# Remove the stop words from every description; the remaining tokens are
# re-joined with single spaces.
# Membership is tested against a set built once, instead of scanning the
# stop-word list for every single token.
stop_set = set(stop)
train_data['descriptions'] = train_data['descriptions'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_set)
)

In [None]:
# The first description after stop-word removal.
train_data.descriptions[0]

###  *1.5) Find most common words*

In [None]:
# Flatten the corpus: concatenate all descriptions into one string and split it
# on whitespace, giving a Series with one entry per token.
corpus = pd.Series(' '.join(train_data['descriptions']).split())

In [None]:
# Separately for industry categories
#corpus = pd.Series(' '.join(train_data[train_data['industries'].str.contains('Travel_and_sport', regex = False)]['descriptions']).split())

In [None]:
# Number of distinct tokens in the corpus (value recorded in the original run: 356 122).
len(corpus.unique()) # 356 122

In [None]:
# Frequency table: maps every unique token to the number of times it occurs in the corpus.
corpus_counts = Counter(corpus) # contains all unique words+their frequency

In [None]:
# Number of distinct tokens in the frequency table.
len(corpus_counts)

In [None]:
# The 100 most frequent tokens as a two-column table (token, occurrence count).
mostcommon = pd.DataFrame(
    data=corpus_counts.most_common(100),
    columns=['Word', 'Frequency'],
)

In [None]:
# Show the first 50 rows of the most-common-words table.
mostcommon[0:50]

In [None]:
# Tokens that occur exactly once in the corpus, mapped to their count (always 1).
mostrare = {word: count for word, count in corpus_counts.items() if count == 1}

In [None]:
# Number of tokens that occur only once (value recorded in the original run: 162 343).
len(mostrare) # 162 343 words

In [None]:
# Display the single-occurrence tokens.
mostrare

In [None]:
# Turn the most-common-words table into a {word: frequency} dictionary,
# the input format used for the word cloud below.
mostcommon = mostcommon.set_index('Word')['Frequency'].to_dict()

In [None]:
# Render a word cloud of the most frequent tokens, sized by frequency.
# NOTE(review): the name `wordcloud` rebinds the `wordcloud` module imported at the top of the notebook.
wordcloud = WordCloud(
    background_color='white',
    width=800,
    height=800,
    max_words=100,
    random_state=42,
).generate_from_frequencies(mostcommon)

plt.figure(dpi=80, figsize=(8, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes around the image
plt.show()

## *2) Sampling - split into train and test samples*

### 2.1) Prepare the target variable

In [None]:
# Attach the columns of `industries_df` to `train_data` (DataFrame.join with default arguments).
# presumably this is where the `industry1` / `industry2` columns used below come from - verify.
# NOTE(review): re-running this cell joins the same columns a second time - confirm it runs once per session.
train_data = train_data.join(industries_df)

In [None]:
# Summary of the joined frame (columns, non-null counts, dtypes).
train_data.info()

In [None]:
# Keep only the rows whose `industry2` value is missing.
train_data_cut = train_data[train_data['industry2'].isnull()]

In [None]:
# Number of rows left after the filter.
len(train_data_cut)

In [None]:
# Reduce the frame to the text feature and the target column.
train_data_cut = train_data_cut[['descriptions', 'industry1']]

In [None]:
# Summary of the reduced frame.
train_data_cut.info()

### 2.2) Split the dataset

In [None]:
# 70/30 split into train and test samples, stratified on the target column
# `industry1`; the fixed random_state makes the split reproducible.
train, test = train_test_split(
    train_data_cut,
    train_size=0.70,
    stratify=train_data_cut['industry1'],
    random_state=42,
)

In [None]:
# Summary of the test sample (recorded in the original run: 71194 entries, 162093 to 197018).
test.info() # 71194 entries, 162093 to 197018

In [None]:
# Summary of the train sample (recorded in the original run: 166119 entries, 154297 to 244605).
train.info() # 166119 entries, 154297 to 244605

In [None]:
# Table of the target distribution in the train sample: one row per value of
# `industry1` with its absolute count and its share of the sample.
industry_counts = train.industry1.value_counts()  # computed once, reused for every column
industry_shares = ['{:.2%}'.format(share) for share in industry_counts / len(train)]

trace = go.Table(header=dict(values=['Industry category', 'Number of companies', 'As Percentage'],
                             fill = dict(color=['#da80ec']),
                             align = ['left'] * 3),  # one entry per column
                 cells=dict(values=[industry_counts.keys(), industry_counts, industry_shares],
                            align = ['left'] * 3))

# The title font is set through layout.title.font; the flat `titlefont`
# attribute is deprecated in plotly.
layout = go.Layout(title=dict(text='Target distribution in the train sample',
                              font=dict(size=20)),
                   width=800, height=900,
                   paper_bgcolor =  'rgba(0,0,0,0)',
                   plot_bgcolor = 'rgba(0,0,0,0)',
                   autosize = True,
                   yaxis=go.layout.YAxis(automargin = True),
                   )
data = [trace]
fig = dict(data=data, layout=layout)
iplot(fig)

In [None]:
# Table of the target distribution in the test sample: one row per value of
# `industry1` with its absolute count and its share of the sample.
industry_counts = test.industry1.value_counts()  # computed once, reused for every column
industry_shares = ['{:.2%}'.format(share) for share in industry_counts / len(test)]

trace = go.Table(header=dict(values=['Industry category', 'Number of companies', 'As Percentage'],
                             fill = dict(color=['#da80ec']),
                             align = ['left'] * 3),  # one entry per column
                 cells=dict(values=[industry_counts.keys(), industry_counts, industry_shares],
                            align = ['left'] * 3))

# The title font is set through layout.title.font; the flat `titlefont`
# attribute is deprecated in plotly.
layout = go.Layout(title=dict(text='Target distribution in the test sample',
                              font=dict(size=20)),
                   width=800, height=900,
                   paper_bgcolor =  'rgba(0,0,0,0)',
                   plot_bgcolor = 'rgba(0,0,0,0)',
                   autosize = True,
                   yaxis=go.layout.YAxis(automargin = True),
                   )
data = [trace]
fig = dict(data=data, layout=layout)
iplot(fig)

In [None]:
# Drop the word-frequency helpers before the session is saved.
del mostcommon, mostrare, corpus, corpus_counts

In [None]:
# Save the current session state to 'Week2_env.db'.
dill.dump_session('Week2_env.db')