### Predicting Lack of Firm URL

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import re

In [4]:
def clean_firm_name (firm):
    """Normalise a firm name so the same firm matches across data sources.

    Removes every period and comma, and every occurrence of the legal-form /
    filler words "corporation", "incorporated", "llc", "inc", "international",
    "gmbh" and "ltd" (case-insensitive, together with the space before them),
    then strips trailing whitespace.

    The words are only removed when they are whole words: the trailing ``\\b``
    stops " inc" from being cut out of e.g. "Acme Incubator".

    Parameters
    ----------
    firm : str
        Raw firm name.

    Returns
    -------
    str
        The cleaned firm name.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    suffix_pattern = r'(\.|,| (?:corporation|incorporated|llc|inc|international|gmbh|ltd)\b)'
    firm_clnd = re.sub(suffix_pattern, '', firm, flags=re.IGNORECASE).rstrip()
    return firm_clnd

In [6]:
# Firm -> URL lookup table. The file is read with header=None, so the two
# column labels are supplied explicitly at read time.
all_firms_data = pd.read_csv(
    '../../data/orgs/all_firms.csv',
    header=None,
    names=['Company', 'url'],
)
print(all_firms_data.shape)
all_firms_data.head()

(1492, 2)


Unnamed: 0,Company,url
0,Two Blades Foundation,2blades.org/
1,3M Innovative Properties Company,3m.com/
2,Advanced Aqua Group,aadvancedaqua.com/
3,ABB AB,abb.com/
4,AbbVie Inc.,abbvie.com/


In [5]:
# Firm-level patent measures, with a cleaned firm name added as `firm_clnd`
# (derived from `organization_clnd`) to serve as the join key later on.
firm_data = pd.read_csv('../../data/patents/measures/firm_level_patent_measures.csv').assign(
    firm_clnd=lambda measures: measures['organization_clnd'].apply(clean_firm_name)
)
firm_data.head()

Unnamed: 0,organization_clnd,mean_citations_3,first_year,num_patents_all,num_patents_3,mean_assignees_all,mean_assignees_3,mean_inventors_all,mean_inventors_3,firm_clnd
0,22nd Century Limited,0.0,2014,21,2,1.0,1.0,1.952381,2.0,22nd Century Limited
1,3M Innovative Properties Company,1.666667,1999,10263,58,1.009744,1.017544,3.465458,4.754386,3M Innovative Properties Company
2,AAC Technologies Pte,1.0,2012,206,1,1.038835,1.0,1.951456,1.0,AAC Technologies Pte
3,Aadigen,3.0,2016,5,1,1.0,1.0,3.4,4.0,Aadigen
4,ABB AB,0.0,2000,380,1,1.005263,2.0,2.781579,4.0,ABB AB


In [54]:
# Directory holding the employee-data workbooks (relative to this notebook).
emp_data_loc = '../../..'
# Firms without a URL. `sheet_name` is the supported pandas keyword; the old
# `sheetname` spelling is deprecated and rejected by newer pandas releases.
no_url_emp_data = pd.read_excel('{}/bing-firm-final-urls-out_v6-new emps.xlsx'.format(emp_data_loc), sheet_name = 'bing with emp data_no urls only')
no_url_emp_data['firm_clnd'] = no_url_emp_data['firm'].apply(clean_firm_name)
no_url_emp_data = no_url_emp_data[['firm_clnd', 'firm_length', 'acquired_merged', 'LinkedIn employees']]
# TODO(review): is this assumption that those two columns are the same correct?
# 'LinkedIn employees' is renamed to `max_emps` so it lines up with the URL sheet.
no_url_emp_data  = no_url_emp_data.rename(columns = {'LinkedIn employees':'max_emps'})
# Target label: 0 = firm has no URL.
no_url_emp_data['url'] = 0
no_url_emp_data.head()

Unnamed: 0,firm_clnd,firm_length,acquired_merged,max_emps,url
993,CAPAT,5.0,0,387.0,0
641,Aseptia,7.0,0,263.0,0
791,BOE Technology Group Co,23.0,0,6199.0,0
1340,Crepaco,,0,3512.0,0
200,Abraxis Bioscience,18.0,1,204.0,0


In [119]:
# url_emp_data = pd.read_excel('{}/bing-firm-final-urls-out_v7-li-gw.xlsx'.format(emp_data_loc), sheetname = 'NEW_bing with li-gw data')
# url_emp_data.head()

In [55]:
# Firms with a URL. `sheet_name` is the supported pandas keyword; the old
# `sheetname` spelling is deprecated and rejected by newer pandas releases.
url_emp_data = pd.read_excel('{}/bing-firm-final-urls-out_v7-li-gw.xlsx'.format(emp_data_loc), sheet_name = 'NEW_bing with li-gw data')
url_emp_data['firm_clnd'] = url_emp_data['firm'].apply(clean_firm_name)
# .copy() makes the column subset an independent frame, so the assignment
# below writes to a real frame rather than a slice (no SettingWithCopyWarning).
url_emp_data = url_emp_data[['firm_clnd', 'firm_length', 'acquired_merged', 'max_emps']].copy()
# Target label: 1 = firm has a URL.
url_emp_data['url'] = 1
url_emp_data.head()

Unnamed: 0,firm_clnd,firm_length,acquired_merged,max_emps,url
0,H R D,5,0,76.0,1
1,King Electric Vehicles,22,0,90.0,1
2,Integrated Solar Technology,27,0,71.0,1
3,Graphene Technologies,21,0,79.0,1
4,Proton Power,12,0,74.0,1


In [56]:
# Stack the no-URL firms (url == 0) on top of the URL firms (url == 1).
# Row labels from the two source frames are kept as they are.
all_emp_data = pd.concat(
    (no_url_emp_data, url_emp_data),
    axis=0,
)
all_emp_data.head()

Unnamed: 0,firm_clnd,firm_length,acquired_merged,max_emps,url
993,CAPAT,5.0,0,387.0,0
641,Aseptia,7.0,0,263.0,0
791,BOE Technology Group Co,23.0,0,6199.0,0
1340,Crepaco,,0,3512.0,0
200,Abraxis Bioscience,18.0,1,204.0,0


In [110]:
# Number of firms in the combined employee table.
all_emp_data.shape[0]

1487

In [57]:
# Outer-join the patent measures with the employee data on the cleaned firm
# name. The match is only partial (the `_merge` counts shown below give the
# exact split), but the matched rows are enough for the preliminary analysis.
data = firm_data.merge(
    all_emp_data,
    how='outer',
    on='firm_clnd',
    indicator=True,
)
data['_merge'].value_counts()

both          1314
left_only      229
right_only     173
Name: _merge, dtype: int64

In [114]:
# Firms that have patent measures (i.e. not right_only) and a URL.
# Both conditions are combined into a single mask: chaining data[a][b] applies
# a full-length boolean Series to an already-filtered frame, which makes pandas
# reindex the mask and emit a UserWarning.
len(data[(data._merge != 'right_only') & (data['url'] == 1)])

  """Entry point for launching an IPython kernel.


1144

In [115]:
# Firms matched in both sources that have a URL.
# Single combined mask instead of chained data[a][b] indexing, which would
# reindex a full-length boolean Series against the filtered frame and warn.
len(data[(data._merge == 'both') & (data['url'] == 1)])

  """Entry point for launching an IPython kernel.


1144

In [116]:
# Firms that have patent measures (i.e. not right_only) and no URL.
# Single combined mask instead of chained data[a][b] indexing, which would
# reindex a full-length boolean Series against the filtered frame and warn.
len(data[(data._merge != 'right_only') & (data['url'] == 0)])

  """Entry point for launching an IPython kernel.


170

In [58]:
# Restrict the analysis to firms found in both the patent and employee data.
full_data = data.loc[data['_merge'] == 'both']
# Both classes (with and without a URL) are present among the matched firms.
full_data['url'].value_counts()

1.0    1144
0.0     170
Name: url, dtype: int64

In [59]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [60]:
# Columns that are identifiers, merge bookkeeping, the target itself, or
# otherwise excluded from the feature matrix.
non_feature_cols = ['_merge', 'firm_clnd', 'organization_clnd', 'url', 'acquired_merged']
# Column order of full_data is preserved, so feature indices reported by the
# fitted tree can be mapped back to names through x_cols.
x_cols = [col for col in full_data.columns if col not in non_feature_cols]
# Missing feature values are replaced with 0 before modelling.
xs = full_data.loc[:, x_cols].fillna(0)
y = full_data['url']

In [61]:
# Tune the tree depth by cross-validated F1.
# The estimator is seeded (same seed as the final model fitted afterwards) so
# that ties between equally good splits are broken the same way on every run
# and the selected depth is reproducible.
# NOTE(review): the binary 'f1' scorer evaluates label 1 (url == 1, i.e. firms
# WITH a URL) - confirm that is the class this notebook means to score.
clf = DecisionTreeClassifier(random_state=42)
grid = GridSearchCV(clf, {'max_depth' : [2, 3, 4, 5,6,10]}, scoring='f1')
grid.fit(xs, y)
grid.best_params_

{'max_depth': 2}

In [14]:
from sklearn.model_selection import ShuffleSplit

# Five random train/test splits holding out 30% each; the fixed seed makes
# the splits repeatable.
cv = ShuffleSplit(test_size=0.3, n_splits=5, random_state=0)

In [62]:
# Final model: depth-2 tree fitted on all matched firms (used further down for
# feature importances and the tree dump).
clf = DecisionTreeClassifier(max_depth=2, random_state=42)
clf.fit(xs, y)

# Mean score over the `cv` splits, one cross-validation run per metric.
precision, recall, f1 = (
    np.mean(cross_val_score(clf, xs, y, scoring=metric, cv=cv))
    for metric in ('precision', 'recall', 'f1')
)

In [63]:
# Class balance of the target (1 = has URL, 0 = no URL); context for reading
# the scores below.
y.value_counts()

1.0    1144
0.0     170
Name: url, dtype: int64

In [64]:
# Mean cross-validated recall over the `cv` splits.
# NOTE(review): the binary 'recall' scorer presumably evaluates label 1
# (firms WITH a URL), not the no-URL class - confirm before interpreting.
recall

0.99770579371895174

In [65]:
# Mean cross-validated F1 over the `cv` splits.
# NOTE(review): the binary 'f1' scorer presumably evaluates label 1
# (firms WITH a URL), not the no-URL class - confirm before interpreting.
f1

0.93356889538553389

In [66]:
# Pair each feature name with its importance in the fitted tree and order the
# pairs from least to most important.
feature_imp = sorted(
    zip(x_cols, clf.feature_importances_),
    key=lambda name_and_weight: name_and_weight[1],
)
feature_imp

[('mean_citations_3', 0.0),
 ('first_year', 0.0),
 ('num_patents_all', 0.0),
 ('num_patents_3', 0.0),
 ('mean_assignees_all', 0.0),
 ('mean_assignees_3', 0.0),
 ('mean_inventors_all', 0.0),
 ('mean_inventors_3', 0.0),
 (u'firm_length', 0.16903136308356881),
 ('max_emps', 0.83096863691643119)]

In [109]:
# Parallel arrays describing the fitted tree: entry i of each array refers to
# node i.
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold

# A node is treated as a leaf exactly when its left and right child ids are
# equal; otherwise it is a test (split) node.
is_leaves = children_left == children_right

# Walk the tree from the root (node 0, depth 0) with an explicit stack of
# (node id, depth of that node) pairs to record every node's depth.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
stack = [(0, 0)]
while stack:
    node_id, depth = stack.pop()
    node_depth[node_id] = depth
    if not is_leaves[node_id]:
        stack.append((children_left[node_id], depth + 1))
        stack.append((children_right[node_id], depth + 1))

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(n_nodes):
    # One tab of indentation per level of depth.
    indent = node_depth[i] * "\t"
    if is_leaves[i]:
        print("%snode=%s leaf node." % (indent, i))
        continue
    print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
          "node %s."
          % (indent, i, children_left[i], feature[i], threshold[i], children_right[i]))
print()

The binary tree structure has 7 nodes and has the following tree structure:
node=0 test node: go to node 1 if X[:, 9] <= 2.5 else to node 4.
	node=1 test node: go to node 2 if X[:, 8] <= 22.5 else to node 3.
		node=2 leaf node.
		node=3 leaf node.
	node=4 test node: go to node 5 if X[:, 9] <= 791.5 else to node 6.
		node=5 leaf node.
		node=6 leaf node.
()


In [79]:
#sanity check
relevant_xs = xs[['firm_length', 'max_emps']]
relevant_xs['preds'] = preds

In [102]:
# Firms with max_emps below 2.5 and firm_length below 22.5.
# One combined mask replaces the chained df[a][b] form, which applied a
# full-length boolean Series to an already-filtered frame and made pandas
# reindex it with a UserWarning.
relevant_xs[(relevant_xs['max_emps'] < 2.5) & (relevant_xs['firm_length'] < 22.5)].head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,firm_length,max_emps,preds
3,7.0,0.0,1.0
12,7.0,2.0,1.0
28,13.0,0.0,1.0
44,7.0,0.0,1.0
62,22.0,0.0,1.0


In [104]:
# Firms with max_emps below 2.5 and firm_length above 22.5.
# One combined mask replaces the chained df[a][b] form, which applied a
# full-length boolean Series to an already-filtered frame and made pandas
# reindex it with a UserWarning.
relevant_xs[(relevant_xs['max_emps'] < 2.5) & (relevant_xs['firm_length'] > 22.5)].head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,firm_length,max_emps,preds
34,31.0,2.0,0.0
37,28.0,2.0,0.0
173,40.0,0.0,0.0
180,24.0,0.0,0.0
183,38.0,0.0,0.0


In [106]:
# Firms with max_emps strictly between 2.5 and 791.5.
# One combined mask replaces the chained df[a][b] form, which applied a
# full-length boolean Series to an already-filtered frame and made pandas
# reindex it with a UserWarning.
relevant_xs[(2.5 < relevant_xs['max_emps']) & (relevant_xs['max_emps'] < 791.5)].head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,firm_length,max_emps,preds
0,20.0,734.0,1.0
6,29.0,706.0,1.0
11,14.0,365.0,1.0
13,18.0,204.0,1.0
14,21.0,11.0,1.0


In [108]:
# Firms with max_emps above 791.5 (the > 2.5 condition is kept to mirror the
# two-step path through the tree, although > 791.5 already implies it).
# One combined mask replaces the chained df[a][b] form, which applied a
# full-length boolean Series to an already-filtered frame and made pandas
# reindex it with a UserWarning.
relevant_xs[(relevant_xs['max_emps'] > 2.5) & (relevant_xs['max_emps'] > 791.5)].head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,firm_length,max_emps,preds
1,32.0,60593.0,1.0
2,20.0,2141.0,1.0
4,6.0,100487.0,1.0
5,12.0,134800.0,1.0
7,16.0,99000.0,1.0
