### Predicting Lack of Firm URL

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import os

In [2]:
# The CSV has no header row, so the two column names are supplied at read time.
url_data = pd.read_csv(
    '../../data/orgs/depth0.csv',
    header=None,
    names=['Company', 'url'],
)
url_data.head()

Unnamed: 0,Company,url
0,Two Blades Foundation,2blades.org/
1,3M Innovative Properties Company,3m.com/
2,Advanced Aqua Group,aadvancedaqua.com/
3,ABB AB,abb.com/
4,AbbVie Inc.,abbvie.com/


In [3]:
# Number of rows loaded from the URL file.
url_data.shape[0]

1294

In [4]:
# Load the firm-level patent measures table and preview the first rows.
firm_measures_path = '../../data/patents/measures/firm_level_patent_measures.csv'
firm_data = pd.read_csv(firm_measures_path)
firm_data.head(5)

Unnamed: 0,organization_clnd,mean_citations_3,first_year,num_patents_all,num_patents_3,mean_assignees_all,mean_assignees_3,mean_inventors_all,mean_inventors_3
0,22nd Century Limited,0.0,2014,21,2,1.0,1.0,1.952381,2.0
1,3M Innovative Properties Company,1.666667,1999,10263,58,1.009744,1.017544,3.465458,4.754386
2,AAC Technologies Pte,1.0,2012,206,1,1.038835,1.0,1.951456,1.0
3,Aadigen,3.0,2016,5,1,1.0,1.0,3.4,4.0
4,ABB AB,0.0,2000,380,1,1.005263,2.0,2.781579,4.0


In [5]:
# Employee data for the firms that have no URL (label url = 0).
emp_data_loc = '../../..'
# `sheet_name` is the supported keyword; the old `sheetname` spelling was
# deprecated and is rejected by current pandas releases.
no_url_emp_data = pd.read_excel(
    '{}/bing-firm-final-urls-out_v6-new emps.xlsx'.format(emp_data_loc),
    sheet_name='bing with emp data_no urls only',
)
no_url_emp_data = (
    no_url_emp_data[['firm', 'firm_length', 'acquired_merged', 'LinkedIn employees']]
    # TODO confirm: this treats 'LinkedIn employees' as equivalent to the
    # 'max_emps' column used for the firms that do have a URL.
    .rename(columns={'LinkedIn employees': 'max_emps'})
    # Target label: 0 marks a firm without a URL.
    .assign(url=0)
)
no_url_emp_data.head()

Unnamed: 0,firm,firm_length,acquired_merged,max_emps,url
993,CAPAT,5.0,0,387.0,0
641,Aseptia,7.0,0,263.0,0
791,BOE Technology Group Co.,23.0,0,6199.0,0
1340,Crepaco,,0,3512.0,0
200,Abraxis Bioscience,18.0,1,204.0,0


In [6]:
# Employee data for the firms that do have a URL (label url = 1).
# `sheet_name` is the supported keyword; the old `sheetname` spelling was
# deprecated and is rejected by current pandas releases.
url_emp_data = pd.read_excel(
    '{}/bing-firm-final-urls-out_v7-li-gw.xlsx'.format(emp_data_loc),
    sheet_name='NEW_bing with li-gw data',
)
# `.assign` returns a new frame, so the label is never written onto a
# column-subset of the sheet (which can raise SettingWithCopyWarning).
url_emp_data = (
    url_emp_data[['firm', 'firm_length', 'acquired_merged', 'max_emps']]
    .assign(url=1)
)
url_emp_data.head()

Unnamed: 0,firm,firm_length,acquired_merged,max_emps,url
0,H R D CORPORATION,5,0,76.0,1
1,King Electric Vehicles Inc.,22,0,90.0,1
2,Integrated Solar Technology,27,0,71.0,1
3,Graphene Technologies,21,0,79.0,1
4,Proton Power,12,0,74.0,1


In [7]:
# Stack the no-URL firms on top of the URL firms; each frame keeps its own
# row index, so index values may repeat in the combined table.
all_emp_data = pd.concat((no_url_emp_data, url_emp_data), axis=0)
all_emp_data.head(5)

Unnamed: 0,firm,firm_length,acquired_merged,max_emps,url
993,CAPAT,5.0,0,387.0,0
641,Aseptia,7.0,0,263.0,0
791,BOE Technology Group Co.,23.0,0,6199.0,0
1340,Crepaco,,0,3512.0,0
200,Abraxis Bioscience,18.0,1,204.0,0


In [8]:
# Outer-join the patent measures with the employee data on firm name.
# The `_merge` indicator column records which side(s) each row came from.
# About 50% of rows match on both sides, which we can use for the
# preliminary analysis.
data = firm_data.merge(
    all_emp_data,
    how='outer',
    left_on='organization_clnd',
    right_on='firm',
    indicator=True,
)
data['_merge'].value_counts()

both          780
left_only     759
right_only    707
Name: _merge, dtype: int64

In [9]:
# Keep only the firms found in both the patent and the employee data.
full_data = data.loc[data['_merge'] == 'both']
# The matched set contains firms both with and without URLs.
full_data.url.value_counts()

1.0    668
0.0    112
Name: url, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Features are every column except the merge indicator, the two firm-name
# keys and the label itself.
excluded = ('_merge', 'firm', 'organization_clnd', 'url')
x_cols = [col for col in full_data.columns if col not in excluded]
# For now missing values are filled with 0; mean imputation is probably a
# better choice.
xs = full_data.loc[:, x_cols].fillna(0)
y = full_data['url']

# Hold out 25% of the firms for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    xs, y, test_size=0.25, random_state=42
)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

In [23]:
# 5-fold cross-validated score of a random forest on the training split
# (cross_val_score falls back to the estimator's default scorer, accuracy).
# The forest is seeded with the same value as the train/test split so that
# the scores, confusion matrix and feature importances are reproducible
# across reruns instead of changing with every execution.
clf = RandomForestClassifier(random_state=42)
cv_accuracy = cross_val_score(clf, X_train, y_train, cv=5)
print(cv_accuracy)
print(np.mean(cv_accuracy))

[ 0.83050847  0.85470085  0.88034188  0.83760684  0.87068966]
0.85476954048


In [24]:
# 5-fold cross-validated recall for the same classifier: per-fold scores
# first, then their mean.
cv_recall = cross_val_score(clf, X_train, y_train, scoring='recall', cv=5)
for value in (cv_recall, cv_recall.mean()):
    print(value)

[ 0.93137255  0.95049505  0.96039604  0.92079208  0.99009901]
0.950630945447


In [25]:
# TODO: is this high?
# 5-fold cross-validated precision for the same classifier: per-fold scores
# first, then their mean.
cv_precision = cross_val_score(
    clf, X_train, y_train, scoring='precision', cv=5
)
print(cv_precision)
print(cv_precision.mean())

[ 0.89320388  0.875       0.89814815  0.87272727  0.87610619]
0.883037099812


In [15]:
# Fit on the training split and predict the held-out firms (fit returns the
# estimator itself, so the two calls can be chained).
preds = clf.fit(X_train, y_train).predict(X_test)
# Rows are true labels and columns are predicted labels. The matrix shows a
# lot of false positives, i.e. firms without a URL that we said had one.
confusion_matrix(y_true=y_test, y_pred=preds)

array([[ 10,  23],
       [ 13, 149]])

In [26]:
# Count of each label value in the held-out test split.
y_test.value_counts()

1.0    162
0.0     33
Name: url, dtype: int64

In [17]:
# Pair each feature name with its importance from the fitted forest and
# order the pairs from least to most important.
feature_imp = sorted(
    zip(x_cols, clf.feature_importances_),
    key=lambda pair: pair[1],
)
feature_imp

[('mean_assignees_3', 0.022412629013985125),
 ('mean_citations_3', 0.028533477062707212),
 (u'acquired_merged', 0.033102419241252005),
 ('num_patents_3', 0.041768553533907946),
 ('mean_inventors_3', 0.062838239028315146),
 ('mean_assignees_all', 0.064241748706560173),
 ('mean_inventors_all', 0.0983730661834753),
 ('first_year', 0.10884304339763348),
 ('num_patents_all', 0.12024099309936816),
 (u'firm_length', 0.18064693519322064),
 ('max_emps', 0.23899889553957485)]

In [None]:
## We should try oversampling the firms without URLs to improve prediction