# predict-lack-of-url
skelley@air.org<br> 
sarora@air.org<br>
February 2019<br>

## Description
This notebook tries to explain bias in missing observations, which are mostly caused to a lack of an identified URL or if a URL exists, lack of successfully scraped website data.

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import os

In [3]:
all_firms_data = pd.read_csv('../../data/orgs/all_firms.csv', header = None)
all_firms_data.columns = ['Company', 'url']
print (all_firms_data.shape)
all_firms_data.head()

(1492, 2)


Unnamed: 0,Company,url
0,Two Blades Foundation,2blades.org/
1,3M Innovative Properties Company,3m.com/
2,Advanced Aqua Group,aadvancedaqua.com/
3,ABB AB,abb.com/
4,AbbVie Inc.,abbvie.com/


In [4]:
firm_data = pd.read_csv('../../data/patents/measures/firm_level_patent_measures.csv')
print(firm_data.shape)
firm_data.head()

(1538, 9)


Unnamed: 0,organization_clnd,mean_citations_3,first_year,num_patents_all,num_patents_3,mean_assignees_all,mean_assignees_3,mean_inventors_all,mean_inventors_3
0,22nd Century Limited,0.0,2014,21,2,1.0,1.0,1.952381,2.0
1,3M Innovative Properties Company,1.666667,1999,10263,58,1.009744,1.017544,3.465458,4.754386
2,AAC Technologies Pte,1.0,2012,206,1,1.038835,1.0,1.951456,1.0
3,Aadigen,3.0,2016,5,1,1.0,1.0,3.4,4.0
4,ABB AB,0.0,2000,380,1,1.005263,2.0,2.781579,4.0


In [51]:
all_emp_data = pd.read_csv('../../data/orgs/emps/eager_emps_v3.csv')
all_emp_data['url'] = 1
all_emp_data.loc[pd.isna(all_emp_data['hit_url']), ['url']] = 0
print ('Firms with missing urls:', len(all_emp_data.loc[all_emp_data['url'] == 0]))
all_emp_data = all_emp_data[['firm', 'firm_length', 'url', 'public', 'acquired_merged', 'nano', 'green', 'synbio', 'max_emps']]
print (all_emp_data.shape)
all_emp_data.head()

# unlabeled.loc[np.isnan(unlabeled['share_of_sentences_ftr']), 'share_of_sentences_ftr'] = 0 

Firms with missing urls: 199
(1487, 9)


Unnamed: 0,firm,firm_length,url,public,acquired_merged,nano,green,synbio,max_emps
0,Integrated Solar Technology,27.0,1,0,0,0,1,0,71
1,Graphene Technologies,21.0,1,0,0,1,0,0,79
2,Proton Power,12.0,1,0,0,0,1,0,74
3,Renewable Algal Energy,22.0,1,0,0,0,1,0,90
4,Metabolix,9.0,1,0,0,1,0,0,66


In [52]:
#about 50% merge, which we can use for the preliminary analysis
data = pd.merge(firm_data, all_emp_data, left_on = 'organization_clnd', right_on = 'firm', how = 'outer', indicator = True)
data._merge.value_counts()

both          780
left_only     759
right_only    707
Name: _merge, dtype: int64

In [53]:
full_data = data[data._merge == 'both']
#we do have some both with and without urls
full_data['url'].value_counts()

1.0    667
0.0    113
Name: url, dtype: int64

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
#for now, fill nan with 0, probably want to use mean imputation 
x_cols = [col for col in full_data.columns if not col in ['_merge', 'firm', 'organization_clnd', 'url']]
xs = full_data[x_cols].fillna(0)
y = full_data['url']

X_train, X_test, y_train, y_test = train_test_split(xs, y, test_size=0.25, random_state=42)

In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

In [57]:
clf = RandomForestClassifier()
cv_accuracy = cross_val_score(clf, X_train, y_train,cv =5)
print(cv_accuracy )
print(np.mean(cv_accuracy ))

[0.88888889 0.84615385 0.88888889 0.83760684 0.85470085]
0.8632478632478632


In [58]:
cv_recall =cross_val_score(clf, X_train, y_train,cv =5, scoring = 'recall')
print(cv_recall)
print(np.mean(cv_recall))

[0.94059406 0.99009901 1.         0.94059406 0.95049505]
0.9643564356435643


In [59]:
#TODO: is this high?
cv_precision = cross_val_score(clf, X_train, y_train,cv =5, scoring = 'precision')
print(cv_precision)
print(np.mean(cv_precision))

[0.89814815 0.89908257 0.88392857 0.87850467 0.89090909]
0.8901146104380692


In [60]:
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
#basically this shows that we are getting a lot of false positives
#ie firms without an url that we said had one
confusion_matrix(y_test, preds)

array([[  7,  26],
       [ 11, 151]])

In [61]:
y_test.value_counts()

1.0    162
0.0     33
Name: url, dtype: int64

In [62]:
feature_imp = sorted(list(zip(x_cols, clf.feature_importances_)) , key=lambda x: x[1])
feature_imp

[('public', 0.006435601375527576),
 ('synbio', 0.010747005872551221),
 ('nano', 0.01801049781399134),
 ('green', 0.018856145806630073),
 ('mean_assignees_3', 0.019476085107334253),
 ('mean_citations_3', 0.029103886822104665),
 ('num_patents_3', 0.03401515450684444),
 ('acquired_merged', 0.03833485023807132),
 ('mean_assignees_all', 0.04391007311404847),
 ('mean_inventors_3', 0.07461158046768232),
 ('first_year', 0.10118754738906668),
 ('mean_inventors_all', 0.1016847768344927),
 ('num_patents_all', 0.13387896360011187),
 ('firm_length', 0.1454362751100862),
 ('max_emps', 0.22431155594145688)]

In [None]:
## We should try some over sampling of the firms without urls to improve prediciton