Goal:

1. Identify jobs having service_type = interpreting (in-person)

2. Introduce a temporary variable called match_percentage

3. For each job, calculate match_percentage (details below) for all pandas

4. Return a sorted array (in descending order of match percentage) of the name and target_price fields of the top matching pandas

In [49]:
import pandas as pd

In [50]:
# Load the extracted jobs table; the first CSV column is used as the row index.
df_jobs = pd.read_csv('../result/extracted_jobs.csv', index_col=0)

In [51]:
# Preview the first two job rows to see which columns are available.
df_jobs.head(2)

Unnamed: 0,id,requester_id,inserted_at,job_details,industry,service_type,industry_depth,location,prep_file_URL,deadline
0,1,22,2015-11-05 14:51:43,{u'list_9048040_choice': u'In the next 24 hour...,Agriculture,interpreting (in-person),Critical. Being able to understand terminology...,Ningbo,,2015-11-06 14:51:43
1,2,22,2015-11-07 03:46:38,{u'list_9048040_choice': u'In the next 24 hour...,Archaeology,interpreting (in-person),A low priority. This only requires everyday fa...,Shanghai,,2015-11-08 03:46:38


In [52]:
# List the distinct service_type values present in the jobs table.
df_jobs['service_type'].unique()

array(['interpreting (in-person)', 'event hosting',
       'interpreting (on the phone)', 'translation'], dtype=object)

In [53]:
# The only service type this notebook matches pandas for (goal step 1).
target_service = "interpreting (in-person)"

In [54]:
# Flag the jobs whose service_type equals the target service.
# NOTE(review): despite its name, this column holds a boolean flag, not a percentage.
df_jobs['match_percentage'] = df_jobs['service_type'].eq(target_service)

In [55]:
# Check the new flag on the first four rows (a mix of service types).
df_jobs.head(4)

Unnamed: 0,id,requester_id,inserted_at,job_details,industry,service_type,industry_depth,location,prep_file_URL,deadline,match_percentage
0,1,22,2015-11-05 14:51:43,{u'list_9048040_choice': u'In the next 24 hour...,Agriculture,interpreting (in-person),Critical. Being able to understand terminology...,Ningbo,,2015-11-06 14:51:43,True
1,2,22,2015-11-07 03:46:38,{u'list_9048040_choice': u'In the next 24 hour...,Archaeology,interpreting (in-person),A low priority. This only requires everyday fa...,Shanghai,,2015-11-08 03:46:38,True
2,3,38,2015-11-10 02:57:34,{u'list_9048040_choice': u'In the next 24 hour...,Advertising,event hosting,Critical. Being able to understand terminology...,Hong Kong,,2015-11-11 02:57:34,False
3,4,53,2015-11-12 10:51:31,"{u'list_9048040_choice': u'In the next week', ...",Medical devices,interpreting (on the phone),Important. I'm willing to tolerate some amount...,,,2015-11-19 10:51:31,False


In [56]:
# Summarise column dtypes and non-null counts for the jobs table.
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 657 entries, 0 to 660
Data columns (total 11 columns):
id                  657 non-null int64
requester_id        657 non-null int64
inserted_at         657 non-null object
job_details         657 non-null object
industry            657 non-null object
service_type        657 non-null object
industry_depth      657 non-null object
location            196 non-null object
prep_file_URL       14 non-null object
deadline            657 non-null object
match_percentage    657 non-null bool
dtypes: bool(1), int64(2), object(8)
memory usage: 57.1+ KB


In [57]:
# Load the extracted pandas table; the first CSV column is used as the row index.
df_pandas = pd.read_csv('../result/extracted_pandas.csv', index_col=0)

In [58]:
# Summarise column dtypes and non-null counts for the pandas table.
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255 entries, 0 to 346
Data columns (total 8 columns):
id              255 non-null int64
inserted_at     255 non-null object
name            255 non-null object
basecity        254 non-null object
prices          255 non-null object
target_price    255 non-null float64
specialties     255 non-null object
capabilities    255 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 17.9+ KB


## Formula

`match_percentage = industry_weight * industry_performance * location_match`

In [59]:
# Preview the first three panda rows, including the specialties/capabilities tag strings.
df_pandas.head(3)

Unnamed: 0,id,inserted_at,name,basecity,prices,target_price,specialties,capabilities
0,1,11/2/2015 5:39:06,test,test,"{u'interpretation_base': u'111', u'interpretat...",111,"{accounting,advertising}",{architecture}
1,2,11/2/2015 15:46:11,David,Taipei,"{u'interpretation_base': u'13', u'interpretati...",12,"{computers_and_computing,it_software_programmi...","{technology,toxicology,photography,music,it,hu..."
5,6,11/3/2015 14:38:13,Christy ZHENG,Beijing,"{u'interpretation_base': u'50', u'interpretati...",120,"{art,entertainment,social_science}","{environment,fashion,finance,human_rights,it,i..."


In [60]:
# List the distinct basecity values to see how consistently cities are spelled.
df_pandas['basecity'].unique()

array(['test', 'Taipei', 'Beijing', 'CHONGQING', 'Shanghai',
       'Huzhou, Zhejiang ', 'Los Angeles', 'Sofia', 'London', "Xi'an",
       'Seattle', 'New York', 'Washington DC', 'Chicago', 'Monterey',
       'Milan', 'San Francisco', 'santa clara ', 'Long Beach, California',
       'Chongqing', 'Chengdu', 'Dublin', 'Guangzhou', 'Hong Kong',
       'Qingdao', 'Paris', 'Moscow', 'Washington D.C.', 'Woodinville',
       'Seoul', 'Berlin, Germany', 'Monterey, California', 'Mexico',
       'Phoenix', 'Indianapolis, IN', '\xe4\xb8\x8a\xe6\xb5\xb7',
       'wenzhou', 'beijing', 'Endor', 'Atlanta', 'Beijing ',
       'Singapore, Shanghai', 'Jakarta', 'Shanghai ', 'Osaka',
       'Pittsburgh', 'Toronto, Canada', 'los angeles', 'Nanjing,Guangzhou',
       'Shenzhen', 'Beijing, Nanjing', 'Tianjin', 'College Park',
       'New York City', 'San Francisco ', 'Changsha', 'West Fargo',
       'chicago', 'Hong Kong, Toronto', 'Boston', 'Shanghai, China',
       'Wuhan', 'Chengdu, chongqing, shenyang, 

In [61]:
# Show the pandas whose basecity is exactly 'Wuhan'.
df_pandas[df_pandas['basecity']=='Wuhan']

Unnamed: 0,id,inserted_at,name,basecity,prices,target_price,specialties,capabilities
166,169,1/2/2016 15:11:29,Gloria,Wuhan,"{u'interpretation_base': u'25', u'interpretati...",40,{},{}
167,170,1/2/2016 16:19:48,Gloria,Wuhan,"{u'interpretation_base': u'25', u'interpretati...",40,"{fashion,finance,humanities}","{business_negotiations_business_development,ed..."
246,249,2/14/2016 4:42:10,Jimmy,Wuhan,"{u'interpretation_base': u'40', u'interpretati...",100,"{marketing,politics,machinery}","{advertising,agriculture,construction,cosmetic..."


In [62]:
# Pick the job with id 2 as the worked example; the result stays a DataFrame, not a Series.
is_sample_job = df_jobs['id'].eq(2)
job_sample = df_jobs[is_sample_job]

In [63]:
# Display the selected sample job.
job_sample

Unnamed: 0,id,requester_id,inserted_at,job_details,industry,service_type,industry_depth,location,prep_file_URL,deadline,match_percentage
1,2,22,2015-11-07 03:46:38,{u'list_9048040_choice': u'In the next 24 hour...,Archaeology,interpreting (in-person),A low priority. This only requires everyday fa...,Shanghai,,2015-11-08 03:46:38,True


In [64]:
# Flag the pandas whose base city equals the sample job's location.
# `job_sample['location']` is a Series, so the scalar is pulled out first;
# comparing each city against the Series itself makes every apply() call
# return a Series instead of a single bool.
# NOTE(review): this is an exact string comparison, so spellings such as
# 'Shanghai ' or 'Shanghai, China' do not count as a match - confirm intended.
job_location = job_sample['location'].iloc[0]
df_pandas['location_match'] = df_pandas['basecity'] == job_location

Metrics used for match_percentage: industry_weight, industry_performance

In [65]:
# Multipliers for the match_percentage formula.
# NOTE(review): industry_weight_metric is not referenced by any cell visible
# here; presumably its keys are matched against the start of a job's
# industry_depth text - confirm.
industry_weight_metric = {
    'Critical': 0.8,
    'Important': 0.5,
}
# Keyed by the panda column in which the job's industry is looked up.
industry_performance_metric = {
    'specialties': 1,
    'capabilities': 0.5,
}

In [66]:
# Sanity check: substring test of a literal against the sample job's industry string.
'Archaeology' in job_sample['industry'].iloc[0]

True

In [67]:
def _as_tag(text):
    """Normalise a label for comparison: trimmed, lower-case, spaces as underscores."""
    return text.strip().lower().replace(' ', '_')


def check_match(field='industry',
                metric=None,
                job=None,
                panda=None,
                other=0.1):
    """Score how well one panda covers the value in a job's ``field``.

    The job's value (first row of ``job[field]``) is looked for, as a
    substring, in the first-row value of every panda column named in
    ``metric``. Both sides are normalised first (lower-cased, spaces turned
    into underscores) so that a job industry such as 'Medical devices' can
    match a panda tag such as 'medical_devices'.

    Parameters
    ----------
    field : str
        Column of ``job`` holding the value to look for.
    metric : dict or None
        Maps a panda column name to the score returned when that column
        contains the job's value. ``None`` uses the notebook-level
        ``industry_performance_metric``.
    job : DataFrame or None
        Frame whose first row is the job. ``None`` uses the notebook-level
        ``job_sample``.
    panda : DataFrame or None
        Frame whose first row is the panda. ``None`` uses the row(s) of the
        notebook-level ``df_pandas`` with id 2.
    other : number
        Score returned when no column matches.

    Returns
    -------
    The highest score among the matching columns, or ``other`` if none match
    (also when the job value is empty or not a string).

    Raises
    ------
    IndexError
        If ``job`` or ``panda`` has no rows.
    KeyError
        If ``field`` or a ``metric`` key is not a column of its frame.
    """
    # Defaults are resolved at call time so that re-assigning the notebook
    # globals in a later cell is picked up without re-defining the function.
    if metric is None:
        metric = industry_performance_metric
    if job is None:
        job = job_sample
    if panda is None:
        panda = df_pandas[df_pandas['id'] == 2]

    wanted = job[field].iloc[0]
    if not hasattr(wanted, 'lower'):
        # Missing value (e.g. NaN): nothing to look for.
        return other
    needle = _as_tag(wanted)
    if not needle:
        # An empty string is a substring of everything; do not treat it as a match.
        return other

    # Try the highest score first so the result does not depend on dict
    # iteration order; .items() works on both Python 2 and Python 3.
    ranked = sorted(metric.items(), key=lambda item: item[1], reverse=True)
    for column, score in ranked:
        offered = panda[column].iloc[0]
        if hasattr(offered, 'lower') and needle in _as_tag(offered):
            return score
    return other

In [68]:
# Smoke test: call with every default (sample job against the panda with id 2).
check_match()

0.1

In [69]:
# Score every panda against the default sample job, one row at a time.
# Handing check_match the row's own one-row slice avoids re-filtering the
# whole frame by id for each row (quadratic work) and keeps rows that share
# an id from all being scored against the first of them.
df_pandas['industry_performance_match'] = [
    check_match(panda=df_pandas.iloc[[row_pos]])
    for row_pos in range(len(df_pandas))
]

In [70]:
# Inspect the two new columns (location_match, industry_performance_match) on the first rows.
df_pandas.head(3)

Unnamed: 0,id,inserted_at,name,basecity,prices,target_price,specialties,capabilities,location_match,industry_performance_match
0,1,11/2/2015 5:39:06,test,test,"{u'interpretation_base': u'111', u'interpretat...",111,"{accounting,advertising}",{architecture},False,0.1
1,2,11/2/2015 15:46:11,David,Taipei,"{u'interpretation_base': u'13', u'interpretati...",12,"{computers_and_computing,it_software_programmi...","{technology,toxicology,photography,music,it,hu...",False,0.1
5,6,11/3/2015 14:38:13,Christy ZHENG,Beijing,"{u'interpretation_base': u'50', u'interpretati...",120,"{art,entertainment,social_science}","{environment,fashion,finance,human_rights,it,i...",False,0.1


In [71]:
# Final score per panda: the industry score, zeroed out when the location does not match.
# NOTE(review): the formula stated in this notebook also multiplies by
# industry_weight, which is not applied here.
industry_score = df_pandas['industry_performance_match']
df_pandas['match_percentage'] = industry_score.mul(df_pandas['location_match'])

In [77]:
# Count how many pandas share each match_percentage value.
df_pandas['match_percentage'].value_counts()

0.0    225
0.1     30
dtype: int64

In [79]:
# Rank the pandas from best to worst match for the sample job.
# DataFrame.sort(columns=...) was deprecated and later removed from pandas;
# sort_values(by=...) is the supported spelling and sorts rows by default.
temp = df_pandas.sort_values(by='match_percentage', ascending=False)

In [81]:
# Show the five top-ranked rows of the sorted frame.
temp[:5]

Unnamed: 0,id,inserted_at,name,basecity,prices,target_price,specialties,capabilities,location_match,industry_performance_match,match_percentage
51,53,11/20/2015 2:39:06,Silvia,Shanghai,"{u'interpretation_base': u'150', u'interpretat...",200,"{economics,finance,stocks_and_investment}","{banking,consumer_goods,fashion,government_and...",True,0.1,0.1
203,206,1/21/2016 4:11:43,Yaling Jiang,Shanghai,"{u'interpretation_base': u'100', u'interpretat...",120,"{media_information_and_comms,art,journalism}","{business_negotiations_business_development,ed...",True,0.1,0.1
86,89,12/5/2015 13:48:48,Kate XIAO,Shanghai,"{u'interpretation_base': u'70', u'interpretati...",90,"{finance,government_and_politics,plant_shipbui...","{banking,business_negotiations_business_develo...",True,0.1,0.1
28,29,11/13/2015 6:00:45,ZI YE,Shanghai,"{u'interpretation_base': u'180', u'interpretat...",200,"{banking,finance,business_negotiations_busines...","{accounting,advertising,clothing,consumer_good...",True,0.1,0.1
83,86,12/4/2015 13:12:37,Eva Chen,Shanghai,"{u'interpretation_base': u'40', u'interpretati...",50,"{social_science,travel_and_tourism,business_ne...","{entertainment,environment,fashion,media_infor...",True,0.1,0.1
