In [1]:
#Import basic packages
import os
import numpy as np
import pandas as pd
import csv
import regex as re
import datetime as dt 

In [2]:
# Load the final outcome table; the five milestone columns are parsed as datetimes.
main_df = pd.read_csv(
    R"d:/msc-project/data/final/outcome_final_v1.csv",
    encoding='utf-8',
    parse_dates=[
        'founded_on',
        'seed_date',
        'series_a_date',
        'series_b_date',
        'series_c_date',
    ],
)
# Base frame holding only the organisation identifier; the feature cells
# further down left-join onto it.
main_processing_df = main_df[['org_uuid']]

In [3]:
# Load the pre-processed organisations table, parsing its four date columns.
org_ori_df = pd.read_csv(
    R"d:/msc-project/data/pre-processed/organisations_preprocessed.csv",
    encoding='utf-8',
    parse_dates=[
        'founded_on',
        'went_public_on',
        'acquired_on',
        'closed_on',
    ],
)

In [4]:
# List the column names available in the organisations table.
org_ori_df.columns

Index(['org_uuid', 'name', 'legal_name', 'homepage_url', 'country_code',
       'state_code', 'region', 'city', 'address', 'postal_code', 'status',
       'short_description', 'category_list', 'category_groups_list',
       'num_funding_rounds', 'total_funding_usd', 'total_funding',
       'total_funding_currency_code', 'founded_on', 'last_funding_on',
       'closed_on', 'employee_count', 'email', 'phone', 'facebook_url',
       'linkedin_url', 'twitter_url', 'logo_url', 'primary_role', 'num_exits',
       'description', 'ipo_uuid', 'stock_exchange_symbol', 'stock_symbol',
       'went_public_on', 'share_price_usd', 'share_price',
       'share_price_currency_code', 'valuation_price_usd', 'valuation_price',
       'valuation_price_currency_code', 'money_raised_usd', 'money_raised',
       'money_raised_currency_code', 'acquisition_uuid', 'acquirer_uuid',
       'acquirer_name', 'acquirer_country_code', 'acquirer_state_code',
       'acquirer_region', 'acquirer_city', 'acquisition_type

# Social Media Presence (LinkedIn, Facebook, Twitter)

In [5]:
# Keep the organisation id together with the three social-media URL columns.
org_processing_df_v1 = org_ori_df[['org_uuid','facebook_url','linkedin_url','twitter_url']]

In [6]:
# Preview the first ten rows of the selected URL columns.
org_processing_df_v1.head(10)

Unnamed: 0,org_uuid,facebook_url,linkedin_url,twitter_url
0,e1393508-30ea-8a36-3f96-dd3226033abd,https://www.facebook.com/Wetpaint,https://www.linkedin.com/company/wetpaint,https://twitter.com/wetpainttv
1,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,http://www.facebook.com/digg,http://www.linkedin.com/company/digg,http://twitter.com/digg
2,df662812-7f97-0b43-9d3e-12f64f504fbb,https://www.facebook.com/facebook/,http://www.linkedin.com/company/facebook,https://twitter.com/facebook
3,60485007-8856-bbac-aa1b-c535c41f5f47,http://www.facebook.com/Nomadesk,,http://twitter.com/Nomadesk
4,4111dc8b-c0df-2d24-ed33-30cd137b3098,,,http://twitter.com/geni
5,5da6106f-0d27-0d37-e9d7-dcfeccc1f709,https://www.facebook.com/twitterinc,http://www.linkedin.com/company/twitter,http://twitter.com/twitter
6,3d16cb4c-911e-75c0-de5a-15c316b39f98,https://www.facebook.com/StumbleUpon,http://www.linkedin.com/company/stumbleupon,http://twitter.com/stumbleupon
7,21e77067-5537-408e-cad7-e5e72bb6ad86,http://www.facebook.com/Scribd,http://www.linkedin.com/company/scribd,http://www.twitter.com/scribd
8,ea091a8c-40e0-0607-e05a-86e734f94ade,http://www.facebook.com/SlackerRadio,https://www.linkedin.com/company/slacker,http://twitter.com/SlackerRadio
9,59fada33-1595-de45-b362-062a04cf51cf,,,


In [7]:
# Left-join the URL columns onto the base organisation list so that every
# org_uuid in main_processing_df is kept, with NaN where no match exists.
org_processing_df_v1 = main_processing_df.merge(
    org_processing_df_v1,
    how='left',
    on='org_uuid',
)
org_processing_df_v1.head(3)

Unnamed: 0,org_uuid,facebook_url,linkedin_url,twitter_url
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,,,
1,9c8adac6-5c8e-9344-b763-6beab966c63c,,,
2,6749cc07-efed-ff09-4efe-43faf6f580de,,,


In [8]:
# Row count after the left join.
len(org_processing_df_v1)

28727

In [9]:
# Encode social-media presence as binary flags: 1 when a URL is recorded,
# 0 when it is missing.
# One vectorised assignment replaces the chained `df[col][mask] = 1` writes,
# which trigger SettingWithCopyWarning and have no effect once pandas
# copy-on-write is enabled.
# org_uuid is left untouched (the former whole-frame fillna(0) would have
# turned a missing id into 0).
social_url_columns = ['facebook_url', 'linkedin_url', 'twitter_url']
org_processing_df_v1[social_url_columns] = (
    org_processing_df_v1[social_url_columns].notnull().astype(int)
)

In [10]:
# Preview the URL columns after conversion to 0/1 flags.
org_processing_df_v1.head(3)

Unnamed: 0,org_uuid,facebook_url,linkedin_url,twitter_url
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,0,0,0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0,0,0
2,6749cc07-efed-ff09-4efe-43faf6f580de,0,0,0


In [11]:
# Use org_uuid as the row index so the feature frames can later be aligned on it.
org_processing_df_v1 = org_processing_df_v1.set_index('org_uuid')

In [12]:
# Prefix every column with 'has_' so the names read as presence flags.
columns_name = ['has_' + column for column in org_processing_df_v1.columns]
columns_name_dict = dict(zip(org_processing_df_v1.columns, columns_name))
org_processing_df_v1 = org_processing_df_v1.rename(columns=columns_name_dict)

In [13]:
# Preview the frame with the renamed 'has_*' columns.
org_processing_df_v1.head(3)

Unnamed: 0_level_0,has_facebook_url,has_linkedin_url,has_twitter_url
org_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9dc17185-743d-5fbc-f03b-ad0c81c8795d,0,0,0
9c8adac6-5c8e-9344-b763-6beab966c63c,0,0,0
6749cc07-efed-ff09-4efe-43faf6f580de,0,0,0


# Homepage URL

In [14]:
# Keep the organisation id together with its homepage URL.
org_processing_df_v2 = org_ori_df[['org_uuid','homepage_url']]

In [15]:
# Left-join the homepage column onto the base organisation list; organisations
# without a match keep a NaN homepage.
org_processing_df_v2 = main_processing_df.merge(
    org_processing_df_v2,
    how='left',
    on='org_uuid',
)
org_processing_df_v2.head(3)

Unnamed: 0,org_uuid,homepage_url
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,
1,9c8adac6-5c8e-9344-b763-6beab966c63c,http://nauticusnet.com/
2,6749cc07-efed-ff09-4efe-43faf6f580de,http://www.bioprocessors.com


In [16]:
# Encode homepage presence as a binary flag: 1 when a URL is recorded, 0 when missing.
# A direct column assignment replaces the chained `df[col][mask] = 1` write,
# which triggers SettingWithCopyWarning and has no effect once pandas
# copy-on-write is enabled. org_uuid is left untouched.
org_processing_df_v2['homepage_url'] = (
    org_processing_df_v2['homepage_url'].notnull().astype(int)
)
org_processing_df_v2.head(5)

Unnamed: 0,org_uuid,homepage_url
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,1
2,6749cc07-efed-ff09-4efe-43faf6f580de,1
3,bbc29f4a-a798-4f74-f0a0-b1d8267692ac,1
4,9bbaf540-5994-029a-3346-501ad2d1fb89,1


In [17]:
# Rename the flag column to reflect that it now holds presence, not the URL itself.
org_processing_df_v2 = org_processing_df_v2.rename(
    columns={'homepage_url': 'has_homepage'}
)
org_processing_df_v2.head(5)

Unnamed: 0,org_uuid,has_homepage
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,1
2,6749cc07-efed-ff09-4efe-43faf6f580de,1
3,bbc29f4a-a798-4f74-f0a0-b1d8267692ac,1
4,9bbaf540-5994-029a-3346-501ad2d1fb89,1


In [18]:
# Index the homepage feature by org_uuid so it can be aligned with the other feature frames.
org_processing_df_v2 = org_processing_df_v2.set_index('org_uuid')
org_processing_df_v2.head(5)

Unnamed: 0_level_0,has_homepage
org_uuid,Unnamed: 1_level_1
9dc17185-743d-5fbc-f03b-ad0c81c8795d,0
9c8adac6-5c8e-9344-b763-6beab966c63c,1
6749cc07-efed-ff09-4efe-43faf6f580de,1
bbc29f4a-a798-4f74-f0a0-b1d8267692ac,1
9bbaf540-5994-029a-3346-501ad2d1fb89,1


# Event Appearance

In [19]:
# Load the pre-processed event appearances table (one row per participant appearance).
event_appear_df = pd.read_csv(
    R"d:/msc-project/data/pre-processed/event_appearances_preprocessed.csv",
    encoding='utf-8',
)

In [20]:
# Preview the first five event-appearance rows.
event_appear_df.head(5)

Unnamed: 0,event_appear_uuid,event_appear_name,type,event_uuid,event_name,participant_uuid,participant_name,participant_type,appearance_type,short_description
0,30c4023d-b694-4704-97d7-cd99cd06ecce,Adobe MAX 2014's sponsor - CDW Corporation,event_appearance,135a927a-b238-037e-50e2-d3a2e3511ed2,Adobe MAX 2014,ae883a31-3739-7eb9-4ce0-463ee070ed79,CDW Corporation,organization,sponsor,
1,dffe0542-3dac-4228-a997-3eed42cbd67e,Adobe MAX 2014's sponsor - Microsoft,event_appearance,135a927a-b238-037e-50e2-d3a2e3511ed2,Adobe MAX 2014,fd80725f-53fc-7009-9878-aeecf1e9ffbb,Microsoft,organization,sponsor,
2,21884e6c-d097-4c54-8513-10536a0ea60d,Adobe MAX 2014's sponsor - Twitter,event_appearance,135a927a-b238-037e-50e2-d3a2e3511ed2,Adobe MAX 2014,5da6106f-0d27-0d37-e9d7-dcfeccc1f709,Twitter,organization,sponsor,
3,c3782f8f-4684-47d6-8649-d0b3e1825eb5,Adobe MAX 2014's sponsor - Hewlett-Packard,event_appearance,135a927a-b238-037e-50e2-d3a2e3511ed2,Adobe MAX 2014,8adadbfb-be63-6602-8ca0-b037397a7038,Hewlett-Packard,organization,sponsor,
4,55e2ee8b-b5b7-49d3-bd21-a63eedf6b7f2,Adobe MAX 2014's sponsor - Intel,event_appearance,135a927a-b238-037e-50e2-d3a2e3511ed2,Adobe MAX 2014,1e4f199c-363b-451b-a164-f94571075ee5,Intel,organization,sponsor,


In [21]:
# Count the number of appearance rows per event_uuid, most frequent first.
event_appear_df['event_uuid'].value_counts()

44f32ad8-b8b4-4f33-9240-25ac4e64c5a3    3424
55b8f2a5-0cea-be6c-a674-a2242a6d36a4    3119
44ed4b3c-3410-a53c-ae20-bd64a63a890c    2076
746dad2c-783e-2234-f4e9-f83cdfdb28db    1627
55bf117e-47b1-4578-99e2-0f22f9d364d2    1408
                                        ... 
398ea9e9-24da-4174-b3f7-d0cab98bedce       1
9fd35cc0-9ece-4aaa-adde-2eb679448097       1
79325ff2-4d9e-9038-9524-a19095fec068       1
36c36538-ca0b-493b-a857-ec757c66efad       1
097bce4e-eb66-4471-8287-8bbedd9d50cc       1
Name: event_uuid, Length: 19445, dtype: int64

In [22]:
# Number of events that make up the top 1% by appearance count.
# The total number of distinct events is computed from the data rather than
# hard-coded (previously 19445), so the cut-off follows the input file.
percentile_1 = int(0.01 * event_appear_df['event_uuid'].nunique())
# Appearance counts of the `percentile_1` most frequent events, indexed by event_uuid.
top1percentile_events_uuid = event_appear_df['event_uuid'].value_counts().nlargest(percentile_1)

In [23]:
# Display the appearance counts of the top 1% events.
top1percentile_events_uuid

44f32ad8-b8b4-4f33-9240-25ac4e64c5a3    3424
55b8f2a5-0cea-be6c-a674-a2242a6d36a4    3119
44ed4b3c-3410-a53c-ae20-bd64a63a890c    2076
746dad2c-783e-2234-f4e9-f83cdfdb28db    1627
55bf117e-47b1-4578-99e2-0f22f9d364d2    1408
                                        ... 
77efb84d-60bf-487e-963e-d5ec4360bf70     306
608ce886-0c3c-4662-b583-b43e481dafb1     306
4bc7a540-bbfb-42ef-9331-c06325a80f6a     304
27cf5c6b-0abb-4613-812b-8e344aa6426f     304
758a5b72-bde7-3250-510b-427a16c2456c     304
Name: event_uuid, Length: 194, dtype: int64

In [24]:
# Keep the appearance rows that belong to top 1% events.
# The cut-off is the smallest appearance count in `top1percentile_events_uuid`,
# replacing the hard-coded 304 (which also contradicted the old comment that
# mentioned 100 occurrences). As before, events tied with the cut-off count
# are included.
df=event_appear_df
min_top_event_size = top1percentile_events_uuid.min()
top1percentile_events_df = df[
    df.groupby('event_uuid')['event_uuid'].transform('size') >= min_top_event_size
]

In [25]:
# Preview the appearance rows retained for the top events.
top1percentile_events_df.head(5)

Unnamed: 0,event_appear_uuid,event_appear_name,type,event_uuid,event_name,participant_uuid,participant_name,participant_type,appearance_type,short_description
304,dd9da104-32dd-40ea-a947-73dd875504b6,CES 2015 Las Vegas's exhibitor - 3D Systems,event_appearance,9e828ece-85a3-1b8a-8634-0886c770f488,CES 2015 Las Vegas,dcf536bb-6612-908f-dbdb-4ad44e9c27bd,3D Systems,organization,exhibitor,
305,6e4007b4-5f3b-42e5-b89a-d897a02f2917,CES 2015 Las Vegas's exhibitor - Analogix Semi...,event_appearance,9e828ece-85a3-1b8a-8634-0886c770f488,CES 2015 Las Vegas,e014dc5d-374c-da88-c500-7840600d8d53,Analogix Semiconductor,organization,exhibitor,
306,52b71cd7-cc1a-421d-8bf3-8878a8f6c325,CES 2015 Las Vegas's exhibitor - Bosch,event_appearance,9e828ece-85a3-1b8a-8634-0886c770f488,CES 2015 Las Vegas,dcf152a1-23fc-6a0e-9a76-68c3fc2ec472,Bosch,organization,exhibitor,
307,08ed3a64-d82b-4a0d-957c-01341aae5d99,CES 2015 Las Vegas's exhibitor - Intel,event_appearance,9e828ece-85a3-1b8a-8634-0886c770f488,CES 2015 Las Vegas,1e4f199c-363b-451b-a164-f94571075ee5,Intel,organization,exhibitor,
7323,50f8ec76-b2bf-496b-ac5d-457dc3e52146,CES 2015 Las Vegas's exhibitor - PEOPLE PEOPLE,event_appearance,9e828ece-85a3-1b8a-8634-0886c770f488,CES 2015 Las Vegas,eba013ad-ca03-e5b1-a837-a8a067e189c2,PEOPLE PEOPLE,organization,exhibitor,


In [26]:
# De-duplicate the participant ids seen at the top events and report how many there are.
top1percentile_attendees = [*set(top1percentile_events_df['participant_uuid'])]
len(top1percentile_attendees)

69325

In [None]:
# Flag each organisation with 1 if its org_uuid is among the top-event
# participants, else 0. The result is a plain list of ints in the row order of
# main_processing_df.
# `Series.isin` hashes the lookup values once, instead of scanning the
# participant list linearly for every organisation as the Python loop did.
is_attend_top1percentile_event = (
    main_processing_df['org_uuid']
    .isin(top1percentile_attendees)
    .astype(int)
    .tolist()
)

In [None]:
# Work on an independent copy so adding the flag column does not modify main_processing_df.
org_processing_df_v3 = main_processing_df.copy()

In [None]:
# Attach the top-event attendance flags as a new column and preview the result.
org_processing_df_v3 = org_processing_df_v3.assign(
    is_attend_top_event=is_attend_top1percentile_event
)
org_processing_df_v3.head(3)

In [None]:
# Distribution of the attendance flag (number of organisations per flag value).
org_processing_df_v3['is_attend_top_event'].value_counts()

In [None]:
# Index the attendance feature by org_uuid so it can be aligned with the other feature frames.
org_processing_df_v3 = org_processing_df_v3.set_index('org_uuid')

# Organisation Description

In [None]:
# Keep the organisation id together with its long description text.
org_processing_df_v4 = org_ori_df[['org_uuid','description']]

In [None]:
# Left-join the description column onto the base organisation list; organisations
# without a match keep a NaN description.
org_processing_df_v4 = main_processing_df.merge(
    org_processing_df_v4,
    how='left',
    on='org_uuid',
)
org_processing_df_v4.head(3)

In [None]:
# Encode description presence as a binary flag: 1 when text is recorded, 0 when missing.
# A direct column assignment replaces the chained `df[col][mask] = 1` write and
# the chained in-place `fillna` on the column; both trigger pandas
# chained-assignment warnings and have no effect once copy-on-write is enabled.
org_processing_df_v4['description'] = (
    org_processing_df_v4['description'].notnull().astype(int)
)

In [None]:
# Preview the description column after conversion to a 0/1 flag.
org_processing_df_v4.head(3)

In [None]:
# Rename the flag column to reflect that it now holds presence, not the text itself.
org_processing_df_v4 = org_processing_df_v4.rename(
    columns={'description': 'has_description'}
)

In [None]:
# Preview the frame with the renamed 'has_description' column.
org_processing_df_v4.head(3)

In [None]:
# Index the description feature by org_uuid so it can be aligned with the other feature frames.
org_processing_df_v4 = org_processing_df_v4.set_index('org_uuid')

# Merge features, rename columns and save file

In [None]:
# Move org_uuid from a column into the index of the base frame.
main_processing_df = main_processing_df.set_index('org_uuid')

In [None]:
# Combine the four feature frames side by side, aligned on their shared org_uuid index.
main_processing_df = pd.concat(
    [
        org_processing_df_v1,
        org_processing_df_v2,
        org_processing_df_v3,
        org_processing_df_v4,
    ],
    axis='columns',
)
main_processing_df.head(3)

In [None]:
# Row count of the combined feature frame.
len(main_processing_df)

In [None]:
# Prefix every feature column with 'network_' to mark the feature group in the output file.
new_column_name = ['network_' + column for column in main_processing_df.columns]
new_column_dict = dict(zip(main_processing_df.columns, new_column_name))
main_processing_df = main_processing_df.rename(columns=new_column_dict)
main_processing_df.head(3)

In [None]:
# Write the combined features to disk; the index is included in the file.
# The encoding literal is spelled 'utf-8', matching the read_csv calls in this
# notebook (it was previously written as 'utf=8', which looks like a typo).
main_processing_df.to_csv(r'd:\msc-project\data\final\features_network.csv',encoding='utf-8')