In [None]:
# Install/upgrade PyDrive, which the next cell imports to talk to Google Drive.
# %pip (instead of !pip) installs into the environment of the running kernel.
# NOTE(review): the version is not pinned; consider pinning for reproducibility.
%pip install -U -q PyDrive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# 2. List the non-trashed files whose parent is the Drive folder with this id.
DRIVE_FOLDER_ID = '1DIF8C7O9SUS3puW3NHXymKa2Nz3GG8L7'
file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % DRIVE_FOLDER_ID}).GetList()

# 3. Download the first file titled 'wikidata_raw.zip' into the working directory.
for file1 in file_list:
    if file1['title'] == 'wikidata_raw.zip':
        preprocessorFile = drive.CreateFile({'id': file1['id']})
        preprocessorFile.GetContentFile('wikidata_raw.zip')
        print('title: %s, id: %s is downloaded' % (file1['title'], file1['id']))
        break
else:
    # Fail here with a clear message instead of letting the later unzip step
    # fail on a missing archive.
    raise FileNotFoundError("'wikidata_raw.zip' was not found in the Drive folder")

In [None]:
# Extract the downloaded archive into the working directory. Later cells read
# from a 'wikidata_raw' folder, which is presumably what the archive contains.
!unzip wikidata_raw.zip

In [None]:
# Delete the archive file itself; only the extracted contents are used from here on.
!rm wikidata_raw.zip

In [None]:
import os
import re

In [None]:
wikidata_filepath = 'wikidata_raw'
wikidata_entries_filepath = os.path.join(wikidata_filepath, 'wikidata_entries')

# Entity numbers: map each company name in links.csv to the number formed by
# the digits of its URL column (presumably the Wikidata Q-number), and back.
# NOTE: SPLP could not be found on Wikidata
name_to_num = {}
num_to_name = {}
entities = set()
with open(os.path.join(wikidata_filepath, 'links.csv')) as file:
    # Skip the first line (presumably the CSV header); the default keeps an
    # empty file from raising StopIteration.
    next(file, None)
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        name, url = line.split(',')
        # Keep only the digits of the URL column. Raw string: '\D' is not a
        # valid escape sequence in a normal string literal.
        entity_num = int(re.sub(r'\D', '', url))
        name_to_num[name] = entity_num
        num_to_name[entity_num] = name
        entities.add(entity_num)

# Building first-order relationships: every known company starts with no
# neighbours. SPLP has no links.csv row (see note above), so add it by hand.
graph = {name: [] for name in name_to_num}
graph['SPLP'] = []

# First-order relations: a company is linked to every known entity whose
# Q-number is referenced in one of that company's entry files. The company key
# is the part of the file name before the first underscore.
for filename in os.listdir(wikidata_entries_filepath):
    if filename == '.DS_Store':
        continue

    orig_entity = filename.split('_')[0]

    # Read the file once and collect the complete Q-numbers it mentions.
    # Matching whole 'Q<digits>' tokens (rather than testing whether the bare
    # digits occur anywhere in the text) keeps entity 31 from matching 'Q312',
    # 'P31' or an unrelated number.
    with open(os.path.join(wikidata_entries_filepath, filename)) as file:
        referenced_nums = {int(num) for num in re.findall(r'Q(\d+)', file.read())}

    for e_id in entities:
        if e_id in referenced_nums:
            graph[orig_entity].append(num_to_name[e_id])

# Make graph non-directional: if A lists B, make sure B lists A as well.
for co1 in graph.keys():
    co2s = graph[co1]
    for co2 in co2s:
        if co1 not in graph[co2]:
            graph[co2].append(co1)

# Build graph mapping companies to all their related entities
# A line is considered only if it contains 'Q<digit>' with at least one
# character before and after it.
entity_regex = re.compile(".+Q[0-9].+")
# Locates the 'Q<digit>' token itself. Searching for a bare 'Q' would stop at
# any earlier capital Q in the line (e.g. the one in "NASDAQ") and record junk.
entity_start_regex = re.compile("Q[0-9]")

company_to_entities = {name: [] for name in name_to_num}
company_to_entities['SPLP'] = []

for filename in os.listdir(wikidata_entries_filepath):
    if filename == '.DS_Store':
        continue

    # The company key is the part of the file name before the first underscore.
    orig_entity = filename.split('_')[0]

    with open(os.path.join(wikidata_entries_filepath, filename)) as file:
        for line in file:
            if not entity_regex.match(line):
                continue

            # entity_regex matched, so a 'Q<digit>' token is guaranteed to exist.
            q_index = entity_start_regex.search(line).start()
            s_index = line.find(' ', q_index)
            if s_index == -1:
                # No space terminates the token on this line; skip it.
                continue

            # Text between the 'Q' and the next space.
            related_entity = line[q_index + 1:s_index]

            # Tokens containing '-' are ignored.
            if '-' in related_entity:
                continue
            if related_entity not in company_to_entities[orig_entity]:
                company_to_entities[orig_entity].append(related_entity)

# Build second-order relations
# Every company from links.csv, plus the manually added SPLP, starts with an
# empty set of second-order neighbours.
graph_2 = {name: set() for name in name_to_num}
graph_2['SPLP'] = set()

def common_member(a, b):
    """Return True if the iterables `a` and `b` share at least one element.

    Both arguments are converted to sets, so their elements must be hashable.
    Returns False when either is empty.
    """
    shared = set(a) & set(b)
    return bool(shared)

# Two companies are second-order related when their entity lists overlap.
# NOTE(review): a company with a non-empty entity list also overlaps with
# itself, so it is recorded as its own neighbour; confirm this is intended.
for company in company_to_entities.keys():
    for other_company in company_to_entities.keys():
        if common_member(company_to_entities[company], company_to_entities[other_company]):
            graph_2[company].add(other_company)

# Make graph non-directional
for co1 in graph_2.keys():
    # Iterate over a snapshot so adding to another company's set can never
    # disturb the iteration.
    for co2 in list(graph_2[co1]):
        if co1 not in graph_2[co2]:
            # Must update graph_2 (set values); the first-order `graph` holds
            # lists, which have no .add().
            graph_2[co2].add(co1)

# First and second order graphs combined
# For every company (links.csv names plus SPLP) take the union of its
# first-order and second-order neighbours.
graph_1_2 = {}
for company in list(name_to_num) + ['SPLP']:
    first_order = set(graph[company])
    second_order = set(graph_2[company])
    graph_1_2[company] = first_order | second_order



In [None]:
# Download the first file titled 'tweet_output.zip' from the Drive listing
# obtained earlier (`file_list`) into the working directory.
for file1 in file_list:
    if file1['title'] == 'tweet_output.zip':
        preprocessorFile = drive.CreateFile({'id': file1['id']})
        preprocessorFile.GetContentFile('tweet_output.zip')
        print('title: %s, id: %s is downloaded' % (file1['title'], file1['id']))
        break
else:
    # Fail here with a clear message instead of letting the later unzip step
    # fail on a missing archive.
    raise FileNotFoundError("'tweet_output.zip' was not found in the Drive folder")

In [None]:
# Extract the downloaded archive into the working directory. Later cells read
# from a 'tweet_output' folder, which is presumably what the archive contains.
!unzip tweet_output.zip

In [None]:
# Delete the archive file; it is rebuilt from the 'tweet_output' folder at the end.
!rm tweet_output.zip

In [None]:
import numpy as np

In [None]:
# Load the array stored in stock_key_seq.np. Its entries are used below as
# keys into graph_1_2, so presumably it is the ordered list of stock names.
with open('tweet_output/stock_key_seq.np', 'rb') as key_file:
    tweetseq = np.load(key_file)

In [None]:
# Relation matrix over the stocks in `tweetseq`: entry [r, c] is 1 when stock c
# is a first- or second-order neighbour of stock r in graph_1_2, else 0.
# The size follows len(tweetseq) instead of a hard-coded 88.
num_stocks = len(tweetseq)

# Position of the first occurrence of each key in tweetseq.
stock_index = {}
for position, key in enumerate(tweetseq):
    stock_index.setdefault(key, position)

# Allocate once (float64) instead of growing the array with vstack per row.
output = np.zeros((num_stocks, num_stocks))
for row, stock in enumerate(tweetseq):
    for related in graph_1_2[stock]:
        column = stock_index.get(related)
        if column is None:
            # The related company is not one of the stocks in tweetseq, so it
            # has no column in the matrix; skip it.
            continue
        output[row, column] = 1

In [None]:
# Write the relation matrix into the tweet_output folder in NumPy .npy format.
with open('tweet_output/graph_relation.npy', 'wb') as relation_file:
    np.save(relation_file, output)

In [None]:
# Re-archive the tweet_output folder, which now also holds graph_relation.npy.
!zip -r tweet_output.zip tweet_output