### Apply the Google word2vec model to the extracted phrases and save the vectors and metadata to TSV files for the Embedding Projector

In [1]:
import pandas as pd
import numpy as np
import re
import os
from nltk.corpus import stopwords
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

#### load and clean keyphrase dataframe

In [2]:
# Directory the input CSV is read from and the generated TSV files are written to
output_path = '../Output'

In [3]:
# Load the keyword-extraction results. The path is built from `output_path`
# so the directory is configured in a single place.
raw_df = pd.read_csv(os.path.join(output_path, 'Submissions_with_Keyword_Extraction_Graphical.csv'))

In [4]:
# Quick inspection: list the available columns, then the distinct Webstorm names
print(raw_df.columns)
raw_df['Webstorm'].unique()

Index(['Unnamed: 0', 'Webstorm', 'Webstorm ID', 'Code', 'Category',
       'Submitted', 'Submitter', 'Submitter email', 'Title', 'Description',
       'Status', 'Stage', 'Step', 'Score', 'Count of Comments',
       'Number of Views', 'Financial Outcomes', 'Projected Net Benefit',
       'Comments', 'Associated Tags', 'Has Attachment?', 'Linked?', 'URL',
       'Previous Status', 'Date Entered Status', 'Days In Current Status',
       'Merged', 'Merged With', 'Screen Name', 'Email', 'First Name',
       'Last Name', 'Job Title', 'Department', 'Address', 'Phone', 'Mobile',
       'Location', 'Position', 'Work History', 'Expert Skills', 'Photo',
       'Submitted From', 'External ID', 'Internal_External', 'Title_modified',
       'Team_Submission', 'text rank'],
      dtype='object')


array(['IMF Knowledge Sharing Challenge', 'Innovation Wall of Fame',
       'Middle East Inclusive Growth Forum',
       'SME Financial Inclusion Forum', 'Virtual Mentorship',
       'AI & ML Challenge', 'Big Data',
       'Box Knowledge Management Success Stories',
       'Capacity Development Technology Challenge',
       'Challenge Topic Discovery (March 2015)',
       'CSF Continuous Improvement Challenge',
       'Digital Advisory Marketplace', 'GFSR Analytic Chapter Topics',
       'Health & Wellness Framework',
       'Health and Wellness Event Challenge Feb 2015',
       'Ideas@Work Challenge Template', 'iLab Accelerator Bootcamp',
       'iLab Accelerator Projects', 'iLab Requests',
       'IMF External Training Challenge', 'InnoFest Contest',
       'Innovation Awards', 'Innovation Community Board',
       'ITD Branding Challenge', 'Multi-Sports Day',
       'Problem Solving Challenge', 'Recognition', 'Shark Tank',
       'Template WebStorm', 'The Innovation Fund', 'Volunteer

In [5]:
def clean(old_list):
    """Split a comma-separated string into a list of cleaned pieces.

    Every comma-separated piece has each run of characters other than ASCII
    letters and digits replaced by a single space, and is then stripped of
    leading and trailing spaces.

    Parameters
    ----------
    old_list : object
        Raw cell value. Only ``str`` values are parsed.

    Returns
    -------
    list of str
        The cleaned pieces in their original order, or ``[]`` for any
        non-string input (float NaN, ``None``, NumPy floats, ...).
    """
    # Test for "is a string" rather than "is not a float": a missing value
    # that is not exactly a built-in float (None, numpy.float64) would
    # otherwise reach .split() and raise AttributeError.
    if not isinstance(old_list, str):
        return []

    return [
        re.sub('[^A-Za-z0-9]+', ' ', piece).strip(' ')
        for piece in old_list.split(',')
    ]

# Parse every 'text rank' cell into a flat list of cleaned pieces
parsed_phrases = raw_df['text rank'].apply(clean)

# One column per piece; everything except the trailing 'extra' column is kept.
# NOTE(review): presumably pN is the N-th phrase and sN its score - confirm against the extraction step.
token_columns = ['p1','s1','p2','s2','p3','s3','p4','s4','p5','s5','extra']
phrase_df = pd.DataFrame(parsed_phrases.tolist(), columns=token_columns)
phrase_df = phrase_df[[col for col in token_columns if col != 'extra']]

# select relevant meta data
meta_data_columns = ['Webstorm','Department','Number of Views','Count of Comments','Internal_External']
selected_df = raw_df[meta_data_columns]
wide_df = pd.concat([selected_df, phrase_df], axis=1)

# Long format: one row per submission and phrase slot (p1..p5)
new_df = wide_df.melt(id_vars=meta_data_columns, value_vars=['p1','p2','p3','p4','p5'])

# remove empty phrases
new_df = new_df[new_df['value'].notnull()]
new_df.shape

(4244, 7)

#### remove stop words

In [6]:
import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Build the lookup set once; testing membership against the list would rescan
# it for every word of every phrase. `stop_words` itself is left unchanged.
stop_word_set = set(stop_words)

# Split each phrase on whitespace and drop the stop words (case-sensitive match)
new_df['cleaned phrase'] = new_df.value.apply(
    lambda phrase: [word for word in phrase.split() if word not in stop_word_set]
)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Get vectors from the pre-trained Google word2vec model ([downloaded manually](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing))

In [7]:
# Load the pre-trained vectors from the gzipped binary word2vec file
w2v_path = '../Data/GoogleNews-vectors-negative300.bin.gz'
model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)

In [8]:
def get_vector(phrase):
    """Average the word vectors of the words in ``phrase``.

    Parameters
    ----------
    phrase : iterable of str
        Words to look up in the module-level ``model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the words found in ``model``.
        When none of the words is found, an empty array of shape ``(0,)``
        is returned so callers can filter such phrases out by length.
    """
    # `word in model` rather than `word in model.vocab`: the `vocab`
    # attribute was removed in gensim 4, while the containment test is
    # supported by KeyedVectors in both gensim 3.x and 4.x.
    known_vectors = np.array([model[word] for word in phrase if word in model])

    if known_vectors.shape[0] > 0:
        return np.mean(known_vectors, axis=0)

    return known_vectors
# Attach the averaged word vector of every cleaned phrase
new_df['vector'] = new_df['cleaned phrase'].apply(get_vector)

new_df.tail()

Unnamed: 0,Webstorm,Department,Number of Views,Count of Comments,Internal_External,variable,value,cleaned phrase,vector
4359,Volunteer Challenge,CSFDF,25,3,Internal,p5,blood type,"[blood, type]","[-0.1854248, 0.115478516, 0.08956909, 0.073974..."
4360,Volunteer Challenge,CSFDFSP,17,0,Internal,p5,our volunteers,[volunteers],"[0.16015625, 0.021972656, -0.114746094, 0.2060..."
4361,Volunteer Challenge,ITDIS,35,0,Internal,p5,create teams that,"[create, teams]","[-0.16186523, 0.105529785, 0.03894043, 0.06335..."
4362,Volunteer Challenge,ICDAIBU,28,1,Internal,p5,activity,[activity],"[0.068359375, -0.079589844, 0.12988281, 0.2832..."
4363,Volunteer Challenge,ITDISCS,27,1,Internal,p5,free imf t,"[free, imf]","[0.052856445, -0.060913086, 0.050598145, 0.194..."


#### save meta data and vector data to tsv files for google embedding projector

In [9]:
# First three characters of the department value; anything that is not a
# string (e.g. a missing value) is passed through unchanged. isinstance is
# used because a "type is float" test misses None and NumPy floats.
new_df['Department_Short'] = new_df['Department'].apply(
    lambda dept: dept[:3] if isinstance(dept, str) else dept
)

# filter out non-matching phrases: get_vector returns an empty array when no
# word was found, so only full-length vectors are kept. The expected length is
# read from the loaded model instead of being hard-coded.
new_df = new_df[new_df.vector.apply(len) == model.vector_size]

new_df.shape

(4218, 10)

In [10]:
def save_vector(v_list, output_path, vector_file = "ilab_w2v_vectordata.tsv"):
    """Write vectors to a UTF-8 tab-separated file, one vector per line.

    Parameters
    ----------
    v_list : iterable
        Vectors to write; each may be a NumPy array or a plain sequence of
        numbers.
    output_path : str
        Directory the file is created in.
    vector_file : str, optional
        Name of the file inside ``output_path``.

    Returns
    -------
    None
    """
    # newline='\n' disables newline translation, so the bytes on disk are the
    # same on every platform.
    with open(os.path.join(output_path, vector_file), 'w', encoding='utf-8', newline='\n') as file_vector:
        for v in v_list:
            # asarray(...).tolist() turns NumPy scalars into built-in numbers
            # before str(), and lets plain lists/tuples through as well.
            components = np.asarray(v).tolist()
            file_vector.write('\t'.join(map(str, components)) + '\n')

    return None

# Export the phrase vectors, one line per row of new_df
save_vector(new_df['vector'].tolist(), output_path)

In [11]:
def save_meta(metadata, output_path, meta_file = "ilab_w2v_metadata.tsv"):
    """Write a DataFrame to a UTF-8 tab-separated file with a header line.

    The first line holds the column names; each following line holds one row
    of ``metadata`` with every value converted by ``str``.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Frame to export.
    output_path : str
        Directory the file is created in.
    meta_file : str, optional
        Name of the file inside ``output_path``.

    Returns
    -------
    None
    """
    # A literal tab or line break inside a value would shift columns or split
    # the row, so those characters are replaced by a space.
    unsafe_chars = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})

    def as_field(value):
        return str(value).translate(unsafe_chars)

    # newline='\n' disables newline translation, so the bytes on disk are the
    # same on every platform.
    with open(os.path.join(output_path, meta_file), 'w', encoding='utf-8', newline='\n') as file_metadata:
        file_metadata.write('\t'.join(map(as_field, metadata.columns.tolist())) + '\n')

        # itertuples keeps each column's own dtype; iterrows builds one Series
        # per row and would upcast integers to floats in mixed numeric frames.
        for row in metadata.itertuples(index=False, name=None):
            file_metadata.write('\t'.join(map(as_field, row)) + '\n')

    return None

# Export the label columns for the same rows that were written by save_vector
projector_meta_columns = ['cleaned phrase','Department_Short','Webstorm']
save_meta(metadata=new_df[projector_meta_columns], output_path=output_path)

In [12]:
# List the column names of a wider metadata selection
wider_meta_columns = [
    'cleaned phrase',
    'Department_Short',
    'Webstorm',
    'Number of Views',
    'Count of Comments',
    'Internal_External',
]
new_df[wider_meta_columns].columns.tolist()

['cleaned phrase',
 'Department_Short',
 'Webstorm',
 'Number of Views',
 'Count of Comments',
 'Internal_External']