In [1]:
import os
from string import punctuation

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords

Slow version of gensim.models.doc2vec is being used


### String preprocessing functions

In [2]:
def name_to_list(name):
    """Split a name into lower-cased, stripped tokens.

    Args:
        name: The raw name. Anything that is not a ``str`` (for example a
            float NaN coming from a missing cell) is converted with ``str()``.

    Returns:
        A list of tokens obtained by splitting on single space characters.
        Leading, trailing or repeated spaces therefore yield empty-string
        tokens, exactly as ``str.split(" ")`` does.
    """
    # Explicit type check instead of a bare ``except`` around ``name.strip()``,
    # which also swallowed unrelated errors such as KeyboardInterrupt.
    if not isinstance(name, str):
        name = str(name)
    return [token.strip().lower() for token in name.split(" ")]

def remove_symbols(word_list):
    """Replace every punctuation character in each word with a space.

    The list is modified in place and the same list object is returned.
    """
    # Translation table mapping each character of string.punctuation to " ".
    to_space = str.maketrans(punctuation, " " * len(punctuation))
    for position, word in enumerate(word_list):
        cleaned = word.translate(to_space)
        if cleaned != word:
            word_list[position] = cleaned
    return word_list

def remove_stop_words(word_list):
    """Return the words of ``word_list`` that are not Spanish stop words.

    The stop-word list comes from ``nltk.corpus.stopwords``. The input list
    is not modified; a new list is returned with the original order kept.
    """
    # A frozenset gives constant-time membership tests instead of scanning
    # the whole stop-word list for every word.
    spanish_stop_words = frozenset(stopwords.words('spanish'))
    return [word for word in word_list if word not in spanish_stop_words]
    
def preprocess_name(name):
    """Run the full cleaning pipeline on a raw name.

    Steps, in order: tokenise with ``name_to_list``, blank out punctuation
    with ``remove_symbols``, then drop stop words with ``remove_stop_words``.
    Returns the resulting list of tokens.
    """
    tokens = name_to_list(name)
    tokens = remove_symbols(tokens)
    return remove_stop_words(tokens)
    
def perfect_split(name):
    """Split ``name`` on single spaces and drop the empty pieces.

    Each piece is stripped of surrounding whitespace before the emptiness
    check, so pieces made only of whitespace are discarded as well.
    """
    pieces = (chunk.strip() for chunk in name.split(" "))
    return [piece for piece in pieces if piece != ""]

### Word2Vec functions

In [3]:
def get_average_vec(words, model):
    """Average the vectors of the words that ``model`` knows.

    Args:
        words: Iterable of tokens. Tokens equal to a single space are skipped.
        model: Any object supporting ``model[word]`` that raises ``KeyError``
            for unknown words.

    Returns:
        The element-wise mean of the found vectors, or ``None`` when no
        vector could be looked up (empty input, only skipped tokens, or only
        unknown words).
    """
    vectors = []
    for word in words:
        if word == " ":
            continue
        try:
            vectors.append(model[word])
        except KeyError:
            print("Error: {}".format(word))
    # Decide on the collected vectors rather than on an error counter: the
    # counter never matched len(words) when " " tokens were skipped or when
    # the input was empty, and np.mean of an empty array then produced NaN.
    if not vectors:
        return None
    return np.mean(np.array(vectors), axis=0)

def create_vector_data(df,model,column,output):
    """Write one averaged word vector per row of ``df`` to a CSV file.

    Args:
        df: DataFrame with an ``'ID'`` column and a ``column`` holding, per
            row, the iterable of words to average.
        model: Word-vector model exposing ``vector_size`` and ``model[word]``.
        column: Name of the column that holds the words.
        output: Path (or buffer) passed to ``DataFrame.to_csv``.

    The CSV has an ``id`` column followed by ``vector_size`` columns named
    ``"0"``, ``"1"``, ... Rows for which ``get_average_vec`` returns ``None``
    keep an all-zero vector.
    """
    result = np.zeros((df.shape[0], model.vector_size + 1))
    result[:,0] = df['ID'].values
    # Iterate the column by position; ``df[column][i]`` is a label lookup and
    # breaks (or picks the wrong row) when the index is not 0..n-1.
    for position, words in enumerate(df[column]):
        vector = get_average_vec(words, model)
        if vector is not None:
            result[position,1:] = vector
    print(result)
    header = ['id'] + [str(i) for i in range(model.vector_size)]
    pd.DataFrame(result).to_csv(output, index=False, header=header, float_format="%11.17f")
    
def name_similarity(model, n_first, n_second):
    """Similarity between two word lists, ignoring out-of-vocabulary words.

    Args:
        model: Word-vector model supporting ``word in model`` and
            ``model.n_similarity(list, list)``.
        n_first: First list of words (not modified).
        n_second: Second list of words (not modified).

    Returns:
        The value of ``model.n_similarity`` on the known words, or ``0``
        (after printing both original lists) when either side has no known
        word left.
    """
    # Filter unknown words up front instead of parsing the offending word out
    # of the KeyError message, which depends on the exact wording of that
    # message and retried the similarity call once per unknown word.
    known_first = [word for word in n_first if word in model]
    known_second = [word for word in n_second if word in model]
    if not known_first or not known_second:
        print(n_first,n_second)
        return 0
    try:
        return model.n_similarity(known_first, known_second)
    except ZeroDivisionError:
        # Kept from the original as a safety net for degenerate inputs.
        print(n_first,n_second)
        return 0


###  Loading Word2Vec

In [4]:
# Load the word vectors stored at the relative path 'trained_vectors'.
# `model` is the object passed to the similarity helpers in later cells.
model = KeyedVectors.load('trained_vectors')

### Load data

In [5]:
# Read the degree catalogue, keep only the columns used later and normalise
# the degree name ('Nombre') with preprocess_name, re-joined with spaces.
degrees = pd.read_csv('carreras.tsv', sep='\t')
# .copy() makes the column subset an independent frame, so the assignment
# below cannot trigger SettingWithCopyWarning.
degrees = degrees[['ID','Nombre','ID Universidad','Área']].copy()
# Only one column is needed, so map over it instead of a row-wise apply.
degrees['Nombre'] = degrees['Nombre'].map(lambda raw_name: " ".join(preprocess_name(raw_name)))

In [7]:
# Spot check: similarity between the cleaned names of the degrees stored
# under index labels 2617 and 29 (hard-coded example rows).
# NOTE(review): the commented-out call below reads a 'title' column, which is
# not among the columns kept in `degrees` — confirm before re-enabling it.
#get_average_vec(degrees['title'][0].split(" "),model)
model.n_similarity(degrees['Nombre'][2617].split(" "),degrees['Nombre'][29].split(" "))

0.43407777458920094

### Join the tuni and peseu tables

In [6]:
# Read the peseu table, keep the columns used later and normalise the degree
# name ('carrera') with preprocess_name, re-joined with spaces.
peseu_data = pd.read_csv("peseu_data.csv")
# .copy() makes the column subset an independent frame, so adding columns
# below cannot trigger SettingWithCopyWarning.
peseu_data = peseu_data[['id','mat','len','cie','his','carrera','universidad']].copy()
peseu_data['carrera'] = peseu_data['carrera'].map(lambda raw_name: " ".join(preprocess_name(raw_name)))
university_data = pd.read_csv("universidades.tsv", sep='\t')

# Placeholder columns that later cells fill in.
row_count = peseu_data.shape[0]
peseu_data['university_id'] = pd.Series(np.zeros(row_count, dtype='int32'), index=peseu_data.index)
peseu_data['tuni_carreer_id'] = pd.Series(np.zeros(row_count, dtype='int32'), index=peseu_data.index)
# These two columns receive text later, so they start as object columns of
# empty strings rather than float64 zeros (writing a string into a float
# column is an incompatible-dtype assignment in recent pandas versions).
peseu_data['tuni_carreer_name'] = pd.Series("", index=peseu_data.index, dtype=object)
peseu_data['tuni_area'] = pd.Series("", index=peseu_data.index, dtype=object)

In [7]:
# Copy each university's numeric ID onto the peseu rows whose 'universidad'
# value equals that university's 'Sigla'. Rows that match no Sigla keep the
# value they already had.
for sigla, university_id in zip(university_data['Sigla'], university_data['ID']):
    # Assign the scalar directly: building a full-length Series per
    # university was wasteful and only worked through index alignment.
    peseu_data.loc[peseu_data['universidad'] == sigla, 'university_id'] = university_id


In [8]:
# For every peseu row, find the degree of the same university whose name is
# most similar to the row's 'carrera' and store its ID, name and area.
# id_table caches the outcome per (carrera, universidad) pair so repeated
# pairs are not scored again; a cached None means "no candidate degrees".
id_table = {}
total_rows = peseu_data.shape[0]
for i in range(total_rows):
    key = (peseu_data.loc[i,'carrera'], peseu_data.loc[i,'universidad'])
    if key not in id_table:
        u_carreers = degrees[degrees['ID Universidad'] == peseu_data.loc[i,"university_id"]]
        if u_carreers.empty:
            # No degree is registered for this university. Report the row and
            # remember the miss; previously execution continued with an
            # undefined (or stale, from an earlier row) `most_similar`.
            print(peseu_data.loc[i,:])
            id_table[key] = None
        else:
            split_name = perfect_split(key[0])
            most_similar = None
            best_similarity = None
            # Single pass over the candidates; the first one seeds the best
            # match and later ones replace it only on a strictly higher score.
            for _, candidate in u_carreers.iterrows():
                similarity = name_similarity(model,split_name,perfect_split(candidate['Nombre']))
                if most_similar is None or similarity > best_similarity:
                    most_similar = candidate
                    best_similarity = similarity
            id_table[key] = (most_similar['ID'],most_similar['Nombre'],most_similar['Área'])

    match = id_table[key]
    if match is not None:
        peseu_data.loc[i,'tuni_carreer_id'] = match[0]
        peseu_data.loc[i,'tuni_carreer_name'] = match[1]
        peseu_data.loc[i,'tuni_area'] = match[2]
    # Rows without a match keep their initial placeholder values.
    if i % 100 == 0:
        print("progress: {0:.5f}".format(i/total_rows))

progress: 0.00000
progress: 0.00083
progress: 0.00167
progress: 0.00250
progress: 0.00334
progress: 0.00417
progress: 0.00500
progress: 0.00584
progress: 0.00667
progress: 0.00751
progress: 0.00834
progress: 0.00918
progress: 0.01001
progress: 0.01084
progress: 0.01168
progress: 0.01251
progress: 0.01335
progress: 0.01418
progress: 0.01501
progress: 0.01585
progress: 0.01668
progress: 0.01752
progress: 0.01835
progress: 0.01919
progress: 0.02002
progress: 0.02085
progress: 0.02169
progress: 0.02252
progress: 0.02336
progress: 0.02419
progress: 0.02502
progress: 0.02586
progress: 0.02669
progress: 0.02753
progress: 0.02836
progress: 0.02919
progress: 0.03003
progress: 0.03086
progress: 0.03170
progress: 0.03253
progress: 0.03337
progress: 0.03420
progress: 0.03503
progress: 0.03587
progress: 0.03670
progress: 0.03754
progress: 0.03837
progress: 0.03920
progress: 0.04004
progress: 0.04087
progress: 0.04171
progress: 0.04254
progress: 0.04338
progress: 0.04421
progress: 0.04504
progress: 

progress: 0.38037
progress: 0.38120
progress: 0.38204
progress: 0.38287
progress: 0.38370
progress: 0.38454
progress: 0.38537
progress: 0.38621
progress: 0.38704
progress: 0.38787
progress: 0.38871
progress: 0.38954
progress: 0.39038
progress: 0.39121
progress: 0.39205
progress: 0.39288
progress: 0.39371
progress: 0.39455
progress: 0.39538
progress: 0.39622
progress: 0.39705
progress: 0.39788
progress: 0.39872
progress: 0.39955
progress: 0.40039
progress: 0.40122
progress: 0.40206
progress: 0.40289
progress: 0.40372
progress: 0.40456
progress: 0.40539
progress: 0.40623
progress: 0.40706
progress: 0.40789
progress: 0.40873
progress: 0.40956
progress: 0.41040
progress: 0.41123
progress: 0.41206
progress: 0.41290
progress: 0.41373
progress: 0.41457
progress: 0.41540
progress: 0.41624
progress: 0.41707
progress: 0.41790
progress: 0.41874
progress: 0.41957
progress: 0.42041
progress: 0.42124
progress: 0.42207
progress: 0.42291
progress: 0.42374
progress: 0.42458
progress: 0.42541
progress: 

progress: 0.76074
progress: 0.76157
progress: 0.76240
progress: 0.76324
progress: 0.76407
progress: 0.76491
progress: 0.76574
progress: 0.76657
progress: 0.76741
progress: 0.76824
progress: 0.76908
progress: 0.76991
progress: 0.77075
progress: 0.77158
progress: 0.77241
progress: 0.77325
progress: 0.77408
progress: 0.77492
progress: 0.77575
progress: 0.77658
progress: 0.77742
progress: 0.77825
progress: 0.77909
progress: 0.77992
progress: 0.78075
progress: 0.78159
progress: 0.78242
progress: 0.78326
progress: 0.78409
progress: 0.78493
progress: 0.78576
progress: 0.78659
progress: 0.78743
progress: 0.78826
progress: 0.78910
progress: 0.78993
progress: 0.79076
progress: 0.79160
progress: 0.79243
progress: 0.79327
progress: 0.79410
progress: 0.79494
progress: 0.79577
progress: 0.79660
progress: 0.79744
progress: 0.79827
progress: 0.79911
progress: 0.79994
progress: 0.80077
progress: 0.80161
progress: 0.80244
progress: 0.80328
progress: 0.80411
progress: 0.80494
progress: 0.80578
progress: 

In [24]:
# Save the joined table to 'joined_data.csv' with the column names as the
# header row and without the index column.
peseu_data.to_csv('joined_data.csv', header=list(peseu_data.columns), index=False)

In [22]:
# Collect the distinct values of the 'Área' column of `degrees`.
areas = degrees['Área'].unique()
# Drop the last distinct value.
# NOTE(review): presumably the last entry is a missing/NaN area; this depends
# on the order in which values first appear in the data — confirm.
areas = np.delete(areas,[areas.shape[0]-1])
# Display the remaining areas as the cell output.
areas

array(['Arte y Arquitectura', 'Ciencias Básicas', 'Agropecuaria',
       'Tecnología', 'Sin área definida', 'Ciencias Sociales', 'Salud',
       'Derecho', 'Administración y comercio', 'Educación', 'Humanidades'], dtype=object)

In [23]:
# Write one CSV per area under 'area_data/', containing the peseu rows whose
# matched area ('tuni_area') equals that area. Spaces in the area name are
# replaced by underscores in the file name.
# Create the output directory first; to_csv does not create missing folders,
# so the loop failed on a fresh checkout.
os.makedirs('area_data', exist_ok=True)
for area in areas:
    split_df = peseu_data[peseu_data['tuni_area'] == area]
    split_df.to_csv('area_data/{}_data.csv'.format(area.replace(" ","_")), index=False, header=split_df.columns.values)