In [6]:
import numpy as np
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
import pandas as pd
from string import punctuation

### String preprocessing functions

In [7]:
def name_to_list(name):
    """Split a name on single spaces into stripped, lower-cased tokens.

    Args:
        name: The text to tokenize. Anything that is not a ``str`` (for
            example a number or ``None``) is converted with ``str`` first.

    Returns:
        list[str]: One token per space-separated piece. Consecutive spaces
        yield empty-string tokens, which are kept.
    """
    # Explicit type check instead of a bare ``except`` around ``name.strip()``.
    if not isinstance(name, str):
        name = str(name)
    return [word.strip().lower() for word in name.split(" ")]

def remove_symbols(word_list):
    """Replace every ``string.punctuation`` character in each token with a space.

    The list is modified in place and the same list object is returned.
    """
    # One translation table maps each punctuation character to a single space.
    to_space = str.maketrans(punctuation, " " * len(punctuation))
    for position, token in enumerate(word_list):
        word_list[position] = token.translate(to_space)
    return word_list

def remove_stop_words(word_list):
    """Return the tokens of ``word_list`` that are not Spanish stop words.

    Args:
        word_list: Iterable of string tokens.

    Returns:
        list: The tokens, in their original order, that do not appear in
        NLTK's ``'spanish'`` stop-word list.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    spanish_stop_words = set(stopwords.words('spanish'))
    return [word for word in word_list if word not in spanish_stop_words]
    
def preprocess_name(name):
    """Tokenize ``name``, blank out punctuation, then drop Spanish stop words.

    Returns the surviving tokens as a list.
    """
    tokens = name_to_list(name)
    tokens = remove_symbols(tokens)
    return remove_stop_words(tokens)
    
def perfect_split(name):
    """Split ``name`` on spaces, strip each piece and discard empty pieces."""
    pieces = []
    for raw_piece in name.split(" "):
        piece = raw_piece.strip()
        if piece != "":
            pieces.append(piece)
    return pieces

### Word2Vec functions

In [8]:
def get_average_vec(words, model):
    """Average the vectors of the words that ``model`` knows.

    Args:
        words: Iterable of tokens. Tokens equal to a single space are skipped.
        model: Mapping-like object where ``model[word]`` returns a vector and
            raises ``KeyError`` for unknown words.

    Returns:
        The element-wise mean of the collected vectors, or ``None`` when no
        vector could be collected (empty input, only ``" "`` tokens, or every
        word unknown).
    """
    vectors = []
    for word in words:
        if word == " ":
            continue
        try:
            vectors.append(model[word])
        except KeyError:
            print("Error: {}".format(word))
    # Return None rather than letting np.mean produce NaN (and a warning)
    # on an empty list; callers test the result with ``is not None``.
    if not vectors:
        return None
    return np.mean(np.array(vectors), axis=0)

def create_vector_data(df,model,column,output):
    """Write one averaged word vector per row of ``df`` to a CSV file.

    Args:
        df: DataFrame with an ``'ID'`` column and the column named ``column``.
        model: Word-vector model exposing ``vector_size``; it is passed on to
            ``get_average_vec``.
        column: Name of the column whose values are passed to
            ``get_average_vec`` as the word collection.
        output: Destination handed to ``DataFrame.to_csv``.

    The output has an ``id`` column followed by ``vector_size`` columns named
    ``'0'``, ``'1'``, ... Rows for which no vector is available keep zeros.
    """
    n_rows = df.shape[0]
    result = np.zeros((n_rows, model.vector_size + 1))
    result[:,0] = df['ID']
    values = df[column]
    for i in range(n_rows):
        # ``i`` is a position into ``result``, so read the matching row by
        # position too; label lookup breaks on a non-default index.
        vector = get_average_vec(values.iloc[i], model)
        if vector is not None:
            result[i,1:] = vector
    print(result)
    header = ['id'] + [str(k) for k in range(model.vector_size)]
    pd.DataFrame(result).to_csv(output, index=False, header=header, float_format="%11.17f")
    
def name_similarity(model, n_first, n_second):
    """Similarity between two token lists, ignoring words the model rejects.

    Calls ``model.n_similarity`` on copies of the inputs. Each time it raises
    ``KeyError`` the offending word is removed from one of the copies and the
    call is retried. The caller's lists are never modified.

    Args:
        model: Object exposing ``n_similarity(list, list)``.
        n_first: First iterable of tokens.
        n_second: Second iterable of tokens.

    Returns:
        The value returned by ``model.n_similarity``, or ``0`` when it raises
        ``ZeroDivisionError`` (in which case both original inputs are printed).

    Raises:
        ValueError: If the word extracted from the ``KeyError`` message is in
            neither list.
    """
    copy_first = list(n_first)
    copy_second = list(n_second)
    while True:
        try:
            return model.n_similarity(copy_first,copy_second)
        except KeyError as err:
            # NOTE(review): assumes the rejected word is the second
            # space-separated token of the message, wrapped in single quotes
            # - confirm against the installed gensim version.
            error = str(err).split(" ")[1].strip("'")
            try:
                copy_first.remove(error)
            except ValueError:
                # Not in the first list, so it must come from the second.
                copy_second.remove(error)
        except ZeroDivisionError:
            print(n_first,n_second)
            return 0


### Loading Word2Vec

In [9]:
# Load the saved KeyedVectors from 'trained_vectors' (a path relative to the
# working directory). Every similarity computation below uses this object.
model = KeyedVectors.load('trained_vectors')

### Load data

In [15]:
# Load the degree catalogue, keep only the columns used below and normalise
# each degree name with preprocess_name (tokens re-joined with single spaces).
degrees = pd.read_csv('../../preprocessed_data/carreras2.csv')
# .copy() makes the column subset an independent frame, so the assignment
# below does not trigger SettingWithCopyWarning.
degrees = degrees[['ID','Nombre','ID Universidad','Área','area_id']].copy()
degrees['Nombre'] = degrees['Nombre'].map(lambda nombre: " ".join(preprocess_name(nombre)))
degrees

Unnamed: 0,ID,Nombre,ID Universidad,Área,area_id
0,1,actuación teatral,1,Arte y Arquitectura,3.0
1,2,artes plásticas,1,Arte y Arquitectura,3.0
2,3,diseño teatral,1,Arte y Arquitectura,3.0
3,4,ingeniería sonido,1,Arte y Arquitectura,3.0
4,5,teoría música,1,Arte y Arquitectura,3.0
5,6,teoría historia arte,1,Arte y Arquitectura,3.0
6,7,programa académico bachillerato,1,Ciencias Básicas,4.0
7,8,arquitectura,1,Arte y Arquitectura,3.0
8,9,diseño,1,Arte y Arquitectura,3.0
9,10,geografía,1,Arte y Arquitectura,3.0


In [19]:
# Pairwise similarity between all degree names. Only pairs with j >= i are
# computed; each value is mirrored across the diagonal.
n_degrees = degrees.shape[0]
# Tokenise every name once up front instead of on every pair evaluation.
degree_tokens = [perfect_split(nombre) for nombre in degrees['Nombre']]
similarity_matrix = np.zeros((n_degrees, n_degrees))
for i in range(n_degrees):
    for j in range(i, n_degrees):
        similarity = name_similarity(model, degree_tokens[i], degree_tokens[j])
        similarity_matrix[i,j] = similarity
        similarity_matrix[j,i] = similarity


TypeError: file must have a 'write' attribute

In [25]:
# Persist the pairwise similarity matrix to disk; np.save appends the '.npy'
# extension because the given name does not end with it.
np.save('../../serialized/balltree/similarity_matrix',similarity_matrix)

In [7]:
# Spot check: similarity between two individual degree names (labels 2617 and 29).
# This calls model.n_similarity directly rather than the name_similarity
# wrapper, so the wrapper's KeyError/ZeroDivisionError handling does not apply.
model.n_similarity(degrees['Nombre'][2617].split(" "),degrees['Nombre'][29].split(" "))

0.43407777458920094

### Join the tuni and peseu tables

In [8]:
# Load the peseu records and the university table, normalise the degree names
# with preprocess_name, and add placeholder columns that later cells fill in.
peseu_data = pd.read_csv("peseu_data2.csv")
# .copy() makes the column subset an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
peseu_data = peseu_data[['id','mat','len','cie','his','nem','carrera','universidad']].copy()
peseu_data['carrera'] = peseu_data['carrera'].map(lambda carrera: " ".join(preprocess_name(carrera)))
university_data = pd.read_csv("universidades.tsv", sep='\t')

# Placeholders, added in the same order as before so the column layout of the
# frame is unchanged. The id columns start as int32 zeros.
peseu_data['university_id'] = np.zeros(len(peseu_data), dtype='int32')
peseu_data['tuni_carreer_id'] = np.zeros(len(peseu_data), dtype='int32')
# The name/area columns receive strings later, so give them object dtype up
# front (still holding 0.0) rather than float64, which pandas warns about when
# strings are assigned into it.
peseu_data['tuni_carreer_name'] = pd.Series(0.0, index=peseu_data.index, dtype=object)
peseu_data['tuni_area'] = pd.Series(0.0, index=peseu_data.index, dtype=object)
peseu_data['tuni_area_id'] = np.zeros(len(peseu_data), dtype='int32')

In [9]:
# Fill university_id: every row whose 'universidad' equals a university's
# 'Sigla' receives that university's 'ID'. Rows with no match keep their
# initial value.
for sigla, university_id in zip(university_data['Sigla'], university_data['ID']):
    # A scalar is broadcast over the selected rows; no need to build a
    # full-length Series and rely on index alignment.
    peseu_data.loc[peseu_data['universidad'] == sigla, 'university_id'] = university_id


In [10]:
# For every peseu row, find the degree of the same university whose name is
# most similar to the row's 'carrera' and copy its id, name, area and area id
# into the row. Results are cached per (carrera, universidad) pair in id_table
# so each distinct pair is matched only once.
id_table = {}
total_rows = peseu_data.shape[0]
matched_columns = ('tuni_carreer_id', 'tuni_carreer_name', 'tuni_area', 'tuni_area_id')
for i in range(total_rows):
    key = (peseu_data.loc[i,'carrera'], peseu_data.loc[i,'universidad'])
    if key not in id_table:
        u_carreers = degrees[degrees['ID Universidad'] == peseu_data.loc[i,'university_id']]
        if u_carreers.empty:
            # No degrees for this university: report the row and remember the
            # miss, rather than carrying on with a stale or undefined match.
            print(peseu_data.loc[i,:])
            id_table[key] = None
        else:
            split_name = perfect_split(key[0])
            most_similar = None
            best_similarity = None
            for j in u_carreers.index:
                similarity = name_similarity(model,split_name,perfect_split(u_carreers.loc[j,'Nombre']))
                # Strict '>' keeps the earliest candidate on ties.
                if most_similar is None or similarity > best_similarity:
                    most_similar = u_carreers.loc[j,:]
                    best_similarity = similarity
            id_table[key] = (most_similar['ID'],most_similar['Nombre'],most_similar['Área'],most_similar['area_id'])
    match = id_table[key]
    if match is not None:
        for column, value in zip(matched_columns, match):
            peseu_data.loc[i,column] = value
    if i % 100 == 0:
        print("progress: {0:.5f}".format(i/total_rows))

progress: 0.00000
progress: 0.00083
progress: 0.00167
progress: 0.00250
progress: 0.00334
progress: 0.00417
progress: 0.00500
progress: 0.00584
progress: 0.00667
progress: 0.00751
progress: 0.00834
progress: 0.00917
progress: 0.01001
progress: 0.01084
progress: 0.01167
progress: 0.01251
progress: 0.01334
progress: 0.01418
progress: 0.01501
progress: 0.01584
progress: 0.01668
progress: 0.01751
progress: 0.01835
progress: 0.01918
progress: 0.02001
progress: 0.02085
progress: 0.02168
progress: 0.02252
progress: 0.02335
progress: 0.02418
progress: 0.02502
progress: 0.02585
progress: 0.02668
progress: 0.02752
progress: 0.02835
progress: 0.02919
progress: 0.03002
progress: 0.03085
progress: 0.03169
progress: 0.03252
progress: 0.03336
progress: 0.03419
progress: 0.03502
progress: 0.03586
progress: 0.03669
progress: 0.03753
progress: 0.03836
progress: 0.03919
progress: 0.04003
progress: 0.04086
progress: 0.04170
progress: 0.04253
progress: 0.04336
progress: 0.04420
progress: 0.04503
progress: 

progress: 0.38026
progress: 0.38109
progress: 0.38193
progress: 0.38276
progress: 0.38360
progress: 0.38443
progress: 0.38526
progress: 0.38610
progress: 0.38693
progress: 0.38776
progress: 0.38860
progress: 0.38943
progress: 0.39027
progress: 0.39110
progress: 0.39193
progress: 0.39277
progress: 0.39360
progress: 0.39444
progress: 0.39527
progress: 0.39610
progress: 0.39694
progress: 0.39777
progress: 0.39861
progress: 0.39944
progress: 0.40027
progress: 0.40111
progress: 0.40194
progress: 0.40278
progress: 0.40361
progress: 0.40444
progress: 0.40528
progress: 0.40611
progress: 0.40694
progress: 0.40778
progress: 0.40861
progress: 0.40945
progress: 0.41028
progress: 0.41111
progress: 0.41195
progress: 0.41278
progress: 0.41362
progress: 0.41445
progress: 0.41528
progress: 0.41612
progress: 0.41695
progress: 0.41779
progress: 0.41862
progress: 0.41945
progress: 0.42029
progress: 0.42112
progress: 0.42196
progress: 0.42279
progress: 0.42362
progress: 0.42446
progress: 0.42529
progress: 

progress: 0.76052
progress: 0.76135
progress: 0.76219
progress: 0.76302
progress: 0.76386
progress: 0.76469
progress: 0.76552
progress: 0.76636
progress: 0.76719
progress: 0.76802
progress: 0.76886
progress: 0.76969
progress: 0.77053
progress: 0.77136
progress: 0.77219
progress: 0.77303
progress: 0.77386
progress: 0.77470
progress: 0.77553
progress: 0.77636
progress: 0.77720
progress: 0.77803
progress: 0.77887
progress: 0.77970
progress: 0.78053
progress: 0.78137
progress: 0.78220
progress: 0.78304
progress: 0.78387
progress: 0.78470
progress: 0.78554
progress: 0.78637
progress: 0.78720
progress: 0.78804
progress: 0.78887
progress: 0.78971
progress: 0.79054
progress: 0.79137
progress: 0.79221
progress: 0.79304
progress: 0.79388
progress: 0.79471
progress: 0.79554
progress: 0.79638
progress: 0.79721
progress: 0.79805
progress: 0.79888
progress: 0.79971
progress: 0.80055
progress: 0.80138
progress: 0.80221
progress: 0.80305
progress: 0.80388
progress: 0.80472
progress: 0.80555
progress: 

In [11]:
# Save the joined table (peseu rows plus the matched tuni columns) without the
# index; the header row is the frame's own column names.
peseu_data.to_csv('joined_data2.csv', index=False, header=peseu_data.columns.values)


In [12]:
# Distinct area ids in order of first appearance, minus the last one.
# NOTE(review): presumably the dropped last value is NaN (degrees without an
# area) - confirm before relying on it.
area_ids = degrees['area_id'].unique()
areas = np.delete(area_ids, -1)
areas

array([  3.,   4.,   2.,  11.,  10.,   5.,   9.,   6.,   1.,   7.,   8.])

In [13]:
# Write one CSV per area id into area_data/, containing the joined rows whose
# matched degree belongs to that area.
for area in areas:
    split_df = peseu_data[peseu_data['tuni_area_id'] == area]
    area_label = str(int(area)).replace(" ","_")
    out_path = 'area_data/{}_data.csv'.format(area_label)
    split_df.to_csv(out_path, index=False, header=split_df.columns.values)