In [1]:
%load_ext sql

import os
import pickle

import numpy as np
import pandas as pd
import pandas.io.sql as sqlio
import psycopg2
from nltk.corpus import stopwords
from scipy import sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold

# Open the PostgreSQL connection used by the query cells below.
# Each setting falls back to the local development value, but can be overridden
# through the standard libpq environment variables (PGHOST, PGPORT, PGDATABASE,
# PGUSER, PGPASSWORD) so that real credentials never have to be written into
# the notebook itself.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    port=int(os.environ.get("PGPORT", "5432")),
    dbname=os.environ.get("PGDATABASE", "thegoldtree"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
)

In [2]:
# Read the text and id columns of every row of the `relationship` table into a
# DataFrame. `docs` is cleared first so a failed query cannot leave a stale
# frame from an earlier run bound to the name.
sql = "select title, abstract, id_advisor, id, id_author from relationship;"
docs = None
docs = sqlio.read_sql_query(sql=sql, con=conn)

In [3]:
# Replace every non-string value in the two text columns with an empty string
# so they can be concatenated safely. (The title/abstract concatenation itself
# is done inline where the vectorizer is fitted, not stored as a column.)
for text_column in ('title', 'abstract'):
    docs[text_column] = docs[text_column].apply(
        lambda value: value if isinstance(value, str) else ''
    )

In [4]:
# Fit a TF-IDF model on "title abstract" for every document.
# Settings: IDF weighting without smoothing, L1-normalised rows, and NLTK's
# Portuguese stop-word list removed from the vocabulary.
portuguese_stop_words = stopwords.words('portuguese')
vectorizer = TfidfVectorizer(
    stop_words=portuguese_stop_words,
    norm='l1',
    use_idf=True,
    smooth_idf=False,
)

# `response` is the fitted document-term matrix, one row per row of `docs`.
title_and_abstract = docs['title'] + ' ' + docs['abstract']
response = vectorizer.fit_transform(title_and_abstract)

In [5]:
# Load the complete `researcher` table; the cells below match its `id` column
# against `docs.id_author`.
sql = "select * from researcher;"
researcher = sqlio.read_sql_query(sql=sql, con=conn)

In [6]:
# Assemble the per-document dataset. Researchers are visited in table order
# and, for each one, every document whose `id_author` equals the researcher id
# contributes one sample:
#   X_doc_vect_representation     - the document's TF-IDF row from `response`
#   y_doc_vect_representation[0]  - the document's own `id_advisor`
#   y_doc_vect_representation[1]  - the unique advisors over all documents of
#                                   the same author
# Entry k of each container describes the same sample.
#
# Row indices and labels are gathered in plain lists and the matrix/array are
# built once at the end; stacking/appending inside the loop copies everything
# accumulated so far on every iteration (quadratic time).

# Minimum length of the advisor-set list. Unused trailing slots stay None,
# which keeps the layout this cell has always produced; the list now grows
# past this size instead of raising IndexError on a larger corpus.
MIN_ADVISOR_SET_SLOTS = 800000

selected_doc_rows = []   # row labels of `docs` (== row positions in `response`)
primary_advisors = []    # id_advisor of each selected document
advisor_sets = []        # unique advisors of the author of each selected document

for researcher_id in researcher['id']:
    # One boolean filter per researcher, reused for everything below.
    authored_docs = docs[docs.id_author == researcher_id]
    if authored_docs.empty:
        continue

    researcher_all_advisors = authored_docs.id_advisor.values
    unique_advisors = np.unique(researcher_all_advisors)

    # Assumes `docs` keeps the default unique 0..n-1 index from the SQL load,
    # so an index label is also the matching row number of `response`.
    selected_doc_rows.extend(authored_docs.index)
    primary_advisors.extend(researcher_all_advisors)
    # Every document of this author references the same unique-advisor array.
    advisor_sets.extend([unique_advisors] * len(authored_docs))

# Number of samples collected (same meaning as the running counter had).
aux_count = len(selected_doc_rows)

unused_slots = max(MIN_ADVISOR_SET_SLOTS - aux_count, 0)

if selected_doc_rows:
    X_doc_vect_representation = response[np.asarray(selected_doc_rows)]
    y_doc_vect_representation = [
        np.array(primary_advisors),
        advisor_sets + [None] * unused_slots,
    ]
else:
    # No researcher has any document: keep the "nothing built" placeholders.
    X_doc_vect_representation = None
    y_doc_vect_representation = [None, [None] * unused_slots]

In [7]:
# Persist the full feature matrix, the label containers and the fitted
# vectorizer to the working directory. Each file is opened in a `with` block
# so the handle is flushed and closed as soon as the dump finishes, instead of
# being left open until garbage collection.
for artifact, file_name in (
    (X_doc_vect_representation, "X_doc_vect_representation"),
    (y_doc_vect_representation, "y_doc_vect_representation"),
    (vectorizer, "vectorizer"),
):
    with open(file_name, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

In [8]:
# Split the dataset into 10 shuffled folds (fixed seed, so the split is
# reproducible) and pickle each fold's train/test parts as
#   fold<k>_train_X_doc_vect_representation
#   fold<k>_train_y_doc_vect_representation   (per-document advisor only)
#   fold<k>_test_X_doc_vect_representation
#   fold<k>_test_y_doc_vect_representation    ([advisor, unique-advisor sets])
# with k running from 1 to 10.


def _dump_pickle(obj, file_name):
    """Pickle ``obj`` to ``file_name`` and close the file when done."""
    with open(file_name, "wb") as handle:
        pickle.dump(obj, handle)


kf = KFold(n_splits=10, random_state=245, shuffle=True)

# Turn the advisor-set container into a 1-D object array so it can be indexed
# with the fold index arrays. It is filled element by element because its
# entries are arrays of differing lengths (and possibly None): letting
# np.array() infer the shape is an error for such ragged input on NumPy >= 1.24
# and would silently yield a 2-D array if all entries had equal length.
advisor_sets = y_doc_vect_representation[1]
advisor_sets_array = np.empty(len(advisor_sets), dtype=object)
for position, advisor_set in enumerate(advisor_sets):
    advisor_sets_array[position] = advisor_set
y_doc_vect_representation[1] = advisor_sets_array

for aux_count, (train_index, test_index) in enumerate(kf.split(X_doc_vect_representation), start=1):

    train_X_doc_vect_representation = X_doc_vect_representation[train_index]
    test_X_doc_vect_representation = X_doc_vect_representation[test_index]

    train_y_doc_vect_representation_0 = y_doc_vect_representation[0][train_index]
    test_y_doc_vect_representation_0 = y_doc_vect_representation[0][test_index]

    train_y_doc_vect_representation_1 = y_doc_vect_representation[1][train_index]
    test_y_doc_vect_representation_1 = y_doc_vect_representation[1][test_index]

    fold_prefix = "fold" + str(aux_count)

    _dump_pickle(train_X_doc_vect_representation, fold_prefix + "_train_X_doc_vect_representation")
    _dump_pickle(train_y_doc_vect_representation_0, fold_prefix + "_train_y_doc_vect_representation")

    _dump_pickle(test_X_doc_vect_representation, fold_prefix + "_test_X_doc_vect_representation")
    _dump_pickle(
        [test_y_doc_vect_representation_0, test_y_doc_vect_representation_1],
        fold_prefix + "_test_y_doc_vect_representation",
    )