In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the two pickled objects this notebook works from (paths are relative to
# the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load pickles that
# come from a trusted source.
df = pd.read_pickle('../data/corpus.pkl')
# NOTE(review): `data` is not referenced by any cell visible here -- confirm it is still needed.
data = pd.read_pickle('../data/data_df.pkl')

In [None]:
# Split each word's space-delimited syllable string into a list of syllables.
# Only plain strings are split, so re-running this cell leaves already-split
# lists untouched; `.str.split` on a column of lists would replace every value
# with NaN and break the later len() call on the syllables.
df['syllables'] = df['syllables'].map(
    lambda syl: syl.split(' ') if isinstance(syl, str) else syl
)

In [None]:
# Peek at a single row: .loc slices by label and includes both endpoints,
# so 1:1 selects just the row labelled 1 (returned as a one-row frame).
df.loc[1:1]

In [5]:
def get_features(word):
    """Compute the length-based features for one word record.

    Parameters
    ----------
    word : sequence
        Indexable record whose item 0 and item 2 both support ``len()``
        (the notebook passes ``(word, pos, syllables)`` tuples).

    Returns
    -------
    dict
        ``{'characters': len(word[0]), 'syllable_count': len(word[2])}``.
    """
    form, syllables = word[0], word[2]
    return dict(characters=len(form), syllable_count=len(syllables))

def get_targets(word):
    """Explode the tag in ``word[1]`` into one entry per position.

    Parameters
    ----------
    word : sequence
        Indexable record whose item 1 is an indexable tag with at least
        nine elements (e.g. a nine-character string such as ``'n-s---fn-'``).

    Returns
    -------
    dict
        Keys ``'pos1'`` .. ``'pos9'`` (in that order), each mapped to the
        element of the tag at the corresponding 0-based position.

    Raises
    ------
    IndexError
        If the tag has fewer than nine elements.
    """
    return {
        'pos{}'.format(slot): word[1][slot - 1]
        for slot in range(1, 10)
    }

def make_array(to_arrayify):
    """Stack an iterable of array-likes into one float64 ndarray.

    Each item is converted to an ndarray and cast to ``float64`` on its own;
    the converted items are then stacked with ``np.array``.

    Parameters
    ----------
    to_arrayify : iterable
        Items that ``np.array`` can convert and that cast cleanly to float64.

    Returns
    -------
    numpy.ndarray
        The stacked array (one leading entry per input item).
    """
    converted = []
    for item in to_arrayify:
        converted.append(np.array(item).astype('float64'))
    return np.array(converted)
        

In [6]:
# Build one (word, pos, syllables) tuple per row of df.
# Zipping the columns walks the rows by position, so the result is correct for
# any index (a lookup such as df.word[i] is label-based and only lines up with
# row order on a default 0..n-1 index) and avoids a per-row label lookup.
words = list(zip(df['word'], df['pos'], df['syllables']))

In [7]:
# One feature dict per word tuple, in the same order as `words`.
features = list(map(get_features, words))

In [8]:
from sklearn.feature_extraction import DictVectorizer

In [9]:
# sparse=False makes the vectorizer return a dense ndarray, which is what the
# later np.concatenate call with the embedding arrays works on.
feature_vectorizer = DictVectorizer(sparse=False)

# NOTE: `features` is rebound here from a list of dicts to the vectorized array.
features = feature_vectorizer.fit_transform(features)


In [10]:
# Convert the per-row vectors stored in df into stacked float64 arrays.
# NOTE(review): presumably every row holds a vector of the same length, so each
# result is 2-D -- confirm against the data.
word_vecs = make_array(df['word_vec'].values)

syllable_vecs = make_array(df['syllable_vec'].values)

In [11]:
# Assemble the design matrix column-wise: word vectors, then syllable vectors,
# then the vectorized hand-crafted features (which therefore sit in the last
# columns of X).
X = np.concatenate((word_vecs, syllable_vecs, features), axis=1)

In [12]:
# One per-position target dict per word tuple, in the same order as `words`.
targets = list(map(get_targets, words))

In [13]:
# The target dicts hold string values, which DictVectorizer one-hot encodes:
# each (key, value) pair gets its own 'key=value' column (e.g. 'pos1=n').
target_vectorizer = DictVectorizer(sparse=False)

# Dense one-hot target matrix, one row per word.
y = target_vectorizer.fit_transform(targets)

In [14]:
# Round-trip check for feature_vectorizer: the two hand-crafted feature
# columns were concatenated last into X, so the tail of the first row should
# map back to that word's feature dict.

feature_vectorizer.inverse_transform(X[0, -2:].reshape(1, -1))

[{'characters': 7.0, 'syllable_count': 3.0}]

In [15]:
# Round-trip check for target_vectorizer: map the first one-hot target row
# back to its {'posN=<tag char>': 1.0} dictionary.
# inverse_transform takes a 2-D (n_samples, n_features) array, so the single
# row is reshaped to (1, n_features).
# NOTE: this expects `y` to be the one-hot matrix from target_vectorizer; a
# later cell rebinds `y` to label-encoded ids.

target_vectorizer.inverse_transform(y[0].reshape((1, -1)))

[{'pos1=n': 1.0,
  'pos2=-': 1.0,
  'pos3=s': 1.0,
  'pos4=-': 1.0,
  'pos5=-': 1.0,
  'pos6=-': 1.0,
  'pos7=f': 1.0,
  'pos8=n': 1.0,
  'pos9=-': 1.0}]

In [78]:
from sklearn.preprocessing import LabelEncoder

In [88]:
# Alternative target encoding: a LabelEncoder assigns one integer id per
# distinct label (it is fitted on the full tag strings in a following cell).
target_encoder = LabelEncoder()

In [89]:
# Encode each row's full tag string from df.pos as a single integer class id.
# NOTE(review): this rebinds `y`, replacing the one-hot matrix produced by
# target_vectorizer earlier; cells written for the one-hot form will not behave
# the same if re-run after this one. Consider a distinct name.
y = target_encoder.fit_transform(df.pos.values)

In [91]:
# Encoded class id of the row at position 1.
y[1]

550

In [96]:
# Decode the class id at position 1 back to its tag string; the scalar is
# wrapped into a one-element 1-D array because inverse_transform expects an array.
target_encoder.inverse_transform(np.atleast_1d(y[1]))

array(['v--pna---'], dtype=object)