In [3]:
import os

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

from scipy.sparse import coo_matrix, hstack

The training file has a somewhat confusing format, so it is parsed with a custom reader.

In [20]:
def read_train(file_name):
    """Parse the training file into a DataFrame with columns Word, Init, Prop.

    From each usable line the second comma-separated field becomes ``Word``,
    the text before the first ``+`` in the third field becomes ``Init`` and the
    first character after that ``+`` becomes ``Prop``.

    Lines that do not have this shape (too few fields, no ``+``, or nothing
    after the ``+``) are skipped silently, as a best-effort reader.

    Parameters
    ----------
    file_name : str
        Path of the training file.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed line, in file order.
    """
    rows = []
    # The context manager closes the file even if parsing is interrupted.
    with open(file_name) as train_file:
        for line in train_file:
            # Drop the line terminator so an empty tag is seen as missing
            # instead of producing a "\n" part-of-speech label.
            fields = line.rstrip("\r\n").split(",")
            try:
                lemma_and_tag = fields[2].split("+")
                rows.append([fields[1], lemma_and_tag[0], lemma_and_tag[1][0]])
            except IndexError:
                # Malformed line: skip it. Only IndexError can occur here, so
                # unrelated errors (and Ctrl-C) are no longer swallowed.
                continue
    return pd.DataFrame(rows, columns=['Word', 'Init', 'Prop'])

In [21]:
# The training file goes through the custom reader defined above;
# the test file is loaded directly with pandas.
lemmas_train = read_train("data/lemmas_train.csv")
lemmas_test = pd.read_csv("data/lemmas_test.csv")

In [22]:
# Show the last parsed rows as a sanity check of read_train.
lemmas_train.tail()

Unnamed: 0,Word,Init,Prop
118635,posereste,posare,V
118636,cogestiste,cogestire,V
118637,autocorreggerebbero,autocorreggere,V
118638,gorgogliassimo,gorgogliare,V
118639,desecretaste,desecretare,V


In [24]:
# Show the first rows of the test frame.
lemmas_test.head()

Unnamed: 0,Id,X
0,1,gettonan
1,2,incidentali
2,3,involtino
3,4,lievi
4,5,comunistizzasse


## We will determine the part of speech and the initial form separately

### Determine part of speech via ngrams (again)

In [48]:
# Every column of lemmas_train except 'Init', which the part-of-speech model does not use.
needed_cols = lemmas_train.columns.drop(['Init'])

In [49]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split, and therefore the accuracy reported below, reproducible on re-run.
xtrain, xcv = train_test_split(lemmas_train[needed_cols], test_size=0.2, random_state=42)

In [50]:
# Vectorizer that turns each word into binary character n-gram features
# (n-gram lengths 2 to 8, 'char_wb' analyzer, lowercased, max_df=0.84).
transformer = CountVectorizer(ngram_range=(2, 8), analyzer='char_wb', binary=True, lowercase=True, max_df=0.84)

In [51]:
# Learn the n-gram vocabulary from the training words only and vectorize them.
train_matrix = transformer.fit_transform(xtrain['Word'])

In [52]:
%%time
# Fit a default LogisticRegression on the n-gram features with 'Prop' as target,
# then predict for the held-out words using the already fitted vectorizer.
predictor = LogisticRegression().fit(train_matrix, xtrain['Prop'])
predictions = predictor.predict(transformer.transform(xcv['Word']))

CPU times: user 1min 13s, sys: 1.33 s, total: 1min 15s
Wall time: 20 s


In [53]:
# Accuracy of the predicted 'Prop' labels on the held-out split.
accuracy_score(xcv['Prop'], predictions)

0.96750674308833451

Nice score ?

## To determine the initial form we will cut the ending of the word and append something to the remainder

1. Find the common prefix of the word and its initial form
2. Count how many symbols to cut from the end of the word
3. Find what to append to the remainder

In [145]:
def are_strs(smth1, smth2):
    """Return True when both arguments are strings, False otherwise.

    ``isinstance`` is used instead of an exact type comparison so that ``str``
    subclasses (for example ``numpy.str_``, which is what iterating a NumPy
    string array yields) count as single words rather than as collections.
    """
    return isinstance(smth1, str) and isinstance(smth2, str)

In [155]:
def same_prefix_length(word1_, word2_):
    """Length of the common prefix of two words.

    If both arguments are strings, a single integer is returned. Otherwise the
    arguments are treated as collections, paired up with ``zip`` and a NumPy
    array holding one prefix length per pair is returned.
    """
    def shared_prefix(left, right):
        # Walk both words in lockstep and stop at the first mismatch.
        matched = 0
        for left_char, right_char in zip(left, right):
            if left_char != right_char:
                break
            matched += 1
        return matched

    if not are_strs(word1_, word2_):
        # Not two plain words: handle the inputs pair by pair.
        return np.array([shared_prefix(a, b) for a, b in zip(word1_, word2_)])
    return shared_prefix(word1_, word2_)

In [156]:
# How many characters to cut from the end of word1
def to_cut(word1_, word2_):
    """Number of characters of word1 that lie after its common prefix with word2.

    Returns an integer for two strings; for any other inputs the arguments are
    zipped together and a NumPy array with one count per pair is returned.
    """
    def suffix_length(source, target):
        shared = same_prefix_length(source, target)
        return len(source) - shared

    if not are_strs(word1_, word2_):
        return np.array([suffix_length(a, b) for a, b in zip(word1_, word2_)])
    return suffix_length(word1_, word2_)

In [168]:
# What has to be appended to word1 once its ending has been cut
def to_append(word1_, word2_):
    """Part of word2 that follows its common prefix with word1.

    An empty remainder is encoded as "$". Returns a string for two strings;
    for any other inputs the arguments are zipped together and a NumPy array
    with one remainder per pair is returned.
    """
    def tail_of_target(source, target):
        remainder = target[same_prefix_length(source, target):]
        # "$" stands for "nothing to append".
        return remainder if remainder != "" else "$"

    if not are_strs(word1_, word2_):
        return np.array([tail_of_target(a, b) for a, b in zip(word1_, word2_)])
    return tail_of_target(word1_, word2_)

In [169]:
# Relation between two words, encoded as "<symbols to cut>_<what to append>"
def get_relation(word1_, word2_):
    """Encode how word1 is turned into word2 as "<symbols to cut>_<what to append>".

    Returns a string for two strings; for any other inputs the arguments are
    zipped together and a NumPy array with one relation per pair is returned.
    """
    def encode(source, target):
        cut_count = to_cut(source, target)
        suffix = to_append(source, target)
        return "{}_{}".format(cut_count, suffix)

    if not are_strs(word1_, word2_):
        return np.array([encode(a, b) for a, b in zip(word1_, word2_)])
    return encode(word1_, word2_)

In [170]:
# Quick sanity check of the helpers on a few hand-made words
s1, s2, s3, s4 = "blakukurg", "blakava", "black", "hello"

# Common prefix lengths of consecutive pairs
for left, right in ((s1, s2), (s2, s3), (s3, s4)):
    print(same_prefix_length(left, right))

# Cut count, appended suffix and combined relation for the first pair
for helper in (to_cut, to_append, get_relation):
    print(helper(s1, s2))

4
3
0
5
ava
5_ava


#### Relations are our classes. We will classify using them.

## And now - cross validation using ngrams

In [176]:
# Label every training word with its "<symbols to cut>_<what to append>" relation
# to the initial form. This adds a column to lemmas_train in place.
lemmas_train['relation'] = get_relation(lemmas_train['Word'], lemmas_train['Init'])

In [177]:
# Show the first rows, including the 'relation' column.
lemmas_train.head()

Unnamed: 0,Word,Init,Prop,relation
0,vergognerete,vergognare,V,5_are
1,amnistiavate,amnistiare,V,4_re
2,menomazione,menomazione,N,0_$
3,sfaldavamo,sfaldare,V,4_re
4,sfodererei,sfoderare,V,4_are


In [None]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split reproducible on re-run. Note that this rebinds xtrain and xcv.
xtrain, xcv = train_test_split(lemmas_train, test_size=0.2, random_state=42)

In [None]:
# Show the first rows of the training split.
xtrain.head()

In [None]:
# Fresh, unfitted vectorizer for the relation model: binary character n-gram
# features (n-gram lengths 2 to 8, 'char_wb' analyzer, lowercased, max_df=0.84).
transformer = CountVectorizer(ngram_range=(2, 8), analyzer='char_wb', binary=True, lowercase=True, max_df=0.84)

In [None]:
# Transform words into a feature matrix whose features are n-grams.
# NOTE(review): this fits on all of lemmas_train, not on the xtrain split, so the
# vocabulary also covers the words held out in xcv. Confirm this is intended.
train_matrix = transformer.fit_transform(lemmas_train['Word'])

In [None]:
# Fit a default LogisticRegression with the 'relation' strings as class labels.
# NOTE(review): the targets come from the whole lemmas_train, not from xtrain, so
# scoring this model on xcv would evaluate on rows it was trained on. Confirm this is intended.
predictor = LogisticRegression().fit(train_matrix, lemmas_train['relation'])