In [22]:
import pickle
import numpy as np
from nltk.stem import SnowballStemmer

In [23]:
with open('cognates', 'rb') as f:
    cognates = pickle.load(f)

In [24]:
cognates_sub = cognates[:100]

stemmer = SnowballStemmer("russian")
cognates_stemmed = [(stemmer.stem(word), i) for word, i in cognates_sub]

In [16]:
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

In [17]:
def lcs(a, b):
    lengths = [[0 for j in range(len(b)+1)] for i in range(len(a)+1)]
    # row 0 and column 0 are initialized to 0 already
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            if x == y:
                lengths[i+1][j+1] = lengths[i][j] + 1
            else:
                lengths[i+1][j+1] = max(lengths[i+1][j], lengths[i][j+1])
    # read the substring out from the matrix
    result = ""
    x, y = len(a), len(b)
    while x != 0 and y != 0:
        if lengths[x][y] == lengths[x-1][y]:
            x -= 1
        elif lengths[x][y] == lengths[x][y-1]:
            y -= 1
        else:
            assert a[x-1] == b[y-1]
            result = a[x-1] + result
            x -= 1
            y -= 1
    return result

In [25]:
X = []
y = []

for word_1, class_1 in cognates_stemmed:
    for word_2, class_2 in cognates_stemmed:
        if word_1 == word_2:
            continue
        y.append(1 if class_1 == class_2 else 0)
        X.append([
                    levenshtein(word_1, word_2) / max(len(word_1), len(word_2)),
                    abs(len(word_1) - len(word_2)),
                    len(lcs(word_1, word_2)) / max(len(word_1), len(word_2))
                 ]
                )

In [28]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred_svm = clf.predict(X_test)

#from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier()
#clf.fit(X_train, y_train)
#y_pred_rf = clf.predict(X_test)

In [29]:
y_pred = list(map(lambda x: 0 if x < 0.95 else 1, y_pred_svm))

from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_pred, average='binary')

(0.8928571428571429, 0.6983240223463687, 0.78369905956112851, None)

In [30]:
np.sum([y_pred_svm[0], y_pred_rf[0]], axis=0)

array([ 1.94615234,  0.05384766])

In [30]:
clf.coef_

array([[-2.5288377 ,  0.20594365,  8.06039389]])