In [119]:
import os
import math
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTMLParser subclass that keeps only the text handed to `handle_data`.

    Each text chunk is appended to `self.fed`; `get_data` joins the chunks
    into one string without any separator.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()
        # Text chunks, in the order the parser reported them.
        self.fed = list()

    def handle_data(self, d):
        """Store one text chunk reported by the parser."""
        self.fed += [d]

    def get_data(self):
        """Return all stored chunks concatenated together."""
        glue = ''
        return glue.join(self.fed)

def strip_tags(html):
    """Return the text content of `html` with the markup removed.

    Parameters
    ----------
    html : str
        Markup (or plain text) to clean.

    Returns
    -------
    str
        The text chunks collected by `MLStripper`, concatenated in order.
    """
    s = MLStripper()
    s.feed(html)
    # HTMLParser may hold back trailing text until it knows no more input is
    # coming (e.g. input ending in "R&D", where the "&" could still be the
    # start of a character reference). close() flushes that buffered text so
    # it is not silently lost.
    s.close()
    return s.get_data()

In [98]:
def by_vect_cosine_similarity(v1, v2):
    """Cosine similarity of two numeric sequences, computed element by element.

    Parameters
    ----------
    v1, v2 : iterable of numbers
        The vectors to compare. They are paired with `zip`, so if the lengths
        differ the extra elements of the longer one are ignored.

    Returns
    -------
    float
        dot(v1, v2) / (|v1| * |v2|). When either vector has zero norm
        (including empty input) the similarity is undefined; 0.0 is returned
        instead of dividing by zero.
    """
    sumx2, sumy2, sumxy = 0, 0, 0
    for x, y in zip(v1, v2):
        sumx2 += x*x
        sumy2 += y*y
        sumxy += x*y
    norm_product = math.sqrt(sumx2*sumy2)
    if norm_product == 0:
        # At least one vector is all zeros: no direction to compare.
        return 0.0
    return sumxy/norm_product

In [47]:
# Root folder of the corpus; the `base` and `test` sub-directories are read from it.
path = 'data/'

# os.listdir() gives no ordering guarantee, so sort the names to make
# positional look-ups such as train_texts[2] reproducible between runs.
train_texts_files = sorted(os.listdir(path+'base'))

# Ids used below to select which files of the `test` directory are processed.
target_texts_ids = [1, 2562, 1539, 3588, 3593, 2575, 939, 2066, 2579, 22, 3607, 536, 1561, 1054, 31, 2481, 1571, 3626, 2603, 45, 1076, 569, 570, 2656, 579, 3141, 1608, 586, 2635, 78, 1616, 1107, 3940, 2649, 2139, 2140, 607, 2144, 612, 1126, 1639, 2668, 2158, 3183, 636, 2174, 1151, 2688, 1666, 3203, 535, 3719, 3213, 3728, 2194, 147, 2708, 1174, 2712, 2201, 1691, 1180, 2206, 1697, 3748, 1701, 680, 2224, 1714, 3913, 695, 3256, 1726, 3775, 1728, 1217, 3780, 2758, 3783, 715, 2253, 110, 2258, 723, 1244, 1757, 2427, 3808, 2583, 1262, 240, 241, 60, 3827, 3830, 3456, 770, 3332, 262, 775, 776, 3850, 269, 273, 274, 2324, 789, 3547, 2337, 290, 2339, 1331, 2863, 2782, 3378, 307, 52, 2361, 2362, 3387, 3901, 833, 2884, 3397, 2374, 1154, 2891, 2896, 3409, 1849, 1367, 857, 3418, 347, 656, 1885, 2398, 1892, 2921, 829, 880, 1394, 2931, 2423, 888, 891, 382, 1920, 2028, 1928, 909, 3473, 2450, 1432, 581, 1437, 1951, 2977, 2467, 932, 2984, 413, 1963, 2992, 1457, 2377, 950, 1525, 3516, 1469, 587, 3007, 2806, 3545, 2510, 967, 3576, 462, 2126, 2003, 2006, 3544, 1227, 2010, 153, 2526, 2531, 375, 999, 2025, 2538, 3240, 2548, 1013, 887, 1016, 3065, 1020, 510, 511]


def _target_file_id(file_name):
    """Return the integer id embedded in a `test_<id>.txt` name, or None if there is none."""
    try:
        return int(file_name.replace("test_", "").replace(".txt",  ""))
    except ValueError:
        # Directory entries that do not follow the naming scheme carry no id;
        # skip them instead of aborting the whole listing.
        return None


# A set gives constant-time membership tests instead of scanning the list per file.
_wanted_ids = set(target_texts_ids)
target_texts_files = sorted(
    (x for x in os.listdir(path+'test') if _target_file_id(x) in _wanted_ids),
    key=lambda name: (_target_file_id(name), name),  # numeric order, name as tie-breaker
)

In [58]:
def _load_plain_texts(directory, file_names):
    """Read every named file under `directory`, lowercase it and strip its HTML tags."""
    plain_texts = []
    for name in file_names:
        # NOTE(review): no explicit encoding is passed, so decoding relies on the
        # platform default — confirm the corpus encoding before making it explicit.
        with open(directory + name, 'r') as handle:
            raw = handle.read()
        plain_texts.append(strip_tags(raw.lower()))
    return plain_texts


train_texts = _load_plain_texts(path+'base/', train_texts_files)
target_texts = _load_plain_texts(path+'test/', target_texts_files)

In [96]:
# Display the preprocessed (lowercased, tag-stripped) train text at index 2 as a spot check.
train_texts[2]

'ищем прекрасного программиста 1с в дружный отдел сотрудников сферы it с неутомительным режимом работы. у нас есть 1с 8.2, 8.3, бухгалтерия строительной организации, жкх, зуп + самописные. что надо делать: - доработать существующие конфигурации,- разработать новые конфигураций,- отчеты, базы данных...- осуществлять поддержку наших продвинутых пользователей режим работы с 10 до 18 по вторникам и пятницам (строго!).  '

In [97]:
# Display the preprocessed (lowercased, tag-stripped) target text at index 99 as a spot check.
target_texts[99]

'продавецтребования:   пунктуальность, порядочность,  условия:   6 дневная рабочая неделя с 9.00- 19.00  обязанности:  продавец в контейнер 1000 мелочей (болты, гайки, инструменты, электроинструменты, саморезы и т.д. )   '

In [103]:
# Learn the vocabulary from the train texts only, then express both corpora
# as term-count matrices over that shared vocabulary.
count_vect = CountVectorizer()
count_vect.fit(train_texts)
X_train_counts = count_vect.transform(train_texts)
X_target_counts = count_vect.transform(target_texts)

In [107]:
# Fit the idf weights on the train counts and apply them to both matrices.
# norm=None leaves the rows unnormalised; smooth_idf=False disables idf smoothing.
tf_idf = TfidfTransformer(norm=None, smooth_idf=False)
tf_idf.fit(X_train_counts)
X_train_tfidf = tf_idf.transform(X_train_counts)
X_target_tfidf = tf_idf.transform(X_target_counts)

In [106]:
# Show the (documents, vocabulary terms) shape of the train and target count matrices.
for counts in (X_train_counts, X_target_counts):
    print(counts.shape)

(20, 1461)
(200, 1461)


In [101]:
# Check the hand-written cosine similarity on the first two train texts.
train_dense = X_train_tfidf.toarray()
by_vect_cosine_similarity(train_dense[0], train_dense[1])

0.085336510249571187

In [102]:
# Same check with scikit-learn: similarity of train text 0 to every train text.
first_train_row = X_train_tfidf[:1]  # slice (not index) to keep the row two-dimensional
cosine_similarity(first_train_row, X_train_tfidf)

array([[ 1.        ,  0.08533651,  0.00973161,  0.03904741,  0.03681657,
         0.35014289,  0.03681657,  0.06902423,  0.014982  ,  0.03681657,
         0.01540582,  0.06119115,  0.01075213,  0.07372137,  0.03591924,
         0.05083279,  0.06012874,  0.03876726,  0.03904741,  0.03531867]])

In [None]:
# Cosine similarity sum between train texts and each of the test texts

In [115]:
# One score per target text: the sum of its cosine similarities to all train texts.
target_coisine_vect = [
    # a single row is compared, so the result has one row; [0] selects it
    sum(cosine_similarity(X_target_tfidf[row], X_train_tfidf)[0])
    for row in range(len(target_texts))
]

In [121]:
# Average score over all target texts; used as the classification threshold below.
target_cosine_mmean = np.asarray(target_coisine_vect).mean()

In [122]:
# Label 1 for target texts scoring at or above the mean, 0 for the rest.
classes = [int(score >= target_cosine_mmean) for score in target_coisine_vect]

In [210]:
# Numeric id of every target file, parsed once. The order matches `classes`
# because both follow the order of target_texts_files.
target_file_ids = [int(x.replace("test_","").replace(".txt","").strip()) for x in target_texts_files]
assert len(target_file_ids) == len(classes), "one class label is expected per target file"
print(len(target_file_ids))
# Pair each id with its label and collect plain Python ints directly, so the
# lists are JSON-serialisable without np.asscalar (deprecated in NumPy 1.16,
# removed in 1.23).
defined = [file_id for file_id, label in zip(target_file_ids, classes) if label == 1]
print(len(defined))
other = [file_id for file_id, label in zip(target_file_ids, classes) if label != 1]
print(len(other))

200
97
103


In [215]:
# Check the element type of `defined` (native int is needed for the JSON dump below).
type(defined[0])

int

In [217]:
# Result payload: ids labelled 1 under 'defined', all remaining ids under 'other'.
out = {'defined': defined, 'other': other}

In [218]:
import json

# `out` is rebound from the result dict to its JSON text because the next cell
# writes `out` to disk. Guard the conversion so that re-running this cell does
# not JSON-encode the already encoded string a second time.
if not isinstance(out, str):
    out = json.dumps(out, ensure_ascii=False, sort_keys=True, indent=4)

In [219]:
# Write the JSON text produced above. The dump uses ensure_ascii=False, so the
# file encoding is pinned to UTF-8 instead of depending on the platform locale.
with open('lab4.json', 'w', encoding='utf-8') as f:
    f.write(str(out))