In [1]:
import numpy as np
import pickle
import random
import re
from sklearn import preprocessing
from sklearn.svm import SVC

In [2]:
# Input corpus location; every derived file lives in the same directory.
data_dir = 'data'
input_file_name = 'arxiv_abstracts_10000.txt'
# Base name = everything before the first '.' of the input file name.
extensionless_input_file_name = input_file_name.partition('.')[0]
input_file_path = f'{data_dir}/{input_file_name}'

# Derived artefacts are named after the input's base name:
# the per-phrase feature file read below and the score list written at the end.
concept_feature_path = f'{data_dir}/{extensionless_input_file_name}_econ_feature.txt'
concept_score_path = f'{data_dir}/{extensionless_input_file_name}_score_list.bin'

In [3]:
def get_feature(row):
    """Parse the numeric feature vector from one tab-split line.

    Parameters
    ----------
    row : list of str
        Fields of one line of the feature file. Only rows with exactly two
        fields are accepted; the second field holds whitespace-separated
        numbers wrapped in one leading and one trailing character
        (presumably square brackets -- confirm against the file format).

    Returns
    -------
    list of float or None
        The four parsed values. ``None`` is returned when the row does not
        have exactly two fields, does not yield exactly four numbers, or
        cannot be parsed (in which case the error is printed).
    """
    try:
        if len(row) != 2:
            return None
        text = row[1].strip()
        # text[1:-1] drops the enclosing first/last character before splitting.
        # Raw string: '\s' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        values = [float(token) for token in re.split(r'\s+', text[1:-1].strip())]
    except Exception as e:
        # Best-effort parsing: report the problem and treat the row as invalid.
        print(e)
        return None
    return values if len(values) == 4 else None
        
# Build the phrase -> feature-vector mapping from the tab-separated feature file.
feature_dict = {}
skipped_lines = 0
with open(concept_feature_path) as fin:
    for line in fin:
        row = line.split('\t')
        feature = get_feature(row)
        if feature is None:
            # Skip a malformed line instead of aborting the whole load:
            # stopping at the first bad line would silently drop every
            # valid phrase that follows it.
            skipped_lines += 1
            continue
        feature_dict[row[0]] = feature

if skipped_lines:
    print(f'Skipped {skipped_lines} malformed line(s) in {concept_feature_path}')

In [4]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
# The with-block closes the file handle as soon as loading finishes.
with open('data/positive_samples.pkl', 'rb') as fin:
    positive_samples = pickle.load(fin)

In [5]:
SIZE = 1000  # width of the negative sampling window and cap on positives
SEED = 0

# Seed both generators used below so that Restart & Run All picks the same
# training phrases every time.
random.seed(SEED)
np.random.seed(SEED)

positive_set = set(positive_samples)

# Negative samples: take a random contiguous window of at most SIZE phrases
# and keep each one with probability ~0.5. Known positives are excluded so
# that no phrase ends up in the training set with both labels.
all_phrases = list(feature_dict)
# max(0, ...) keeps randint valid when fewer than SIZE phrases are available.
start = random.randint(0, max(0, len(all_phrases) - SIZE))
end = start + SIZE
neg_phrase = [
    phrase
    for phrase in all_phrases[start:end]
    if phrase not in positive_set and np.random.random() > 0.5
]

# Positive samples: known-positive phrases that have a feature vector,
# capped at exactly SIZE entries.
pos_phrase = []
for phrase in positive_samples:
    if phrase in feature_dict:
        pos_phrase.append(phrase)
        if len(pos_phrase) >= SIZE:
            break

In [6]:
# Training matrix: feature vectors of the positive phrases followed by those
# of the negative phrases, with labels 1 and 0 in the same order.
X = [feature_dict[name] for name in pos_phrase + neg_phrase]
y = [1] * len(pos_phrase) + [0] * len(neg_phrase)

In [7]:
# Rescale the feature vectors in X with sklearn's L2 normalisation.
# NOTE(review): in the cells visible here X_normalized is only displayed;
# the classifier is fitted on the raw X -- confirm that is intended.
X_normalized = preprocessing.normalize(X, norm='l2')

In [8]:
# Fit a MinMaxScaler on X and transform X with it in one step.
# NOTE(review): in the cells visible here X_train_minmax is only displayed,
# and min_max_scaler is not applied to the held-out features -- confirm
# whether scaled features were meant to be used for training/prediction.
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X)

In [9]:
# Compare the first ten rows of the raw, L2-normalised and min-max scaled features.
display(X[:10], X_normalized[:10], X_train_minmax[:10])

[[50.0, 0.56133494, 2.0, 0.0],
 [50.0, 0.62330029, 9.0, 0.0],
 [50.0, 0.53991033, 2.0, 0.0],
 [50.0, 0.63692987, 9.0, 0.0],
 [50.0, 0.54054098, 3.0, 0.0],
 [50.0, 0.62701565, 7.0, 0.0],
 [50.0, 0.59342183, 1.0, 0.0],
 [50.0, 0.6792788, 7.0, 0.0],
 [41.0, 0.52474151, 4.0, 0.0],
 [50.0, 0.53333279, 3.0, 0.0]]

array([[0.9991381 , 0.01121702, 0.03996552, 0.        ],
       [0.98410926, 0.01226791, 0.17713967, 0.        ],
       [0.9991428 , 0.01078895, 0.03996571, 0.        ],
       [0.98410599, 0.01253613, 0.17713908, 0.        ],
       [0.99814673, 0.01079078, 0.0598888 , 0.        ],
       [0.99026538, 0.01241824, 0.13863715, 0.        ],
       [0.99972968, 0.01186523, 0.01999459, 0.        ],
       [0.99025212, 0.01345315, 0.1386353 , 0.        ],
       [0.9951939 , 0.01273706, 0.09709209, 0.        ],
       [0.99814827, 0.0106469 , 0.0598889 , 0.        ]])

array([[1.        , 0.68738037, 0.10526316, 1.        ],
       [1.        , 0.76325978, 0.47368421, 1.        ],
       [1.        , 0.66114495, 0.10526316, 1.        ],
       [1.        , 0.77994982, 0.47368421, 1.        ],
       [1.        , 0.66191721, 0.15789474, 1.        ],
       [1.        , 0.76780941, 0.36842105, 1.        ],
       [1.        , 0.72667223, 0.05263158, 1.        ],
       [1.        , 0.83180803, 0.36842105, 1.        ],
       [0.82      , 0.64257003, 0.21052632, 1.        ],
       [1.        , 0.65309045, 0.15789474, 1.        ]])

In [10]:
# Linear-kernel SVM trained on the raw (unscaled) feature vectors X.
# probability=True is set so that clf.predict_proba can be called later.
# NOTE(review): no random_state is passed, so the probability estimates may
# differ between runs -- confirm whether reproducibility matters here.
clf = SVC(probability=True, kernel='linear')
clf.fit(X, y)

SVC(kernel='linear', probability=True)

In [11]:
# Inspect the fitted linear model: per-feature weights, then the intercept.
print(clf.coef_)
print(clf.intercept_)

[[-8.42131976e-05 -1.39922808e+00  9.33853805e-03 -1.00409563e+00]]
[-0.25061879]


In [12]:
# Every phrase used for training, as a set for fast membership tests.
train_phrase = set(pos_phrase).union(neg_phrase)

In [13]:
# Held-out set: every phrase with features that was not used for training.
# X_test_phrase and X_test are index-aligned (same order as feature_dict).
held_out = [
    (name, vector)
    for name, vector in feature_dict.items()
    if name not in train_phrase
]
X_test_phrase = [name for name, _vector in held_out]
X_test = [vector for _name, vector in held_out]

In [14]:
# Score the held-out phrases with the fitted classifier:
# predicted labels, signed decision-function values, and class probabilities.
# NOTE(review): only y_prob is used in the cells visible here; y_pred and
# y_score are computed but not referenced afterwards.
y_pred = clf.predict(X_test)
y_score = clf.decision_function(X_test)
y_prob = clf.predict_proba(X_test)

In [15]:
# Persist column 1 of y_prob (the probability of label 1, given the 0/1
# training labels) for every held-out phrase. Only the scores are written;
# they follow the order of X_test_phrase, which is not saved alongside them.
# The with-block guarantees the file is flushed and closed after the dump.
with open(concept_score_path, 'wb') as fout:
    pickle.dump(y_prob[:, 1], fout)