In [None]:
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

In [None]:
# Load the SECOP data set from a local pickle file (the file name suggests the
# text columns were already pre-processed upstream — TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
secop = pd.read_pickle('../data/secop_proc_text.pickle')

In [None]:
# Flag a contract as COVID-related when its cleaned description mentions one of
# the keywords AND its signing date is on or after 2020-03-17.
# `na=False` makes rows with a missing description count as "no match" instead
# of propagating a missing value into the boolean label.
# NOTE(review): the date comparison assumes `fecha_de_firma` is a datetime (or an
# ISO-formatted string) column — confirm.
secop = secop.assign(
    is_covid=lambda x: (
        x.clean_description.str.contains('covid|coronavirus|pandemia', na=False)
        & (x.fecha_de_firma >= '2020-03-17')
    )
)

Separate a portion of the documents for testing, stratified by `is_covid`

In [None]:
# Stratified 80/20 split on the `is_covid` flag.
# A fixed `random_state` keeps the split — and every count, model and prediction
# derived from it further down — reproducible across kernel restarts.
secop_train, secop_test = train_test_split(
    secop,
    test_size=0.2,
    stratify=secop.is_covid,
    random_state=0,
)

In [None]:
# Number of training rows for each value of `is_covid` (class balance of the training split).
secop_train.groupby('is_covid')['is_covid'].count()

In [None]:
# Number of held-out rows for each value of `is_covid` (class balance of the test split).
secop_test.groupby('is_covid')['is_covid'].count()

## One-class SVM

Text processing

In [None]:
# Stemmed descriptions of every training contract ...
all_documents = secop_train['stemmed_descriptions']
# ... and of the COVID-flagged training contracts only.
covid_documents = secop_train.loc[secop_train['is_covid'], 'stemmed_descriptions']

# Report the size of each collection.
print(all_documents.shape)
print(covid_documents.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

Fit the vocabulary and the TF-IDF weights on all training documents, but fit the anomaly detector only on the COVID subset, which is still represented with the full vocabulary.

In [None]:
# Build the vocabulary from every training document. Terms present in more than
# half of the documents or in fewer than 5 documents are discarded, and the three
# label keywords are excluded as stop words.
# NOTE(review): the documents are stemmed, so confirm these literal stop words
# still match the stemmed tokens.
count_vectorizer = CountVectorizer(
    stop_words=['covid', 'coronavirus', 'pandemia'],
    min_df=5,
    max_df=0.5,
)
count_vectorizer.fit(all_documents)

# Term-count matrices for both collections, expressed in the shared vocabulary.
all_documents_counts, covid_documents_counts = map(
    count_vectorizer.transform,
    (all_documents, covid_documents),
)

In [None]:
# Learn the inverse-document-frequency weights from the full training corpus.
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(all_documents_counts)

# Re-weight both count matrices with those corpus-wide weights.
all_documents_tfidf, covid_documents_tfidf = map(
    tf_transformer.transform,
    (all_documents_counts, covid_documents_counts),
)

In [None]:
# Linear one-class SVM fitted on the COVID documents only, so that it learns the
# region of TF-IDF space those documents occupy.
svm_model = (
    OneClassSVM(nu=0.1, kernel='linear')
    .fit(covid_documents_tfidf)
)

In [None]:
# Display the first COVID-flagged training document.
covid_documents.iloc[0]

Internal predictions (in-sample, on the training split)

In [None]:
# Score the COVID subset the model was fitted on, and the whole training split.
covid_predictions, all_predictions = map(
    svm_model.predict,
    (covid_documents_tfidf, all_documents_tfidf),
)

In [None]:
# Re-attach the row labels of the source documents so that each prediction can be
# aligned with (and filtered alongside) its document.
covid_predictions = pd.Series(data=covid_predictions, index=covid_documents.index)
all_predictions = pd.Series(data=all_predictions, index=all_documents.index)

In [None]:
# Training documents NOT flagged as COVID, together with their predictions.
# Despite the `test_` prefix these rows come from the training split; the same
# names are re-bound to the held-out split further down.
test_predictions = all_predictions.drop(index=covid_documents.index)
test_documents = all_documents.drop(index=covid_documents.index)

In [None]:
import numpy as np

In [None]:
# Masks for positive (inlier) and negative (outlier) predictions on the COVID subset.
covid_inlier_mask = covid_predictions > 0
covid_outlier_mask = covid_predictions < 0

# Share of COVID documents predicted positive, then the number predicted negative.
print(np.sum(covid_inlier_mask) / len(covid_predictions))
print(np.sum(covid_outlier_mask))

In [None]:
# Masks for positive (inlier) and negative (outlier) predictions on the non-COVID rows.
test_inlier_mask = test_predictions > 0
test_outlier_mask = test_predictions < 0

# Share of non-COVID documents predicted positive, then the number predicted negative.
print(np.sum(test_inlier_mask) / len(test_predictions))
print(np.sum(test_outlier_mask))

In [None]:
# Display one non-COVID training document that received a negative prediction.
# The position (120) is hard-coded: this raises IndexError when fewer than 121
# documents have a negative prediction.
test_documents[test_predictions<0].iloc[120]

Test predictions (on the held-out split)

In [None]:
# Push the held-out documents through the already-fitted pipeline:
# counts -> tf-idf -> one-class SVM. Nothing is re-fitted here.
# Note that `test_documents` / `test_predictions` are re-bound to the held-out
# split from this point on.
test_documents = secop_test['stemmed_descriptions']
test_documents_counts = count_vectorizer.transform(test_documents)
test_documents_tfidf = tf_transformer.transform(test_documents_counts)
test_predictions = svm_model.predict(test_documents_tfidf)

In [None]:
# TODO: write a function that evaluates the predictions for covid vs. non-covid documents.
# Until then, show how many held-out documents received a negative prediction.
test_predictions[test_predictions<0].size

In [None]:
# Minimal scikit-learn example on synthetic data showing how to draw a confusion
# matrix (kept as a template for evaluating the model above).
# `sklearn.metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the matrix is computed explicitly and rendered with
# `ConfusionMatrixDisplay`, which exists in every release that shipped the old helper.
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Synthetic binary classification problem with a fixed seed for reproducibility.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = SVC(random_state=0)
clf.fit(X_train, y_train)

# Rows are true labels, columns are predicted labels.
test_confusion = confusion_matrix(y_test, clf.predict(X_test), labels=clf.classes_)
ConfusionMatrixDisplay(test_confusion, display_labels=clf.classes_).plot(cmap=plt.cm.Blues)
plt.show()