In [1]:
import os.path
import sys
# NOTE(review): the two entries below are absolute, user-specific locations;
# on any other machine they will not exist and the project-local imports that
# follow (`scoping`, `utils.text`) will presumably fail -- confirm and consider
# making the location configurable.
sys.path.append('/home/galm/software/django/tmv/BasicBrowser/')
sys.path.append('/home/max/software/django-tmv/tmv_mcc-apsis/BasicBrowser')
import scoping
# NOTE(review): star import -- `snowball_stemmer`, used by the vectorisation
# cell, is not defined anywhere in this notebook and presumably comes from
# here; verify against utils/text.py.
from utils.text import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC

import pandas as pd
import numpy as np

import pickle
import scipy.sparse

import matplotlib.pyplot as plt

In [2]:
# Read the labelled and unlabelled document tables from disk.
seen_df = pd.read_csv('../data/0_labelled_documents.csv')
unseen_df = pd.read_csv('../data/0_unlabelled_documents.csv')

# Stack both tables and order them by id so the shuffle starts from a fixed
# arrangement; then shuffle every row with a fixed seed and renumber the rows
# from zero.
df = pd.concat([seen_df, unseen_df]).sort_values(by='id')
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

# Row labels of the documents whose `seen` flag is 1 and 0 respectively.
is_seen = (df['seen'] == 1).to_numpy()
is_unseen = (df['seen'] == 0).to_numpy()
seen_index = df.index[is_seen]
unseen_index = df.index[is_unseen]

In [3]:
# Set to True to force the TF-IDF features to be rebuilt even when cached
# copies are present on disk.
revectorize = False

# The cache files are keyed only by row counts (all documents for the feature
# matrix, labelled documents for the fitted vectorizer), so a change in the
# text that keeps the counts identical will NOT invalidate them -- set
# `revectorize = True` in that case.
X_path = f'../data/X_{df.shape[0]}.npz'
vec_path = f'../data/vec_{seen_df.shape[0]}.pickle'

X_exists = os.path.isfile(X_path)
vec_exists = os.path.isfile(vec_path)

# Both artefacts are needed by the load branch; rebuild if either is missing
# rather than failing with FileNotFoundError on the absent one.
if revectorize or not (X_exists and vec_exists):
    print("running vectorisation again")
    vec = TfidfVectorizer(
        ngram_range=(1,2),
        min_df=10, max_df=0.8, strip_accents='unicode',
        max_features=20000,
        tokenizer=snowball_stemmer()
    )
    # The vocabulary and IDF weights are learned from the labelled documents
    # only; the fitted vectorizer is then applied to every document.
    vec.fit(df.loc[seen_index,"content"].astype("str"))

    X = vec.transform(df['content'].astype("str"))
    with open(vec_path, 'wb') as f:
        pickle.dump(vec, f)
    scipy.sparse.save_npz(X_path, X)
else:
    print("loading feature matrix")
    # NOTE: unpickling executes arbitrary code; only load cache files that
    # were written by this notebook.
    with open(vec_path, 'rb') as f:
        vec = pickle.load(f)
    X = scipy.sparse.load_npz(X_path)

X.shape

running vectorisation again




(378365, 7394)

In [4]:
from sklearn.model_selection import KFold

# Target labels; only the rows in `seen_index` are used for fitting.
y = df['relevant']

# Split the labelled rows into 10 consecutive folds. Each iteration trains on
# nine of them and scores every unlabelled document, giving 10 probability
# estimates per unlabelled document.
kf = KFold(n_splits=10)
kfs = kf.split(X[seen_index],y[seen_index])

# The unlabelled slice is the same for every fold, so take it once.
X_unseen = X[unseen_index]

y_preds = []
for k_train, _k_test in kfs:
    # probability=True makes SVC run an internal randomised cross-validation
    # to calibrate probabilities; fixing random_state keeps the predictions
    # reproducible between runs.
    clf = SVC(
        kernel='rbf', class_weight='balanced', probability=True, C=10,
        random_state=1,
    )
    # KFold yields positions within the labelled subset; map them back to
    # row labels of `df` / rows of `X`.
    k_train = seen_index[k_train]
    clf.fit(X[k_train],y[k_train])
    # Column 1 of predict_proba is the second entry of clf.classes_
    # (presumably relevant == 1 -- TODO confirm).
    y_preds.append(clf.predict_proba(X_unseen)[:,1])

y_preds = np.array(y_preds)
# np.save appends ".npy" to names that lack it, so the file written is
# "../data/y_preds.npz.npy" (a plain .npy array, not an .npz archive).
np.save("../data/y_preds.npz",y_preds)

In [5]:
# Reload the stacked per-fold predictions and collapse them across the first
# axis into a mean and a standard deviation per column.
y_preds = np.load("../data/y_preds.npz.npy")
mean_pred = y_preds.mean(axis=0)
std_pred = y_preds.std(axis=0)

# One-standard-deviation band around the mean, capped at 1 above and 0 below.
preds_upper = np.clip(mean_pred + std_pred, None, 1)
preds_lower = np.clip(mean_pred - std_pred, 0, None)

In [7]:
# Prediction summaries to attach to the unlabelled rows, in output order.
prediction_columns = {
    "0 - relevance - mean_prediction": mean_pred,
    "0 - relevance - std_prediction": std_pred,
    "0 - relevance - lower_pred": preds_lower,
    "0 - relevance - upper_pred": preds_upper,
}

# Only rows in `unseen_index` receive values; all other rows stay empty (NaN)
# in these columns.
for column_name, values in prediction_columns.items():
    df.loc[unseen_index, column_name] = values

# Write the document id together with the four prediction columns.
cols = ["id", *prediction_columns]

df[cols].to_csv('../data/1_document_relevance.csv', index=False)

In [8]:
# Display the combined frame, which now carries the four
# "0 - relevance - ..." prediction columns added in the previous cell.
df

Unnamed: 0,id,content,title,wosarticle__de,wosarticle__wc,ar5,seen,relevant,random_sample,physical_tags,...,8 - 8.07. Qualitative - expert interviews,8 - 8.08. Qualitative - household surveys,8 - 8.09. Mixed methods,8 - 8.10. Case studies,8 - 8.11. Remote sensing,9 - observations,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
0,1783633,Physicochemical properties and chemism of atmo...,Chemism of Atmospheric Precipitation as a Cons...,atmospheric immission; precipitation; acid rai...,['Environmental Sciences'],,0.0,0.0,0,,...,,,,,,,0.034508,0.006457,0.028052,0.040965
1,1455384,Wetlands occur where biotic and abiotic condit...,High altitude montane wetland vegetation class...,Platberg; Inselberg; Phytosociology; Hydrophyt...,['Plant Sciences'],,0.0,0.0,0,,...,,,,,,,0.376960,0.036163,0.340796,0.413123
2,1340101,The atmospheric forcing on the Barents Sea ice...,Atmospheric forcing on the Barents Sea winter ...,,['Meteorology & Atmospheric Sciences'],,0.0,0.0,0,,...,,,,,,,0.524275,0.063795,0.460479,0.588070
3,1461135,An important prerequisite to better understand...,River-aquifer exchange fluxes under monsoonal ...,River-aquifer exchange fluxes; Heat as a natur...,"['Engineering, Civil; Geosciences, Multidiscip...",,0.0,0.0,0,,...,,,,,,,0.231735,0.028298,0.203437,0.260033
4,1344041,"Pacific coast, until recent work provided data...",Fishing in Peru between 10000 and 3750 BP,Peru; early and mid-Holocene; pre-ceramic peri...,['Anthropology; Archaeology'],,0.0,0.0,0,,...,,,,,,,0.272186,0.051346,0.220840,0.323532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378360,488178,The global population is predicted to grow to ...,Antioxidant dynamics in the live animal and im...,food security; oxidative stress,"['Agriculture, Dairy & Animal Science']",,0.0,0.0,0,,...,,,,,,,0.057176,0.015937,0.041239,0.073113
378361,3319329,The concentrations and distribution of natural...,Radioactivity concentrations and dose assessme...,,"['Environmental Sciences; Public, Environmenta...",,0.0,0.0,0,,...,,,,,,,0.066306,0.012636,0.053669,0.078942
378362,2342937,An important factor influencing food quality a...,Fusarium mycotoxins in oats,oat; mycotoxin; Fusarium spp.,['Food Science & Technology'],,0.0,0.0,0,,...,,,,,,,0.148948,0.025102,0.123845,0.174050
378363,1921295,"The Argens upper watershed, upstream to the Va...",LANDSCAPES AND PROBLEMS OF THE HIGH BASIN OF T...,karst; springs; streams; hydochemistry; traver...,"['Geosciences, Multidisciplinary']",,0.0,0.0,0,,...,,,,,,,0.056214,0.010344,0.045869,0.066558
