In [1]:
import os.path
import sys
sys.path.append('/home/galm/software/django/tmv/BasicBrowser/')
sys.path.append('/home/max/software/django-tmv/tmv_mcc-apsis/BasicBrowser')
import scoping
from utils.text import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC

import pandas as pd
import numpy as np

import pickle
import scipy.sparse

import matplotlib.pyplot as plt

In [2]:
# Read the two document tables (named "labelled" and "unlabelled" on disk).
seen_df = pd.read_csv('../data/0_labelled_documents.csv')
unseen_df = pd.read_csv('../data/0_unlabelled_documents.csv')

# Stack both tables and order by id so the shuffle below starts from a
# fixed row order, then shuffle every row with a fixed seed and renumber
# the index 0..n-1 so row labels and row positions coincide.
df = pd.concat([seen_df, unseen_df]).sort_values('id')
df = df.sample(frac=1, random_state=1)
df = df.reset_index(drop=True)

# Row labels of the documents flagged seen == 1 and seen == 0 respectively.
seen_index = df.index[df['seen'] == 1]
unseen_index = df.index[df['seen'] == 0]

In [3]:
# Build, or reload from disk, the TF-IDF vectorizer `vec` and the feature
# matrix `X` covering every row of `df`.
#
# Set `revectorize = True` to force a rebuild even when cached files exist.
revectorize = False

# Cache file names are keyed on row counts: the vectorizer on the number of
# rows in seen_df, the matrix on the number of rows in df.
vec_path = f'../data/vec_{seen_df.shape[0]}.pickle'
X_path = f'../data/X_{df.shape[0]}.npz'

# The loading branch needs BOTH files, so a missing vectorizer pickle must
# also trigger a rebuild instead of failing with FileNotFoundError.
X_exists = os.path.isfile(X_path)
vec_exists = os.path.isfile(vec_path)

if revectorize or not (X_exists and vec_exists):
    print("running vectorisation again")
    vec = TfidfVectorizer(
        ngram_range=(1,2),
        min_df=10, max_df=0.8, strip_accents='unicode',
        max_features=20000,
        # snowball_stemmer comes from utils.text (star import); presumably a
        # stemming tokenizer -- confirm against that module.
        tokenizer=snowball_stemmer()
    )
    # The vocabulary is fitted on the seen documents only ...
    vec.fit(df.loc[seen_index,"content"].astype("str"))

    # ... and then applied to every document, seen and unseen alike.
    X = vec.transform(df['content'].astype("str"))
    with open(vec_path, 'wb') as f:
        pickle.dump(vec, f)
    scipy.sparse.save_npz(X_path, X)
else:
    print("loading feature matrix")
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # cache files that this notebook wrote itself.
    with open(vec_path, 'rb') as f:
        vec = pickle.load(f)
    X = scipy.sparse.load_npz(X_path)

X.shape

running vectorisation again




(398971, 7450)

In [4]:
from sklearn.model_selection import KFold

# Train one SVM per fold of the seen documents and predict every unseen
# document with each of them, giving an ensemble of 10 predictions whose
# spread can be summarised afterwards.
y = df['relevant']

# No shuffling here: KFold splits the seen rows in their current order.
kf = KFold(n_splits=10)
kfs = kf.split(X[seen_index],y[seen_index])
y_preds = []
for k_train, _k_held_out in kfs:
    # random_state pins the internal shuffling SVC uses to calibrate
    # probabilities (probability=True), so reruns give the same y_preds.
    clf = SVC(
        kernel='rbf', class_weight='balanced', probability=True, C=10,
        random_state=1,
    )
    # KFold yields positions within the seen subset; map them back to row
    # labels of df (which equal row positions of X).
    k_train = seen_index[k_train]
    clf.fit(X[k_train],y[k_train])
    # Column 1 is the probability of the second entry of clf.classes_
    # (assumed to be the "relevant" class for 0/1 labels -- confirm).
    y_preds.append(clf.predict_proba(X[unseen_index])[:,1])

# Stack into one row per fold, one column per unseen document.
y_preds = np.array(y_preds)
# np.save appends ".npy" to any name that lacks it, so the original
# "../data/y_preds.npz" argument produced this exact file; the path is now
# spelled out in full so it matches what is actually written to disk.
np.save("../data/y_preds.npz.npy",y_preds)

In [5]:
# Reload the saved prediction array from disk.
y_preds = np.load("../data/y_preds.npz.npy")

# Summarise along axis 0: mean and standard deviation per column.
mean_pred = y_preds.mean(axis=0)
std_pred = y_preds.std(axis=0)

# One-standard-deviation band around the mean, capped at 1 from above and
# floored at 0 from below.
preds_upper = np.clip(mean_pred + std_pred, None, 1)
preds_lower = np.clip(mean_pred - std_pred, 0, None)

In [6]:
# Prediction summaries to attach to the unseen rows of df, keyed by the
# output column name (insertion order is the column order).
prediction_columns = {
    "0 - relevance - mean_prediction": mean_pred,
    "0 - relevance - std_prediction": std_pred,
    "0 - relevance - lower_pred": preds_lower,
    "0 - relevance - upper_pred": preds_upper,
}

# Only rows in unseen_index are written; other rows are not assigned.
for column_name, column_values in prediction_columns.items():
    df.loc[unseen_index, column_name] = column_values

# Export the document id alongside the four prediction columns.
cols = ["id", *prediction_columns]

df[cols].to_csv('../data/1_document_relevance.csv',index=False)

In [7]:
# Display the combined frame, which by this point carries the four
# "0 - relevance - ..." prediction columns (the notebook truncates the view).
df

Unnamed: 0,id,content,title,wosarticle__de,wosarticle__wc,ar5,seen,relevant,random_sample,physical_tags,...,8 - 8.08. Qualitative - household surveys,8 - 8.09. Mixed methods,8 - 8.10. Case studies,8 - 8.11. Remote sensing,9 - observations,4 - 50 Other (marine & coastal),0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
0,740010,Global warming is a major challenge that we ar...,Consequence analysis of accidental release of ...,Carbon dioxide; CO2 pipeline leakage; CO2 tran...,['Green & Sustainable Science & Technology; En...,,0.0,0.0,0,,...,,,,,,,0.009046,0.001371,0.007675,0.010417
1,3300415,Severe wind is one of the major hazards facing...,Monte-Carlo Modelling of Severe Wind Gust,Severe wind hazard; return periods; Monte Carl...,"['Computer Science, Information Systems; Ecolo...",,0.0,0.0,0,,...,,,,,,,0.055892,0.013545,0.042347,0.069436
2,3821128,Cold wakes of previous tropical cyclones (TCs)...,Advection by the North Equatorial Current of a...,multiple tropical cyclones; profiling floats; ...,['Oceanography'],,0.0,0.0,0,,...,,,,,,,0.269951,0.026715,0.243236,0.296665
3,711341,Discontinuous permafrost in the North American...,Edaphic and microclimatic controls over permaf...,permafrost; fire; boreal forest,['Environmental Sciences; Meteorology & Atmosp...,,0.0,0.0,0,,...,,,,,,,0.465122,0.068707,0.396415,0.533828
4,1474274,Aims The aim of this article is 4-fold: (i) to...,Reinvestigation on species richness and enviro...,liverworts; mosses; species density; species d...,['Plant Sciences; Ecology; Forestry'],,0.0,0.0,0,,...,,,,,,,0.509145,0.117428,0.391717,0.626573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398966,488178,The global population is predicted to grow to ...,Antioxidant dynamics in the live animal and im...,food security; oxidative stress,"['Agriculture, Dairy & Animal Science']",,0.0,0.0,0,,...,,,,,,,0.054905,0.011799,0.043105,0.066704
398967,3319329,The concentrations and distribution of natural...,Radioactivity concentrations and dose assessme...,,"['Environmental Sciences; Public, Environmenta...",,0.0,0.0,0,,...,,,,,,,0.067423,0.012523,0.054900,0.079947
398968,2342937,An important factor influencing food quality a...,Fusarium mycotoxins in oats,oat; mycotoxin; Fusarium spp.,['Food Science & Technology'],,0.0,0.0,0,,...,,,,,,,0.144035,0.032600,0.111435,0.176635
398969,1921295,"The Argens upper watershed, upstream to the Va...",LANDSCAPES AND PROBLEMS OF THE HIGH BASIN OF T...,karst; springs; streams; hydochemistry; traver...,"['Geosciences, Multidisciplinary']",,0.0,0.0,0,,...,,,,,,,0.055378,0.008848,0.046530,0.064226
