# GtR Topic Classifier

## Preamble

In [None]:
%run notebook_preamble.ipy

# Show up to 99 columns when displaying DataFrames. The fully qualified option
# key is used because the bare 'max_columns' pattern can match more than one
# option in recent pandas versions and is then rejected as ambiguous.
pd.set_option('display.max_columns', 99)

In [None]:
import ast
import seaborn as sns
from itertools import chain
from collections import Counter, defaultdict
import itertools

from eu_funding.visualization.visualize import pdf_cdf
# from src.visualization.visualize import pdf_cdf

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

from sklearn.feature_selection import chi2

import networkx as nx
import community

import warnings

warnings.simplefilter('ignore', UserWarning)

In [None]:
from nesta.packages.nlp_utils import preprocess

In [None]:
list_cols = ['research_topics', 'research_subjects']

gtr_projects_df = pd.read_csv(
    os.path.join(ext_data_path, 'gtr', 'gtr_projects.csv'),
    converters={k: ast.literal_eval for k in list_cols}
)

In [None]:
gtr_projects_df.head()

In [None]:
# Tally how often each research subject / research topic label occurs across
# all projects (each project contributes every label in its list).
research_subject_counter = Counter(
    chain.from_iterable(gtr_projects_df['research_subjects'])
)
research_topic_counter = Counter(
    chain.from_iterable(gtr_projects_df['research_topics'])
)

In [None]:
# Report the number of distinct labels of each kind.
n_unique_subjects = len(research_subject_counter)
n_unique_topics = len(research_topic_counter)
print(f'There are {n_unique_subjects} unique research subjects in the GtR projects dataset.')
print(f'There are {n_unique_topics} unique research topics in the GtR projects dataset.')

In [None]:
research_subject_counter.most_common(40)

### Field Definition Through Community Detection

In [None]:
# Every unordered pair of topics that co-occur on a project, over all projects.
# Topics are sorted *within* each project before pairing so that a pair is
# always emitted in the same orientation; otherwise ('a', 'b') and ('b', 'a')
# would be counted as two different edges whenever projects list the same
# topics in a different order.
combos = [
    pair
    for topics in gtr_projects_df['research_topics']
    for pair in itertools.combinations(sorted(topics), 2)
]

In [None]:
research_topic_edge_counter = Counter(combos)

In [None]:
# Total number of topic labels over all projects (repeats across projects included).
total_research_topics = sum(
    len(topics) for topics in gtr_projects_df['research_topics']
)

In [None]:
def association_strength(combo, occurrences, cooccurrences, total):
    """Return the association strength of a pair of items.

    Computed as ``2 * total * cooccurrences[combo]`` divided by the product of
    the two items' individual occurrence counts.

    Args:
        combo: Pair whose first two elements are the items; also used as the
            key into ``cooccurrences``.
        occurrences: Mapping from item to its occurrence count.
        cooccurrences: Mapping from pair to its co-occurrence count.
        total: Scaling total used in the numerator.
    """
    first, second = combo[0], combo[1]
    numerator = 2 * total * cooccurrences[combo]
    denominator = occurrences[first] * occurrences[second]
    return numerator / denominator

In [None]:
edges = set(combos)

In [None]:
assoc_strengths = [association_strength(
    edge,
    research_topic_counter, 
    research_topic_edge_counter, 
    total_research_topics) for edge in edges]

In [None]:
plt.hist(np.log10(assoc_strengths), bins=100)
plt.show()

In [None]:
# Edge table for the topic graph: one row per distinct topic pair, weighted by
# the base-10 log of its association strength.
edge_df = pd.DataFrame({
    'source': [src for src, _ in edges],
    'target': [dst for _, dst in edges],
    'weight': np.log10(assoc_strengths),
})
g = nx.from_pandas_edgelist(edge_df, edge_attr='weight')

In [None]:
class CommunityPartition:
    """Build a co-membership graph from repeated community-detection runs."""

    def __init__(self, graph):
        # Graph that is passed to `community.best_partition` on every run.
        self.graph = graph

    def edgelist_to_cooccurrence(self, repeats, **best_partition_kwargs):
        """Count how often each pair of nodes lands in the same community.

        Runs `community.best_partition` on the stored graph `repeats` times
        (forwarding `best_partition_kwargs` each time) and returns a new
        `nx.Graph` whose edge weights are the number of runs in which the two
        endpoints shared a community. Pairs that never share a community get
        no edge.
        """
        pair_counts = Counter()
        for _ in range(repeats):
            run_partition = community.best_partition(self.graph, **best_partition_kwargs)
            pair_counts.update(self.partition_to_edgelist(run_partition))

        weighted_edges = [
            (first, second, n_runs)
            for (first, second), n_runs in pair_counts.items()
        ]
        cooccurrence_graph = nx.Graph()
        cooccurrence_graph.add_weighted_edges_from(weighted_edges)
        return cooccurrence_graph

    def partition_to_edgelist(self, partition):
        """Return all within-community node pairs of a node -> community mapping.

        Each pair is a tuple with its two nodes in sorted order, so the same
        pair is always represented the same way.
        """
        members_by_community = self.reverse_index_partition(partition)
        return [
            tuple(sorted(pair))
            for members in members_by_community.values()
            for pair in itertools.combinations(members, 2)
        ]

    def reverse_index_partition(self, partition):
        """Invert a node -> community mapping into community -> list of nodes."""
        members_by_community = defaultdict(list)
        for node, community_id in partition.items():
            members_by_community[community_id].append(node)
        return members_by_community

In [None]:
cp = CommunityPartition(g)

In [None]:
co = cp.edgelist_to_cooccurrence(3, resolution=.4)

In [None]:
nx.draw(co)

In [None]:
#Extract the best partition
part = community.best_partition(co, resolution=0.3, random_state=0, weight='weight')

In [None]:
set(part.values())

In [None]:
# Draw the co-occurrence graph, giving each community its own grey level.
community_ids = set(part.values())
size = float(len(community_ids))
pos = nx.spring_layout(co)
count = 0.
for com in community_ids:
    count += 1.
    list_nodes = [node for node, node_com in part.items() if node_com == com]
    # A numeric string is passed as the colour, one value per community.
    grey_level = str(count / size)
    nx.draw_networkx_nodes(
        co, pos, list_nodes, node_size=20, node_color=grey_level
    )

nx.draw_networkx_edges(co, pos, alpha=0.5)
plt.show()

In [None]:
# Print the members of each community, one comma-separated line per community.
# After reset_index the values of `part` (community ids) sit in column 0 and
# its keys in 'index'. The lambda only prints, so it returns None per group.
pd.Series(part).reset_index(drop=False).groupby(0)['index'].apply(lambda x: print(', '.join(list(x))+'\n'))

In [None]:
# Hand-assigned discipline name for each community id that appears in `part`.
# NOTE(review): ids 0-12 are hard-coded; presumably they were read off one
# particular partition — confirm they still match the current `part`, since a
# community id missing from this dict raises KeyError in the lookup below.
category_name_lookup = {
    0: 'social_science',
    1: 'social_science',
    2: 'arts_humanities',
    3: 'social_science',
    4: 'arts_humanities',
    5: 'biological',
    6: 'engineering',
    7: 'engineering',
    8: 'maths_computing',
    9: 'physical_sciences',
    10: 'arts_humanities',
    11: 'social_science',
    12: 'physical_sciences',
}

# Discipline name for every key of `part`, via the community it was assigned to.
topic_discipline_lookup = {top:category_name_lookup[disc] for top,disc in part.items()}

In [None]:
# Translate each project's topics into their disciplines (one entry per topic).
gtr_projects_df['discipline'] = gtr_projects_df['research_topics'].apply(
    lambda topics: [topic_discipline_lookup[topic] for topic in topics]
)

# Distinct disciplines per project.
gtr_projects_df['discipline_sets'] = list(map(set, gtr_projects_df['discipline']))

# True for exactly one discipline, False for several, NaN when there are none.
single_disc_flags = []
for disciplines in gtr_projects_df['discipline_sets']:
    if len(disciplines) == 0:
        single_disc_flags.append(np.nan)
    else:
        single_disc_flags.append(len(disciplines) == 1)
gtr_projects_df['single_disc'] = single_disc_flags

gtr_projects_df['single_disc'].mean()

In [None]:
# Projects whose funder is 'MRC' are labelled 'medical_sciences' only,
# replacing whatever disciplines were derived from their topics.
gtr_projects_df['discipline_sets'] = [
    {'medical_sciences'} if funder == 'MRC' else disciplines
    for funder, disciplines in zip(
        gtr_projects_df['funder_name'],
        gtr_projects_df['discipline_sets'],
    )
]

In [None]:
def modal_value(l):
    """Return the most frequent element of `l`, or NaN when `l` is empty.

    Args:
        l: Iterable of hashable items.

    Returns:
        The item with the highest count, as chosen by `Counter.most_common`,
        or `np.nan` if `l` contains no items.
    """
    counts = Counter(l)
    # Handle the empty case explicitly rather than with a bare `except`,
    # which would also hide unrelated errors and interrupts.
    if not counts:
        return np.nan
    return counts.most_common(1)[0][0]

gtr_projects_df['modal_discipline'] = [modal_value(d) for d in gtr_projects_df['discipline_sets']]

In [None]:
gtr_projects_df['modal_discipline'].value_counts()

In [None]:
Counter(chain(*gtr_projects_df['discipline_sets'])).most_common()

In [None]:
n_labels = [True if len(s) > 0 else False for s in gtr_projects_df['discipline_sets']]

In [None]:
# Drop projects that have no abstract at all.
has_abstract = gtr_projects_df['abstract_texts'].notnull()
gtr_projects_df = gtr_projects_df[has_abstract]
# Drop projects whose abstract is 250 characters or shorter.
long_enough = gtr_projects_df['abstract_texts'].str.len() > 250
gtr_projects_df = gtr_projects_df[long_enough]
# Drop projects that ended up with an empty set of discipline labels.
n_labels = [len(s) > 0 for s in gtr_projects_df['discipline_sets']]
gtr_projects_df = gtr_projects_df[n_labels]

In [None]:
import spacy

In [None]:
nlp = spacy.load('en')
nlp.remove_pipe('parser')
nlp.remove_pipe('ner')

In [None]:
with open(os.path.join(raw_data_path, 'stopwords_en_long.txt'), 'r') as f:
    stopwords = f.read().splitlines()

In [None]:
def remove_markup(text):
    """Strip simple HTML markup and mask numbers in `text`.

    Each listed tag, the `&nbsp;` entity and each newline/tab/carriage-return
    character is replaced by a single space, then every run of digits is
    replaced by the literal ``XXX``.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    markup = (
        '<b>', '<p>', '&nbsp;', '<li>', '<ol>', '<ul>', '<br>',
        '</b>', '</p>', '</li>', '</ol>', '</ul>', '</br>',
        '\n', '\t', '\r',
    )
    for tag in markup:
        text = text.replace(tag, ' ')
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    return re.sub(r"\d+", "XXX", text)

In [None]:
abstracts = [remove_markup(a) for a in gtr_projects_df['abstract_texts']]

In [None]:
for stopword in stopwords:
    nlp.vocab[stopword.lower()].is_stop = True
    nlp.vocab[stopword.upper()].is_stop = True
    nlp.vocab[stopword.title()].is_stop = True

In [None]:
# Run the spaCy pipeline over all cleaned abstracts. `nlp.pipe` processes the
# texts as a stream in batches instead of one call per text, and yields the
# resulting Docs in the same order as the input.
abstract_docs = list(nlp.pipe(abstracts))

In [None]:
def tokenize(doc):
    """Convert a sequence of tokens into a list of lemma+POS strings.

    A token is dropped if it is shorter than three characters or is flagged
    as a stop word, number-like, digit, punctuation or URL-like. Each kept
    token becomes its lemma immediately followed by its upper-cased
    part-of-speech tag.

    Args:
        doc: Iterable of tokens exposing `is_stop`, `like_num`, `is_digit`,
            `is_punct`, `like_url`, `pos_`, `lemma_` and `len()`.

    Returns:
        List of strings, one per kept token, in document order.
    """
    def _keep(tok):
        # Length is checked first, then the token's boolean flags.
        if len(tok) < 3:
            return False
        return not (
            tok.is_stop
            or tok.like_num
            or tok.is_digit
            or tok.is_punct
            or tok.like_url
        )

    return [f'{tok.lemma_}{tok.pos_.upper()}' for tok in doc if _keep(tok)]

In [None]:
abstracts_tokenized = [tokenize(doc) for doc in abstract_docs]

In [None]:
bigrams = Phrases(abstracts_tokenized, delimiter=b'x')
bigrammer = Phraser(bigrams)
abstracts_bigrammed = bigrammer[abstracts_tokenized]
# dictionary = Dictionary(abstracts_bigrammed)

In [None]:
abstracts_str = [' '.join(d) for d in abstracts_bigrammed]
tfidf = TfidfVectorizer(
#     max_df=0.5, 
    min_df=5, 
    sublinear_tf=True, 
    norm='l2'
)
tfidf_vecs = tfidf.fit_transform(abstracts_str)

In [None]:
# Binarize the per-project discipline sets into one 0/1 column per discipline.
# The class list is sorted so the column order is the same in every session;
# iterating a set of strings directly gives an order that can change between
# interpreter runs.
classes = sorted(set(chain.from_iterable(gtr_projects_df['discipline_sets'])))
mlb = MultiLabelBinarizer(classes=classes)
target_binarized = mlb.fit_transform(gtr_projects_df['discipline_sets'])
target_binarized_df = pd.DataFrame(target_binarized, columns=mlb.classes_)

In [None]:
# 80/20 hold-out split of the unfiltered TF-IDF vectors and label dataframe.
# A fixed seed keeps the split, and anything fitted on its training part,
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vecs, target_binarized_df, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# For every discipline, keep the terms whose chi2 score against that
# discipline's label lies above the 90th percentile of the non-NaN scores.
feature_terms = []
indices = np.arange(X_train.shape[1])
# The vocabulary does not change inside the loop, so build the array once.
feature_names = np.array(tfidf.get_feature_names())
for discipline in y_train.columns:
    features_chi2 = chi2(X_train, y_train[discipline])[0]
    # NaN scores are left out of the percentile; they also fail the `>` test below.
    threshold = np.percentile(features_chi2[~pd.isnull(features_chi2)], 90)
    discipline_indices = indices[features_chi2 > threshold]
    feature_terms.extend(feature_names[discipline_indices])

In [None]:
tfidf_stop_words = set(tfidf.get_feature_names()).difference(set(feature_terms))

In [None]:
tfidf = TfidfVectorizer(
#     max_df=0.5, 
    min_df=5, 
    sublinear_tf=True, 
    norm='l2',
    stop_words=tfidf_stop_words
)
tfidf_vecs_filt = tfidf.fit_transform(abstracts_str)

In [None]:
# 80/20 hold-out split of the vocabulary-filtered TF-IDF vectors and the
# binarized label array. A fixed seed keeps the evaluation reproducible.
# NOTE(review): the chi2 vocabulary filter was fitted on the training part of
# an earlier split; this test set is only unseen by that filter if both splits
# select the same rows — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vecs_filt, target_binarized, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
from sklearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.ensemble import VotingClassifier

In [None]:
pipe_mnb = make_pipeline_imb(
                 MultinomialNB()
)

rf = RandomForestClassifier(n_jobs=3)

pipe_lr = make_pipeline_imb(
    LogisticRegression(n_jobs=3)
)

In [None]:
# Evaluate each candidate model as a series of independent binary problems,
# one per discipline column of the binarized target. The loop previously
# referred to a `clf` that is not defined in any visible cell, while the
# models built above were never used.
candidate_models = {
    'multinomial_nb': pipe_mnb,
    'random_forest': rf,
    'logistic_regression': pipe_lr,
}
for model_name, clf in candidate_models.items():
    print(model_name)
    for i in range(y_test.shape[1]):
        print(mlb.classes_[i])
        # Refit on this discipline's column, then report on the held-out rows.
        clf.fit(X_train, y_train[:, i])
        print(classification_report(y_test[:, i], clf.predict(X_test)))