# GtR Topic Classifier

## Preamble

In [None]:
# Run the shared preamble script. Names used below without a visible import
# (pd, np, plt, os, ext_data_path) presumably come from it — verify.
%run notebook_preamble.ipy

# Raise the pandas 'max_columns' option to 99.
pd.set_option('max_columns', 99)

In [None]:
import ast
import seaborn as sns
from itertools import chain
from collections import Counter, defaultdict
import itertools

from eu_funding.visualization.visualize import pdf_cdf
# from src.visualization.visualize import pdf_cdf

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

import warnings

warnings.simplefilter('ignore', UserWarning)

In [None]:
from nesta.packages.nlp_utils import preprocess

In [None]:
# Columns whose CSV cells hold Python list literals; they are parsed back into
# real lists with ast.literal_eval while the file is read.
list_cols = ['research_topics', 'research_subjects']

gtr_projects_path = os.path.join(ext_data_path, 'gtr', 'gtr_projects.csv')
list_converters = dict.fromkeys(list_cols, ast.literal_eval)
gtr_projects_df = pd.read_csv(gtr_projects_path, converters=list_converters)

In [None]:
gtr_projects_df.head()

In [None]:
research_subject_counter = Counter(chain(*gtr_projects_df['research_subjects']))
research_topic_counter = Counter(chain(*gtr_projects_df['research_topics']))

In [None]:
print('There are {} unique research subjects in the GtR projects dataset.'.format(len(research_subject_counter)))

In [None]:
research_subject_counter.most_common(40)

In [None]:
combos = list(chain(*[sorted(itertools.combinations(d, 2)) for d in gtr_projects_df['research_topics']]))

In [None]:
research_topic_edge_counter = Counter(combos)

In [None]:
total_research_topics = len(list(chain(*gtr_projects_df['research_topics'])))

In [None]:
def association_strength(combo, occurrences, cooccurrences, total):
    """Return the association strength of a pair of items.

    The value is ``2 * total * cooccurrences[combo]`` divided by the product
    of the two items' individual occurrence counts.

    Args:
        combo: Pair ``(first, second)``. Used whole as the key into
            ``cooccurrences`` and element-wise as keys into ``occurrences``.
        occurrences: Mapping from item to its occurrence count.
        cooccurrences: Mapping from pair to its co-occurrence count.
        total: Scaling total used in the numerator.

    Returns:
        The association strength (true division, so a float).
    """
    pair_count = cooccurrences[combo]
    first, second = combo[0], combo[1]
    individual_product = occurrences[first] * occurrences[second]
    return (2 * total * pair_count) / individual_product

In [None]:
edges = set(combos)

In [None]:
assoc_strengths = [association_strength(
    edge,
    research_topic_counter, 
    research_topic_edge_counter, 
    total_research_topics) for edge in edges]

In [None]:
plt.hist(np.log10(assoc_strengths), bins=100)
plt.show()

In [None]:
edge_df = pd.DataFrame()
edge_df['source'] = [e[0] for e in edges]
edge_df['target'] = [e[1] for e in edges]
edge_df['weight'] = np.log10(assoc_strengths)

In [None]:
import networkx as nx

In [None]:
# Build the topic graph from the edge table, carrying the 'weight' column
# along as an edge attribute.
g = nx.from_pandas_edgelist(edge_df, edge_attr='weight')

In [None]:
import community

In [None]:
#Extract the best partition
# `part` is used below as a mapping from node (topic) to community id.
# NOTE(review): no seed is passed here, so the community ids presumably differ
# between runs — confirm, because a later cell maps ids to hand-written names.
part = community.best_partition(g, resolution=0.8)

In [None]:
# Draw the topic graph, giving the nodes of each community their own grey
# level (a string such as '0.25' is passed as the node colour).
community_ids = set(part.values())
size = float(len(community_ids))
pos = nx.spring_layout(g)
for count, com in enumerate(community_ids, start=1):
    list_nodes = [node for node, node_com in part.items() if node_com == com]
    nx.draw_networkx_nodes(g, pos, list_nodes, node_size=20,
                           node_color=str(count / size))

nx.draw_networkx_edges(g, pos, alpha=0.5)
plt.show()

In [None]:
# Print the topics of each community as one space-separated line per community.
pd.Series(part).reset_index(drop=False).groupby(0)['index'].apply(lambda x: print(' '.join(list(x))+'\n'))

In [None]:
# Hand-written names for the community ids found in `part`.
# NOTE(review): presumably written against the output of one particular
# partition run — confirm the ids still mean the same after re-running.
category_name_lookup = {
    0: 'particle_astro_phys',
    1: 'law_international_politics',
    2: 'phys_chem_eng',
    3: 'humanities',
    4: 'psychology_education',
    5: 'biological_sciences',
    6: 'environmental_sciences',
    7: 'linguistics'
}

# Map every topic to the name of its community. Raises KeyError if `part`
# contains a community id that is not a key of category_name_lookup.
topic_discipline_lookup = {top:category_name_lookup[disc] for top,disc in part.items()}

In [None]:
# Translate each project's topics into discipline names. A topic missing from
# topic_discipline_lookup (for instance one that never co-occurs with another
# topic and therefore never became a graph node) is skipped instead of
# aborting the whole column with a KeyError.
gtr_projects_df['discipline'] = gtr_projects_df['research_topics'].apply(
    lambda x: [topic_discipline_lookup[val] for val in x if val in topic_discipline_lookup])

# Distinct disciplines per project.
gtr_projects_df['discipline_sets'] = [set(x) for x in gtr_projects_df['discipline']]

# True: exactly one discipline; False: more than one; NaN: no discipline.
gtr_projects_df['single_disc'] = [
    True if len(x) == 1 else np.nan if len(x) == 0 else False
    for x in gtr_projects_df['discipline_sets']]

gtr_projects_df['single_disc'].mean()

In [None]:
# Projects funded by 'MRC' get the single discipline 'medical_sciences',
# replacing whatever was derived from their topics; all others are unchanged.
funder_names = gtr_projects_df['funder_name']
derived_sets = gtr_projects_df['discipline_sets']
gtr_projects_df['discipline_sets'] = [
    {'medical_sciences'} if funder == 'MRC' else disciplines
    for funder, disciplines in zip(funder_names, derived_sets)
]

In [None]:
def modal_value(l):
    """Return the most frequent element of ``l``, or NaN when ``l`` is empty.

    Args:
        l: Iterable of hashable items.

    Returns:
        The first entry reported by ``Counter.most_common``, or ``np.nan``
        when there are no elements to count.
    """
    c = Counter(l)
    # An empty Counter has no most-common entry. That is the only case the NaN
    # fallback is meant for, so test for it directly instead of swallowing
    # every exception with a bare ``except``.
    if not c:
        return np.nan
    return c.most_common(1)[0][0]

gtr_projects_df['modal_discipline'] = [modal_value(d) for d in gtr_projects_df['discipline_sets']]

In [None]:
gtr_projects_df['modal_discipline'].value_counts()

In [None]:
gtr_projects_df = gtr_projects_df[~pd.isnull(gtr_projects_df['abstract_texts'])]

In [None]:
gtr_projects_df = gtr_projects_df[gtr_projects_df['abstract_texts'].str.len() > 250]

In [None]:
abstracts_tokenised = [preprocess.tokenize_document(a) for a in gtr_projects_df['abstract_texts']]

In [None]:
abstracts_tokenised = [list(chain(*a)) for a in abstracts_tokenised]

In [None]:
from gensim.corpora import Dictionary

In [None]:
# gensim Dictionary built from the tokenised abstracts; used below both to
# produce the bag-of-words corpus and as the LDA model's id2word.
dictionary = Dictionary(abstracts_tokenised)

In [None]:
# Indicator columns for the modal discipline.
# NOTE(review): the same expression is assigned to `target` further down and
# `target_ohe` is not referenced again in the visible code — possibly unused.
target_ohe = pd.get_dummies(gtr_projects_df['modal_discipline'])

In [None]:
from gensim.models.ldamodel import LdaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
abstracts_bow = [dictionary.doc2bow(a) for a in abstracts_tokenised]

In [None]:
lda = LdaModel(
    abstracts_bow, 
    id2word=dictionary,
    num_topics=50,
)

In [None]:
lda_vecs = np.zeros((len(abstracts_tokenised), lda.num_topics))

In [None]:
for i, abstract in enumerate(abstracts_bow):
    for j, prob in lda[abstract]:
        lda_vecs[i][j] = prob

In [None]:
# Random forest created with no arguments (library defaults).
rf = RandomForestClassifier()

In [None]:
# One indicator column per modal discipline; this is the classification target.
target = pd.get_dummies(gtr_projects_df['modal_discipline'])

In [None]:
# 90/10 train/test split of the LDA vectors.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs — confirm if reproducibility matters.
X_train, X_test, y_train, y_test = train_test_split(lda_vecs, target, train_size=0.9, test_size=0.1)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Candidate values for C; `params` is not referenced again in the visible code.
params = {'C': [0.01, 0.03, 0.1, 0.3, 1, 3]}

# One-vs-rest wrapper around a logistic regression created with no arguments.
lr = OneVsRestClassifier(estimator=LogisticRegression())

In [None]:
lr.fit(X_train, y_train)

In [None]:
# y_pred is overwritten here with the logistic-regression predictions.
y_pred = lr.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
class TextClassification():
    '''
    Count-vectorise a corpus and grid-search classifiers on it.

    The corpus can be a list of strings or a list of tokenised documents, and
    the target can have one column or several.

    On initialisation the corpus is vectorised with sklearn's CountVectorizer.

    The grid_search method takes a list of models and parameter grids, fits a
    GridSearchCV for each of them and stores the fitted searches for diagnosis.

    '''

    def __init__(self,corpus,target):
        '''
        Store the target, detect the problem type and vectorise the corpus.

        Args:
            corpus: list of strings or list of token lists. Strings are run
                through CleanTokenize first; token lists are used as they are.
            target: label array. Two-dimensional with more than one column is
                treated as 'multiclass'; anything else (one column, 1-D, or an
                object without a `shape`) is treated as 'single_class'.

        Sets self.mode, self.Y, self.X (the count matrix) and self.count_vect
        (the fitted vectoriser, kept for transforming new data later).

        '''

        #Is this a multiclass classification problem or a single class classification problem?
        #A 1-D target has no second dimension, so look at the shape defensively
        #instead of indexing shape[1] directly (which raises on 1-D input).
        target_shape = getattr(target,'shape',())
        if len(target_shape)>1 and target_shape[1]>1:
            self.mode = 'multiclass'
        else:
            self.mode = 'single_class'

        #Store the target
        self.Y = target

        #Did we feed the model a bunch of strings or a list of tokenised docs?
        #If the former, we clean and tokenise them first.
        if isinstance(corpus[0],str):
            #NB CleanTokenize is not defined in this class; it has to be available in the enclosing namespace.
            corpus = CleanTokenize(corpus).clean().tokenised

        #Turn every list of tokens into a string for count vectorising
        corpus_string = [' '.join(words) for words in corpus]

        #Fit the vectoriser (English stop words removed, terms in fewer than 5 documents dropped)
        count_vect = CountVectorizer(stop_words='english',min_df=5).fit(corpus_string)

        #Store the features
        self.X = count_vect.transform(corpus_string)

        #Store the count vectoriser (we will use it later on for prediction on new data)
        self.count_vect = count_vect

    def grid_search(self,models):
        '''
        Run a grid search with cross-validation for every model.

        Args:
            models: sequence of [estimator, parameter_grid] pairs. The sequence
                and its pairs are not modified.

        In 'multiclass' mode every estimator is wrapped in a
        OneVsRestClassifier and its parameter names get the 'estimator__'
        prefix that the wrapper requires.

        Returns:
            self, with the fitted GridSearchCV objects stored in self.results
            (same order as `models`).

        '''

        #Load inputs and targets into the model
        Y = self.Y
        X = self.X

        if self.mode=='multiclass':
            #Build wrapped copies rather than editing `models` in place:
            #in-place editing would wrap and prefix a second time if the same
            #list were passed to grid_search again.
            candidates = [
                [OneVsRestClassifier(mod[0]),
                 {'estimator__'+k:v for k,v in mod[1].items()}]
                for mod in models]
        else:
            candidates = [[mod[0],mod[1]] for mod in models]

        #Container with results
        results = []

        #For each model, run the analysis.
        for num,mod in enumerate(candidates):
            #Progress indicator
            print(num)

            #Run the classifier
            clf = GridSearchCV(mod[0],mod[1])

            #Fit
            clf.fit(X,Y)

            #Append results
            results.append(clf)

        self.results = results
        return(self)

    
#Class to visualise the outputs of multilabel models.

#I call it OrangeBrick after YellowBrick, the package for ML output visualisation 
#(which currently doesn't support multilabel classification)


class OrangeBrick():
    '''
    Visualise the per-class results of a multilabel classification exercise.

    It takes the true and the predicted labels (one column per class) and the
    class names, and produces some charts visualising findings.

    The methods include:

        .make_metrics: computes the confusion counts and precision/recall for every class
        .confusion_chart: creates a stacked barchart with the confusion counts by class, sorting classes by error rate
        .prec_rec_chart: creates a barchart showing each class precision and recall
        #Tobe done: Consider mixes between classes?

    '''

    def __init__(self,true_labels,predicted_labels,var_names):
        '''
        Initialise with the true labels, the predicted labels and the variable names
        '''

        self.true_labels = true_labels
        self.predicted_labels = predicted_labels
        self.var_names = var_names

    def make_metrics(self):
        '''
        Estimates performance metrics (for now just confusion counts by class
        and precision/recall scores for the predicted labels as given).

        Labels are expected to be 0/1 indicators with one column per class.
        Stores a list of [confusion counts, precision/recall] pairs, one per
        class, in self.score_store and returns self.

        '''
        #NB the predictions are passed to confusion_matrix FIRST, so in `cf`
        #the rows index the predicted class and the columns the ground truth:
            #cf[0,0]-> TN
            #cf[1,1]-> TP
            #cf[0,1]-> FN (prediction is false, groundtruth is true)
            #cf[1,0]-> FP (prediction is true, ground truth is false)

        #Predictions and true labels. Coerced to arrays so that the positional
        #column slicing below also works when DataFrames are passed in.
        true_labels = np.asarray(self.true_labels)
        pred_labels = np.asarray(self.predicted_labels)

        #Variable names
        var_names = self.var_names

        #Store confusion matrices
        score_store = []

        for num,var_name in enumerate(var_names):

            #This is the confusion matrix. labels=[0,1] forces a 2x2 result even
            #when a class column contains a single value only; without it the
            #matrix would be 1x1 and the lookups below would fail.
            cf = confusion_matrix(pred_labels[:,num],true_labels[:,num],labels=[0,1])

            #Confusion counts, ordered to separate correct from failed predictions
            melt_cf = pd.Series(
                [cf[1,1],cf[0,0],cf[1,0],cf[0,1]],
                index=['true_positive','true_negative','false_positive','false_negative'],
                name=var_name)

            #We are also interested in precision and recall
            #(these are NaN when the denominator is zero)
            prec = cf[1,1]/(cf[1,1]+cf[1,0])
            rec = cf[1,1]/(cf[1,1]+cf[0,1])

            prec_rec = pd.Series([prec,rec],index=['precision','recall'])
            prec_rec.name = var_name
            score_store.append([melt_cf,prec_rec])

        self.score_store = score_store

        return(self)

    def confusion_chart(self,ax):
        '''
        Plot the confusion counts as a stacked barchart on `ax`, one bar per
        class, sorted by decreasing share of failed predictions.

        Needs make_metrics to have been run first.

        '''

        #One column per class, one row per outcome
        #(axis must be passed by keyword: recent pandas rejects it positionally)
        cf_df = pd.concat([x[0] for x in self.score_store],axis=1)

        #This ranks categories by the error rates
        outcome_shares = cf_df.apply(lambda x: x/x.sum(),axis=0)
        is_failure = ['false' in x for x in cf_df.index]
        failure_rate = outcome_shares.loc[is_failure].sum().sort_values(
            ascending=False).index

        #Plot and add labels
        cf_df.T.loc[failure_rate,:].plot.bar(stacked=True,ax=ax,width=0.8,cmap='Accent')

        ax.legend(bbox_to_anchor=(1.01,1))
        #ax.set_title('Stacked confusion matrix for disease areas',size=16)

    def prec_rec_chart(self,ax):
        '''
        Plot a precision-recall barchart on `ax`, one group of bars per class,
        sorted by precision.

        Needs make_metrics to have been run first.

        '''

        #Again, we sort them here to assess model performance in different classes
        #(axis must be passed by keyword: recent pandas rejects it positionally)
        prec_rec = pd.concat([x[1] for x in self.score_store],axis=1).T.sort_values('precision')
        prec_rec.plot.bar(ax=ax)

        #Add legend and title
        ax.legend(bbox_to_anchor=(1.01,1))
        #ax.set_title('Precision and Recall by disease area',size=16)

In [None]:
tc = TextClassification(abstracts_tokenised, target)

In [None]:
# Candidate estimators, each paired with the hyper-parameter grid to search.
models = [
    [RandomForestClassifier(),
     {'class_weight':['balanced',None],'min_samples_leaf':[1,5]}],

    # The grid below includes the l1 penalty, which the default 'lbfgs' solver
    # does not support; 'liblinear' handles both l1 and l2, so every grid
    # point can actually be fitted.
    [LogisticRegression(solver='liblinear'),
     {'class_weight':['balanced',None],'penalty':['l1','l2'],
      'C':[0.1,1,100]}]]

In [None]:
tc.grid_search(models)

In [None]:
# Report the best cross-validated score and estimator of every grid search.
for res in tc.results:
    print(res.best_score_)
    print(res.best_estimator_)

#This is the best estimator
# (taken from the second grid search, i.e. index 1 of tc.results)
best_est = tc.results[1].best_estimator_

In [None]:
import pickle

In [None]:
with open(os.path.join('../models/gtr_text_models.p'), 'rb') as f:
    model = pickle.load(f)

In [None]:
c_vecs = model[0].transform([' '.join(a) for a in abstracts_tokenised])

In [None]:
preds = model[1].predict(c_vecs)

In [None]:
preds

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer()
tfidf_vecs = tfidf.fit_transform([' '.join(a) for a in abstracts_tokenised])

In [None]:
from sklearn.decomposition import TruncatedSVD, PCA

In [None]:
svd = TruncatedSVD(n_components=100)
svd_vecs = svd.fit_transform(tfidf_vecs)

In [None]:
# 90/10 split of the SVD vectors; this overwrites the earlier split variables
# that were built from the LDA vectors.
X_train, X_test, y_train, y_test = train_test_split(svd_vecs, target, train_size=0.9, test_size=0.1)

In [None]:
# Refit the random forest object created earlier, now on the SVD features.
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
lr.fit