# criticism_detection.ipynb

### 1 ###
•	It analyses papers citing another paper that has been retracted.
•	It takes the papers that triggered the retraction and classifies them as highly_critical.
•	Papers published a year before the first highly_critical paper are classified as non_critical.
•	Papers published after the highly_critical papers are classified as critical (they are not used in the analysis later on because they may contain mixed criticism).
•	The publication dates of the papers are taken from the “PMC-ids.csv.gz” file, available through the PMC FTP service:
https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/

### 2 ###
•	It takes all the sentences citing a particular DOI in all the papers, regardless of the section they appear in.
•	It shows that algorithms such as PCA and TSNE fail when trying to cluster the sentences from non_critical and high_critical papers using just the vader lexicon.

### 3 ###
•	It gets all the sentences in the high_critical papers and a similar number of sentences from the non_critical papers. It finds the words and bigrams that are used in the high_critical set but not in the non_critical set.


In [None]:
%matplotlib inline

In [None]:
import sys
import re
import os
import numpy as np
import pandas as pd
import nltk
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

sys.path.insert(0, "./modules")
import words_frec_analysis_get_sentence

# Configuration

In [None]:
# Root directory that holds the input files and the 'analysis' sub-directory.
data_path = 'data'

# Data-set name; used as the file-name prefix of the sections table below.
ds_name = 'DOI_cited_science_1179052_retracted'

# The paper was retracted based on these DOIs.
# NOTE(review): '10.1371/journal.pone.0008519' appeared twice in the original
# list; the duplicate is dropped here. Confirm no other DOI was intended.
DOI_high_critical = [
    '10.1186/1742-4690-7-63',
    '10.1371/journal.pone.0008519',
    '10.1136/bmj.c1018',
    '10.1186/1742-4690-7-10',
    '10.1186/1743-422X-7-224',
]

analysis_path = os.path.join(data_path, 'analysis')

# Input files
citing_sections_tsv = os.path.join(analysis_path, '%s_sections.tsv' % ds_name)
pmc_ids_csv = os.path.join(data_path, 'PMC_ids/PMC-ids.csv.gz')

# Setup

In [None]:
# Fetch the VADER lexicon (sentiment analysis data) with NLTK's downloader.
# NOTE(review): no code visible in this notebook reads the lexicon afterwards;
# confirm the download is still required.
nltk.download('vader_lexicon') 

### 1 ###

•	It analyses papers citing another paper that has been retracted.
•	It takes the papers that triggered the retraction and classifies them as highly_critical.
•	Papers published a year before the first highly_critical paper are classified as non_critical.
•	Papers published after the highly_critical papers are classified as critical (they are not used in the analysis later on because they may contain mixed criticism).
•	The publication dates of the papers are taken from the “PMC-ids.csv.gz” file, available through the PMC FTP service:
https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/

In [None]:
# Load the per-section citation table (tab-separated, UTF-8).
df = pd.read_csv(citing_sections_tsv, sep='\t', encoding='utf-8')

# Columns of this table, as listed by the original author:
#   cited_DOI, citing_DOI, reference_id,
#   cited_in_conclusions, cited_in_discussion,
#   cited_in_introduction, cited_in_maintext,
#   conclusions_found, discussion_found,
#   introduction_found, maintext_found,
#   sentence_citing_conclusions, sentence_citing_discussion,
#   sentence_citing_intro, sentence_citing_maintext


In [None]:
# Load the PMC id table (gzip-compressed CSV). low_memory=False avoids
# chunk-wise, possibly mixed, dtype inference while parsing.
df_ids = pd.read_csv(pmc_ids_csv, sep=',', encoding='utf-8', low_memory=False)

# Keep only the id records whose DOI is one of the citing papers in df.
df_ids_select = df_ids[df_ids['DOI'].isin(df.citing_DOI)]

# Preview the first rows of the selection (notebook display).
df_ids_select.head()

In [None]:
# Records with a 'Year' strictly below this value are treated as non critical;
# records with a 'Year' from this value onwards are treated as critical.
# NOTE(review): hard-coded for this data set; confirm it matches the
# publication dates of the papers listed in DOI_high_critical.
CRITICAL_FROM_YEAR = 2011

# Sentence columns that are stacked into the single 'text' column.
# NOTE(review): 'sentence_citing_maintext' is not part of the selection;
# confirm that this is intended.
SENTENCE_COLUMNS = (
    'sentence_citing_conclusions',
    'sentence_citing_discussion',
    'sentence_citing_intro',
)


def _collect_citing_sentences(papers, label):
    """Stack the citing sentences of *papers* into one labelled frame.

    Parameters
    ----------
    papers : pandas.DataFrame
        Frame holding 'citing_DOI' and every column in SENTENCE_COLUMNS.
    label : str
        Value written to the 'label' column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns 'citing_DOI', 'text' and 'label'. Rows with a missing value
        are dropped and the index is renumbered from 0.
    """
    parts = [
        papers[['citing_DOI', column]].rename(columns={column: 'text'})
        for column in SENTENCE_COLUMNS
    ]
    sentences = pd.concat(parts).dropna().reset_index(drop=True)
    sentences['label'] = label
    return sentences


# Split the PMC id records by membership in the list of papers the retraction
# was based on (DOI_high_critical) and by publication year.
is_high_critical = df_ids['DOI'].isin(DOI_high_critical)

df_ids_select_info_highCritical = df_ids[is_high_critical]
df_ids_select_nonCritical = df_ids[(df_ids['Year'] < CRITICAL_FROM_YEAR) & ~is_high_critical]
# The critical group also contains the highly critical papers themselves.
df_ids_select_Critical = df_ids[(df_ids['Year'] >= CRITICAL_FROM_YEAR) | is_high_critical]

# Rows of the citing-sections table that belong to each group.
df_highCritical = df[df['citing_DOI'].isin(df_ids_select_info_highCritical['DOI'])]
df_nonCritical = df[df['citing_DOI'].isin(df_ids_select_nonCritical['DOI'])]
df_Critical = df[df['citing_DOI'].isin(df_ids_select_Critical['DOI'])]

# One row per citing sentence. The label spellings are kept exactly as they
# are because the plotting cells select rows by these strings.
df_full_highCritical = _collect_citing_sentences(df_highCritical, 'high_critical')
df_full_nonCritical = _collect_citing_sentences(df_nonCritical, 'non_critizising')
df_full_Critical = _collect_citing_sentences(df_Critical, 'critizising')

# Frame used for the PCA / t-SNE plots: non-critical plus critical sentences.
df_full = pd.concat([df_full_nonCritical, df_full_Critical]).reset_index(drop=True)

In [None]:
# Report the shape of each working frame, one line per frame.
for caption, frame in (
    ("df_highCritical.shape; ", df_highCritical),
    ("df_Critical.shape; ", df_Critical),
    ("df_nonCritical.shape: ", df_nonCritical),
    ("df_full.shape: ", df_full),
):
    print(caption, frame.shape)

In [None]:
# Display the rows of the citing-sections table whose citing DOI is one of the highly critical papers.
df_highCritical

### 2 ###

•	It takes all the sentences citing a particular DOI in all the papers, regardless of the section they appear in.
•	It shows that algorithms such as PCA and TSNE fail when trying to cluster the sentences from non_critical and high_critical papers using just the vader lexicon.

Let's try to use the vader_lexicon together with PCA and TSNE to cluster criticising and non-criticising papers.

In [None]:
# Dimension reduction algorithms can be pretty slow on large inputs.
# NOTE(review): the original comment spoke of working with a sample, but this
# cell fits on all of df_full; confirm whether sampling was intended.

# Two hex colours, passed to scatter() as the per-label plot colours.
COLOURS = ['#E91D0E', '#00A6EF']


def scatter(x, label, selected_labels, selected_colors):
    """Draw one 2-D scatter layer per selected label and return the figure.

    Parameters
    ----------
    x : array
        Indexed as ``x[mask, :]``; columns 0 and 1 are plotted.
    label : array-like
        Compared element-wise with each selected label to build the row mask.
    selected_labels, selected_colors : iterables
        Paired with ``zip``; each pair gives one layer and its colour.

    Returns
    -------
    tuple
        ``(figure, axes)`` of the 8x8 plot with an equal aspect ratio.
    """
    figure = plt.figure(figsize=(8, 8))
    axes = plt.subplot(aspect='equal')
    for name, colour in zip(selected_labels, selected_colors):
        # Rows of x whose label equals the current name.
        points = x[(label == name), :]
        axes.scatter(points[:, 0], points[:, 1], c=colour, label=name, alpha=0.5)
    plt.legend()

    return figure, axes

# Represent every sentence as a TF-IDF vector (English stop words removed,
# at most 10000 features).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_vectors = tfidf_vectorizer.fit_transform(df_full['text'])

# Project the densified vectors onto two principal components.
# random_state makes the projection reproducible when scikit-learn selects a
# randomized SVD solver; the t-SNE cell already uses a fixed seed.
pca = PCA(n_components=2, random_state=42)
pca_comp = pca.fit_transform(tfidf_vectors.toarray())
scatter(pca_comp, df_full['label'], ['critizising', 'non_critizising'], COLOURS[:2])

In [None]:
# We have suggested some parameters below, feel free to experiment.
# scikit-learn requires the perplexity to be smaller than the number of
# samples, so the suggested value of 800 is capped for small data sets.
tsne = TSNE(perplexity=min(800, tfidf_vectors.shape[0] - 1), random_state=42)

tsne_comp = tsne.fit_transform(tfidf_vectors.toarray())

scatter(tsne_comp, df_full['label'], ['critizising', 'non_critizising'], COLOURS[:2])

As expected, PCA and TSNE using the vader lexicon fail.

### 3 ###
•	It gets all the sentences in the high_critical papers and a similar number of sentences from the non_critical papers. It finds the words and bigrams that are used in the high_critical set but not in the non_critical set.


Let's analyse the sentences where the paper is cited.

In [None]:
# Banner line printed before and after every report section.
_BANNER = "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"


def _print_report(title, items):
    """Print *items* under *title*, framed by two banner lines."""
    print(_BANNER)
    print(title)
    print(items)
    print(_BANNER)


# All the sentences from the papers the retraction is based on.
df_full_highCritical_sample = df_full_highCritical

sentences_highCritical = df_full_highCritical_sample.text.dropna()
frequent_words_highCritical = words_frec_analysis_get_sentence.analysis_nolimit(sentences_highCritical)

# Draw twice as many non-critical sentences as there are highly critical ones,
# capped at the number available: DataFrame.sample raises ValueError when it
# is asked for more rows than the frame holds (sampling without replacement).
n_nonCritical = min(2 * sentences_highCritical.shape[0], df_full_nonCritical.shape[0])
df_full_nonCritical_sample = df_full_nonCritical.sample(n_nonCritical, random_state=10)

sentences_nonCritical = df_full_nonCritical_sample.text.dropna()
frequent_words_nonCritical = words_frec_analysis_get_sentence.analysis_nolimit(sentences_nonCritical)

# Compare the two word analyses as sets.
set_words_nonCritical = set(frequent_words_nonCritical)
set_words_highCritical = set(frequent_words_highCritical)

words_only_highCritical = set_words_highCritical - set_words_nonCritical
_print_report("Set of words only found in the highCritical paragraphs", words_only_highCritical)

words_only_nonCritical = set_words_nonCritical - set_words_highCritical
_print_report("Set of words only found in the nonCritical paragraphs", words_only_nonCritical)

# Same comparison for the bigram analyses.
bigrams_highCritical = words_frec_analysis_get_sentence.analyse_bigrams(sentences_highCritical)
bigrams_nonCritical = words_frec_analysis_get_sentence.analyse_bigrams(sentences_nonCritical)

set_bigrams_highCritical = set(bigrams_highCritical)
set_bigrams_nonCritical = set(bigrams_nonCritical)

bigrams_only_highCritical = set_bigrams_highCritical - set_bigrams_nonCritical
_print_report("Set of bi-grams only found in the highCritical paragraphs", bigrams_only_highCritical)

# TODO: check the frequency of appearance of the highCritical words and
# bigrams in all the papers.


### SOME CHECKS

In [None]:
# Spot check: display the slice [1:2] of the highly critical sentences.
sentences_highCritical[1:2]

In [None]:
# Read the word-list file line by line; every entry is trimmed and lower-cased.
with open('./english_words/wordsEn.txt', 'r') as word_file:
    english_words = [entry.strip().lower() for entry in word_file]

# Spot check: print "yes" when 'workload' is one of the entries.
if 'workload' in english_words:
    print("yes")

In [None]:
# Spot check: write the slice [0:4] of the highly critical sentences to
# standard output in CSV form.
sentence = sentences_highCritical[0:4]
sentence.to_csv(sys.stdout)

In [None]:
# Spot check: display element 0 of the non-critical word analysis result.
frequent_words_nonCritical[0]

In [None]:
# Spot check: display the full word analysis result for the highly critical sentences.
frequent_words_highCritical

In [None]:
# Spot check: display the labelled non-critical sentence frame.
df_full_nonCritical

In [None]:
# Spot check: display four randomly drawn non-critical rows (fixed seed, so the draw is repeatable).
df_full_nonCritical.sample(4, random_state = 1)